[PATCH] D69818: [HIP] Fix pointer type kernel arg for amdgpu
yaxunl abandoned this revision. yaxunl added a comment. Please review Michael's patch https://reviews.llvm.org/D69826 which supersedes this one. Thanks. CHANGES SINCE LAST ACTION https://reviews.llvm.org/D69818/new/ https://reviews.llvm.org/D69818 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D69818: [HIP] Fix pointer type kernel arg for amdgpu
yaxunl marked an inline comment as done. yaxunl added a comment. BTW Michael Liao will create another patch which will supersede this patch. That patch contains similar changes and also handles pointers in a byval struct or array. Comment at: clang/lib/CodeGen/CGCall.cpp:1172 if (isa(Ty)) - return CGF.Builder.CreateBitCast(Val, Ty, "coerce.val"); + return CGF.Builder.CreatePointerCast(Val, Ty, "coerce.val"); tra wrote: > What's the purpose of this change? this is supposed to cast the coerced type to the original type. Previously, no coerced type changes the address space, therefore a bitcast is sufficient. Now we coerce a type to different address space, therefore we need a pointer cast. CHANGES SINCE LAST ACTION https://reviews.llvm.org/D69818/new/ https://reviews.llvm.org/D69818 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D69818: [HIP] Fix pointer type kernel arg for amdgpu
tra added inline comments. Comment at: clang/lib/CodeGen/CGCall.cpp:1172 if (isa(Ty)) - return CGF.Builder.CreateBitCast(Val, Ty, "coerce.val"); + return CGF.Builder.CreatePointerCast(Val, Ty, "coerce.val"); What's the purpose of this change? Comment at: clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu:15-16 +// CHECK: define amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i(i32 addrspace(2)* %x, i32 addrspace(1)* %y) +__global__ void kernel3(__attribute__((address_space(2))) int *x, +__attribute__((address_space(1))) int *y) { + y[0] = x[0]; Interesting. Clang used to crash on explicit address space attribute in the past. I'm glad it works now. CHANGES SINCE LAST ACTION https://reviews.llvm.org/D69818/new/ https://reviews.llvm.org/D69818 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D69818: [HIP] Fix pointer type kernel arg for amdgpu
yaxunl updated this revision to Diff 227764. yaxunl added a comment. add a test for non-kernel function. CHANGES SINCE LAST ACTION https://reviews.llvm.org/D69818/new/ https://reviews.llvm.org/D69818 Files: clang/lib/CodeGen/CGCall.cpp clang/lib/CodeGen/TargetInfo.cpp clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu Index: clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu === --- /dev/null +++ clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu @@ -0,0 +1,23 @@ +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \ +// RUN: -emit-llvm -x hip %s -o - | FileCheck %s +#include "Inputs/cuda.h" +// CHECK: define amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)* %x.coerce) +__global__ void kernel1(int *x) { + x[0]++; +} + +// CHECK: define amdgpu_kernel void @_Z7kernel2Ri(i32 addrspace(1)* dereferenceable(4) %x.coerce) +__global__ void kernel2(int &x) { + x++; +} + +// CHECK: define amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i(i32 addrspace(2)* %x, i32 addrspace(1)* %y) +__global__ void kernel3(__attribute__((address_space(2))) int *x, +__attribute__((address_space(1))) int *y) { + y[0] = x[0]; +} + +// CHECK: define void @_Z4funcPi(i32* %x) +__device__ void func(int *x) { + x[0]++; +} Index: clang/lib/CodeGen/TargetInfo.cpp === --- clang/lib/CodeGen/TargetInfo.cpp +++ clang/lib/CodeGen/TargetInfo.cpp @@ -7816,6 +7816,27 @@ if (const Type *SeltTy = isSingleElementStruct(Ty, getContext())) return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0))); + // Coerce pointer type kernel arguments in default address space to + // device address space for HIP. + QualType PointeeTy; + if (getContext().getLangOpts().HIP) { +if (auto *PT = Ty->getAs()) { + if (PT->getPointeeType().getAddressSpace() == LangAS::Default) { +PointeeTy = PT->getPointeeType(); + } +} else if (auto *RT = Ty->getAs()) { + if (RT->getPointeeType().getAddressSpace() == LangAS::Default) { +PointeeTy = RT->getPointeeType(); + } +} + +if (PointeeTy != QualType()) { + return ABIArgInfo::getDirect( +CGT.ConvertType(PointeeTy) +->getPointerTo( +getContext().getTargetAddressSpace(LangAS::cuda_device))); +} + } // If we set CanBeFlattened to true, CodeGen will expand the struct to its // individual elements, which confuses the Clover OpenCL backend; therefore we // have to set it to false here. Other args of getDirect() are just defaults. Index: clang/lib/CodeGen/CGCall.cpp === --- clang/lib/CodeGen/CGCall.cpp +++ clang/lib/CodeGen/CGCall.cpp @@ -1169,7 +1169,7 @@ if (isa(Val->getType())) { // If this is Pointer->Pointer avoid conversion to and from int. if (isa(Ty)) - return CGF.Builder.CreateBitCast(Val, Ty, "coerce.val"); + return CGF.Builder.CreatePointerCast(Val, Ty, "coerce.val"); // Convert the pointer to an integer so we can play with its width. Val = CGF.Builder.CreatePtrToInt(Val, CGF.IntPtrTy, "coerce.val.pi"); Index: clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu === --- /dev/null +++ clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu @@ -0,0 +1,23 @@ +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \ +// RUN: -emit-llvm -x hip %s -o - | FileCheck %s +#include "Inputs/cuda.h" +// CHECK: define amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)* %x.coerce) +__global__ void kernel1(int *x) { + x[0]++; +} + +// CHECK: define amdgpu_kernel void @_Z7kernel2Ri(i32 addrspace(1)* dereferenceable(4) %x.coerce) +__global__ void kernel2(int &x) { + x++; +} + +// CHECK: define amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i(i32 addrspace(2)* %x, i32 addrspace(1)* %y) +__global__ void kernel3(__attribute__((address_space(2))) int *x, +__attribute__((address_space(1))) int *y) { + y[0] = x[0]; +} + +// CHECK: define void @_Z4funcPi(i32* %x) +__device__ void func(int *x) { + x[0]++; +} Index: clang/lib/CodeGen/TargetInfo.cpp === --- clang/lib/CodeGen/TargetInfo.cpp +++ clang/lib/CodeGen/TargetInfo.cpp @@ -7816,6 +7816,27 @@ if (const Type *SeltTy = isSingleElementStruct(Ty, getContext())) return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0))); + // Coerce pointer type kernel arguments in default address space to + // device address space for HIP. + QualType PointeeTy; + if (getContext().getLangOpts().HIP) { +if (auto *PT = Ty->getAs()) { + if (PT->getPointeeType().getAddressSpace() == LangAS::Default) { +PointeeTy = PT->getPointeeType(); + } +} else if (auto *RT = Ty->getAs()) { + if (RT
[PATCH] D69818: [HIP] Fix pointer type kernel arg for amdgpu
yaxunl created this revision. yaxunl added reviewers: tra, rjmccall. Herald added subscribers: t-tye, tpr, dstuttard, nhaehnle, wdng, jvesely, kzhuravl. amdgpu target prefers pointer type kernel arg in default address space to be coerced to device address space for better performance. This patch fixes that. https://reviews.llvm.org/D69818 Files: clang/lib/CodeGen/CGCall.cpp clang/lib/CodeGen/TargetInfo.cpp clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu Index: clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu === --- /dev/null +++ clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu @@ -0,0 +1,18 @@ +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \ +// RUN: -emit-llvm -x hip %s -o - | FileCheck %s +#include "Inputs/cuda.h" +// CHECK: define amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)* %x.coerce) +__global__ void kernel1(int *x) { + x[0]++; +} + +// CHECK: define amdgpu_kernel void @_Z7kernel2Ri(i32 addrspace(1)* dereferenceable(4) %x.coerce) +__global__ void kernel2(int &x) { + x++; +} + +// CHECK: define amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i(i32 addrspace(2)* %x, i32 addrspace(1)* %y) +__global__ void kernel3(__attribute__((address_space(2))) int *x, +__attribute__((address_space(1))) int *y) { + y[0] = x[0]; +} Index: clang/lib/CodeGen/TargetInfo.cpp === --- clang/lib/CodeGen/TargetInfo.cpp +++ clang/lib/CodeGen/TargetInfo.cpp @@ -7816,6 +7816,27 @@ if (const Type *SeltTy = isSingleElementStruct(Ty, getContext())) return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0))); + // Coerce pointer type kernel arguments in default address space to + // device address space for HIP. + QualType PointeeTy; + if (getContext().getLangOpts().HIP) { +if (auto *PT = Ty->getAs()) { + if (PT->getPointeeType().getAddressSpace() == LangAS::Default) { +PointeeTy = PT->getPointeeType(); + } +} else if (auto *RT = Ty->getAs()) { + if (RT->getPointeeType().getAddressSpace() == LangAS::Default) { +PointeeTy = RT->getPointeeType(); + } +} + +if (PointeeTy != QualType()) { + return ABIArgInfo::getDirect( +CGT.ConvertType(PointeeTy) +->getPointerTo( +getContext().getTargetAddressSpace(LangAS::cuda_device))); +} + } // If we set CanBeFlattened to true, CodeGen will expand the struct to its // individual elements, which confuses the Clover OpenCL backend; therefore we // have to set it to false here. Other args of getDirect() are just defaults. Index: clang/lib/CodeGen/CGCall.cpp === --- clang/lib/CodeGen/CGCall.cpp +++ clang/lib/CodeGen/CGCall.cpp @@ -1169,7 +1169,7 @@ if (isa(Val->getType())) { // If this is Pointer->Pointer avoid conversion to and from int. if (isa(Ty)) - return CGF.Builder.CreateBitCast(Val, Ty, "coerce.val"); + return CGF.Builder.CreatePointerCast(Val, Ty, "coerce.val"); // Convert the pointer to an integer so we can play with its width. Val = CGF.Builder.CreatePtrToInt(Val, CGF.IntPtrTy, "coerce.val.pi"); Index: clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu === --- /dev/null +++ clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu @@ -0,0 +1,18 @@ +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \ +// RUN: -emit-llvm -x hip %s -o - | FileCheck %s +#include "Inputs/cuda.h" +// CHECK: define amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)* %x.coerce) +__global__ void kernel1(int *x) { + x[0]++; +} + +// CHECK: define amdgpu_kernel void @_Z7kernel2Ri(i32 addrspace(1)* dereferenceable(4) %x.coerce) +__global__ void kernel2(int &x) { + x++; +} + +// CHECK: define amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i(i32 addrspace(2)* %x, i32 addrspace(1)* %y) +__global__ void kernel3(__attribute__((address_space(2))) int *x, +__attribute__((address_space(1))) int *y) { + y[0] = x[0]; +} Index: clang/lib/CodeGen/TargetInfo.cpp === --- clang/lib/CodeGen/TargetInfo.cpp +++ clang/lib/CodeGen/TargetInfo.cpp @@ -7816,6 +7816,27 @@ if (const Type *SeltTy = isSingleElementStruct(Ty, getContext())) return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0))); + // Coerce pointer type kernel arguments in default address space to + // device address space for HIP. + QualType PointeeTy; + if (getContext().getLangOpts().HIP) { +if (auto *PT = Ty->getAs()) { + if (PT->getPointeeType().getAddressSpace() == LangAS::Default) { +PointeeTy = PT->getPointeeType(); + } +} else if (auto *RT = Ty->getAs()) { + if (RT->getPointeeType().getAddressSpace