https://github.com/RossBrunton updated https://github.com/llvm/llvm-project/pull/157484
>From 7bf7fe1df8a873964df2ebc17328d9bef00f1347 Mon Sep 17 00:00:00 2001 From: Ross Brunton <r...@codeplay.com> Date: Mon, 8 Sep 2025 10:45:42 +0100 Subject: [PATCH 1/6] [Offload] Add GenericPluginTy::get_mem_info This takes a pointer allocated by the plugin, and returns a struct containing important information about it. This is now used in `olMemFree` instead of using a map to track allocation info. --- offload/include/omptarget.h | 2 + offload/liboffload/src/OffloadImpl.cpp | 27 +-- .../amdgpu/dynamic_hsa/hsa.cpp | 1 + .../amdgpu/dynamic_hsa/hsa_ext_amd.h | 3 + offload/plugins-nextgen/amdgpu/src/rtl.cpp | 31 ++- .../common/include/PluginInterface.h | 13 ++ offload/plugins-nextgen/cuda/src/rtl.cpp | 216 +++++++++++------- offload/plugins-nextgen/host/src/rtl.cpp | 5 + 8 files changed, 193 insertions(+), 105 deletions(-) diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h index 8fd722bb15022..197cbd3806d91 100644 --- a/offload/include/omptarget.h +++ b/offload/include/omptarget.h @@ -96,6 +96,8 @@ enum OpenMPOffloadingDeclareTargetFlags { OMP_REGISTER_REQUIRES = 0x10, }; +// Note: This type should be no larger than 3 bits, as the amdgpu platform uses +// the lower 3 bits of a pointer to store it enum TargetAllocTy : int32_t { TARGET_ALLOC_DEVICE = 0, TARGET_ALLOC_HOST, diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index fef3a5669e0d5..9620c35ac5c10 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -201,8 +201,6 @@ struct OffloadContext { bool TracingEnabled = false; bool ValidationEnabled = true; - DenseMap<void *, AllocInfo> AllocInfoMap{}; - std::mutex AllocInfoMapMutex{}; SmallVector<ol_platform_impl_t, 4> Platforms{}; size_t RefCount; @@ -624,32 +622,15 @@ Error olMemAlloc_impl(ol_device_handle_t Device, ol_alloc_type_t Type, return Alloc.takeError(); *AllocationOut = *Alloc; - { - std::lock_guard<std::mutex> Lock(OffloadContext::get().AllocInfoMapMutex); - OffloadContext::get().AllocInfoMap.insert_or_assign( - *Alloc, AllocInfo{Device, Type}); - } return Error::success(); } Error olMemFree_impl(ol_platform_handle_t Platform, void *Address) { - ol_device_handle_t Device; - ol_alloc_type_t Type; - { - std::lock_guard<std::mutex> Lock(OffloadContext::get().AllocInfoMapMutex); - if (!OffloadContext::get().AllocInfoMap.contains(Address)) - return createOffloadError(ErrorCode::INVALID_ARGUMENT, - "address is not a known allocation"); - - auto AllocInfo = OffloadContext::get().AllocInfoMap.at(Address); - Device = AllocInfo.Device; - Type = AllocInfo.Type; - OffloadContext::get().AllocInfoMap.erase(Address); - } - assert(Platform == Device->Platform); + auto MemInfo = Platform->Plugin->get_memory_info(Address); + if (auto Err = MemInfo.takeError()) + return Err; - if (auto Res = - Device->Device->dataDelete(Address, convertOlToPluginAllocTy(Type))) + if (auto Res = MemInfo->Device->dataDelete(Address, MemInfo->Type)) return Res; return Error::success(); diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp index bc92f4a46a5c0..7f0e75cb9b500 100644 --- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp +++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp @@ -68,6 +68,7 @@ DLWRAP(hsa_amd_register_system_event_handler, 2) DLWRAP(hsa_amd_signal_create, 5) DLWRAP(hsa_amd_signal_async_handler, 5) DLWRAP(hsa_amd_pointer_info, 5) +DLWRAP(hsa_amd_pointer_info_set_userdata, 2) DLWRAP(hsa_code_object_reader_create_from_memory, 3) DLWRAP(hsa_code_object_reader_destroy, 1) DLWRAP(hsa_executable_load_agent_code_object, 5) diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h index 29cfe78082dbb..5c2fbd127c86d 100644 --- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h +++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h @@ -160,6 +160,7 @@ typedef struct hsa_amd_pointer_info_s { void* agentBaseAddress; void* hostBaseAddress; size_t sizeInBytes; + void *userData; } hsa_amd_pointer_info_t; hsa_status_t hsa_amd_pointer_info(const void* ptr, @@ -168,6 +169,8 @@ hsa_status_t hsa_amd_pointer_info(const void* ptr, uint32_t* num_agents_accessible, hsa_agent_t** accessible); +hsa_status_t hsa_amd_pointer_info_set_userdata(const void *ptr, void *userdata); + #ifdef __cplusplus } #endif diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index c26cfe961aa0e..90d9ca9f787e7 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -96,6 +96,8 @@ struct AMDGPUMemoryPoolTy; namespace hsa_utils { +using UserDataPair = PointerIntPair<AMDGPUDeviceTy *, 3>; + /// Iterate elements using an HSA iterate function. Do not use this function /// directly but the specialized ones below instead. template <typename ElemTy, typename IterFuncTy, typename CallbackTy> @@ -2032,7 +2034,8 @@ struct AMDHostDeviceTy : public AMDGenericDeviceTy { /// Class implementing the AMDGPU device functionalities which derives from the /// generic device class. -struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { +struct alignas(8) alignas(void *) AMDGPUDeviceTy : public GenericDeviceTy, + AMDGenericDeviceTy { // Create an AMDGPU device with a device id and default AMDGPU grid values. AMDGPUDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices, AMDHostDeviceTy &HostDevice, hsa_agent_t Agent) @@ -3561,6 +3564,26 @@ struct AMDGPUPluginTy final : public GenericPluginTy { return KernelAgents; } + Expected<MemoryInfoTy> get_memory_info(const void *TgtPtr) override { + hsa_amd_pointer_info_t Info; + if (auto Err = Plugin::check( + hsa_amd_pointer_info(TgtPtr, &Info, nullptr, nullptr, nullptr), + "error in hsa_amd_pointer_info: %s")) + return Err; + + // The pointer info struct contains an "agent" field, but that doesn't + // necessarily map to the device that created it + MemoryInfoTy ToReturn; + ToReturn.Base = Info.agentBaseAddress; + ToReturn.Size = Info.size; + auto UserData = hsa_utils::UserDataPair::getFromOpaqueValue(Info.userData); + ToReturn.Type = static_cast<TargetAllocTy>(UserData.getInt()); + ToReturn.Device = + reinterpret_cast<GenericDeviceTy *>(UserData.getPointer()); + + return ToReturn; + } + private: /// Event handler that will be called by ROCr if an event is detected. static hsa_status_t eventHandler(const hsa_amd_event_t *Event, @@ -3882,6 +3905,12 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) { REPORT("%s\n", toString(std::move(Err)).data()); return nullptr; } + + auto UserData = hsa_utils::UserDataPair(this, Kind); + if (auto Err = Plugin::check( + hsa_amd_pointer_info_set_userdata(Alloc, UserData.getOpaqueValue()), + "error in hsa_amd_pointer_info_set_userdata: %s")) + REPORT("%s\n", toString(std::move(Err)).data()); } return Alloc; diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index f0c05a1b90716..ce270b3f1ab9a 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -1271,6 +1271,15 @@ struct GenericDeviceTy : public DeviceAllocatorTy { DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0}; }; +struct MemoryInfoTy { + void *Base; + size_t Size; + TargetAllocTy Type; + GenericDeviceTy *Device; + + void *limit() const { return reinterpret_cast<char *>(Base) + Size; } +}; + /// Class implementing common functionalities of offload plugins. Each plugin /// should define the specific plugin class, derive from this generic one, and /// implement the necessary virtual function members. @@ -1567,6 +1576,10 @@ struct GenericPluginTy { /// object and return immediately. int32_t async_barrier(omp_interop_val_t *Interop); + /// Given a pointer previously allocated by GenericDeviceTy::Allocate, return + /// information about that allocation + virtual Expected<MemoryInfoTy> get_memory_info(const void *TgtPtr) = 0; + private: /// Indicates if the platform runtime has been fully initialized. bool Initialized = false; diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index af3c74636bff3..f04049cd8bdfd 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -566,89 +566,10 @@ struct CUDADeviceTy : public GenericDeviceTy { } /// Allocate memory on the device or related to the device. - void *allocate(size_t Size, void *, TargetAllocTy Kind) override { - if (Size == 0) - return nullptr; - - if (auto Err = setContext()) { - REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data()); - return nullptr; - } - - void *MemAlloc = nullptr; - CUdeviceptr DevicePtr; - CUresult Res; - - switch (Kind) { - case TARGET_ALLOC_DEFAULT: - case TARGET_ALLOC_DEVICE: - Res = cuMemAlloc(&DevicePtr, Size); - MemAlloc = (void *)DevicePtr; - break; - case TARGET_ALLOC_HOST: - Res = cuMemAllocHost(&MemAlloc, Size); - break; - case TARGET_ALLOC_SHARED: - Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL); - MemAlloc = (void *)DevicePtr; - break; - case TARGET_ALLOC_DEVICE_NON_BLOCKING: { - CUstream Stream; - if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING))) - break; - if ((Res = cuMemAllocAsync(&DevicePtr, Size, Stream))) - break; - cuStreamSynchronize(Stream); - Res = cuStreamDestroy(Stream); - MemAlloc = (void *)DevicePtr; - } - } - - if (auto Err = - Plugin::check(Res, "error in cuMemAlloc[Host|Managed]: %s")) { - REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data()); - return nullptr; - } - return MemAlloc; - } + void *allocate(size_t Size, void *, TargetAllocTy Kind) override; /// Deallocate memory on the device or related to the device. - int free(void *TgtPtr, TargetAllocTy Kind) override { - if (TgtPtr == nullptr) - return OFFLOAD_SUCCESS; - - if (auto Err = setContext()) { - REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data()); - return OFFLOAD_FAIL; - } - - CUresult Res; - switch (Kind) { - case TARGET_ALLOC_DEFAULT: - case TARGET_ALLOC_DEVICE: - case TARGET_ALLOC_SHARED: - Res = cuMemFree((CUdeviceptr)TgtPtr); - break; - case TARGET_ALLOC_HOST: - Res = cuMemFreeHost(TgtPtr); - break; - case TARGET_ALLOC_DEVICE_NON_BLOCKING: { - CUstream Stream; - if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING))) - break; - cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream); - cuStreamSynchronize(Stream); - if ((Res = cuStreamDestroy(Stream))) - break; - } - } - - if (auto Err = Plugin::check(Res, "error in cuMemFree[Host]: %s")) { - REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data()); - return OFFLOAD_FAIL; - } - return OFFLOAD_SUCCESS; - } + int free(void *TgtPtr, TargetAllocTy Kind) override; /// Synchronize current thread with the pending operations on the async info. Error synchronizeImpl(__tgt_async_info &AsyncInfo, @@ -1612,8 +1533,141 @@ struct CUDAPluginTy final : public GenericPluginTy { // revision. return Major == ImageMajor && Minor >= ImageMinor; } + + Expected<MemoryInfoTy> get_memory_info(const void *TgtPtr) override { + std::lock_guard<std::mutex> Lock(MemoryInfoMutex); + + // TODO: Is it possible to query this information using CUDA directly? + + // Fast case, we have been given the base pointer directly + if (MemoryInfo.contains(TgtPtr)) + return MemoryInfo[TgtPtr]; + + // Slower case, we need to look up the base pointer first + auto Loc = std::lower_bound(MemoryBases.begin(), MemoryBases.end(), TgtPtr, + [&](const void *Iter, const void *Val) { + return MemoryInfo[Iter].limit() < Val; + }); + if (Loc == MemoryBases.end() || TgtPtr > MemoryInfo[*Loc].limit()) + return Plugin::error(ErrorCode::NOT_FOUND, + "allocated memory information not found"); + return MemoryInfo[*Loc]; + } + +private: + friend CUDADeviceTy; + + std::mutex MemoryInfoMutex; + llvm::SmallVector<const void *> MemoryBases; + llvm::DenseMap<const void *, MemoryInfoTy> MemoryInfo; }; +void *CUDADeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) { + if (Size == 0) + return nullptr; + + if (auto Err = setContext()) { + REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data()); + return nullptr; + } + + void *MemAlloc = nullptr; + CUdeviceptr DevicePtr; + CUresult Res; + + switch (Kind) { + case TARGET_ALLOC_DEFAULT: + case TARGET_ALLOC_DEVICE: + Res = cuMemAlloc(&DevicePtr, Size); + MemAlloc = (void *)DevicePtr; + break; + case TARGET_ALLOC_HOST: + Res = cuMemAllocHost(&MemAlloc, Size); + break; + case TARGET_ALLOC_SHARED: + Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL); + MemAlloc = (void *)DevicePtr; + break; + case TARGET_ALLOC_DEVICE_NON_BLOCKING: { + CUstream Stream; + if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING))) + break; + if ((Res = cuMemAllocAsync(&DevicePtr, Size, Stream))) + break; + cuStreamSynchronize(Stream); + Res = cuStreamDestroy(Stream); + MemAlloc = (void *)DevicePtr; + } + } + + if (auto Err = Plugin::check(Res, "error in cuMemAlloc[Host|Managed]: %s")) { + REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data()); + return nullptr; + } + + auto CudaPlugin = reinterpret_cast<CUDAPluginTy *>(&Plugin); + { + std::lock_guard<std::mutex> Lock(CudaPlugin->MemoryInfoMutex); + CudaPlugin->MemoryBases.insert( + std::lower_bound(CudaPlugin->MemoryBases.begin(), + CudaPlugin->MemoryBases.end(), MemAlloc), + MemAlloc); + CudaPlugin->MemoryInfo[MemAlloc] = MemoryInfoTy{ + /*Base=*/MemAlloc, + /*Size=*/Size, + /*Type=*/Kind, + /*Device=*/this, + }; + } + return MemAlloc; +} + +int CUDADeviceTy::free(void *TgtPtr, TargetAllocTy Kind) { + if (TgtPtr == nullptr) + return OFFLOAD_SUCCESS; + + if (auto Err = setContext()) { + REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data()); + return OFFLOAD_FAIL; + } + + CUresult Res; + switch (Kind) { + case TARGET_ALLOC_DEFAULT: + case TARGET_ALLOC_DEVICE: + case TARGET_ALLOC_SHARED: + Res = cuMemFree((CUdeviceptr)TgtPtr); + break; + case TARGET_ALLOC_HOST: + Res = cuMemFreeHost(TgtPtr); + break; + case TARGET_ALLOC_DEVICE_NON_BLOCKING: { + CUstream Stream; + if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING))) + break; + cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream); + cuStreamSynchronize(Stream); + if ((Res = cuStreamDestroy(Stream))) + break; + } + } + + if (auto Err = Plugin::check(Res, "error in cuMemFree[Host]: %s")) { + REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data()); + return OFFLOAD_FAIL; + } + + auto CudaPlugin = reinterpret_cast<CUDAPluginTy *>(&Plugin); + { + std::lock_guard<std::mutex> Lock(CudaPlugin->MemoryInfoMutex); + CudaPlugin->MemoryBases.erase(std::find(CudaPlugin->MemoryBases.begin(), + CudaPlugin->MemoryBases.end(), + TgtPtr)); + assert(CudaPlugin->MemoryInfo.erase(TgtPtr)); + } + return OFFLOAD_SUCCESS; +} + Error CUDADeviceTy::dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice, void *DstPtr, int64_t Size, diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index f440ebaf17fe4..0f57043da448c 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -517,6 +517,11 @@ struct GenELF64PluginTy final : public GenericPluginTy { #endif } + Expected<MemoryInfoTy> get_memory_info(const void *TgtPtr) override { + return Plugin::error(ErrorCode::UNIMPLEMENTED, + "get_memory_info not implemented for host device"); + } + const char *getName() const override { return GETNAME(TARGET_NAME); } }; >From dfc89b4699156c0f82841cf0681405377ebc5763 Mon Sep 17 00:00:00 2001 From: Ross Brunton <r...@codeplay.com> Date: Tue, 9 Sep 2025 11:11:58 +0100 Subject: [PATCH 2/6] Don't use auto for MemInfo --- offload/liboffload/src/OffloadImpl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index 9620c35ac5c10..f399a74989036 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -626,7 +626,7 @@ Error olMemAlloc_impl(ol_device_handle_t Device, ol_alloc_type_t Type, } Error olMemFree_impl(ol_platform_handle_t Platform, void *Address) { - auto MemInfo = Platform->Plugin->get_memory_info(Address); + Expected<MemoryInfoTy> MemInfo = Platform->Plugin->get_memory_info(Address); if (auto Err = MemInfo.takeError()) return Err; >From 4bc13339d32b36834e47b9a09dc821fb5593af03 Mon Sep 17 00:00:00 2001 From: Ross Brunton <r...@codeplay.com> Date: Tue, 9 Sep 2025 12:04:42 +0100 Subject: [PATCH 3/6] Use proper size in amdgpu --- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 90d9ca9f787e7..b7028b2a9d4c3 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -3575,7 +3575,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy { // necessarily map to the device that created it MemoryInfoTy ToReturn; ToReturn.Base = Info.agentBaseAddress; - ToReturn.Size = Info.size; + ToReturn.Size = Info.sizeInBytes; auto UserData = hsa_utils::UserDataPair::getFromOpaqueValue(Info.userData); ToReturn.Type = static_cast<TargetAllocTy>(UserData.getInt()); ToReturn.Device = >From f7a0354d13a321ebf49c8e8a7e5c34f3318f9c37 Mon Sep 17 00:00:00 2001 From: Ross Brunton <r...@codeplay.com> Date: Tue, 9 Sep 2025 12:29:43 +0100 Subject: [PATCH 4/6] Rename get_memory_info -> getMemInfo --- offload/liboffload/src/OffloadImpl.cpp | 2 +- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 2 +- offload/plugins-nextgen/common/include/PluginInterface.h | 2 +- offload/plugins-nextgen/cuda/src/rtl.cpp | 2 +- offload/plugins-nextgen/host/src/rtl.cpp | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index f399a74989036..00d1b83e91b05 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -626,7 +626,7 @@ Error olMemAlloc_impl(ol_device_handle_t Device, ol_alloc_type_t Type, } Error olMemFree_impl(ol_platform_handle_t Platform, void *Address) { - Expected<MemoryInfoTy> MemInfo = Platform->Plugin->get_memory_info(Address); + Expected<MemoryInfoTy> MemInfo = Platform->Plugin->getMemoryInfo(Address); if (auto Err = MemInfo.takeError()) return Err; diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index b7028b2a9d4c3..b4eb55506927f 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -3564,7 +3564,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy { return KernelAgents; } - Expected<MemoryInfoTy> get_memory_info(const void *TgtPtr) override { + Expected<MemoryInfoTy> getMemoryInfo(const void *TgtPtr) override { hsa_amd_pointer_info_t Info; if (auto Err = Plugin::check( hsa_amd_pointer_info(TgtPtr, &Info, nullptr, nullptr, nullptr), diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index ce270b3f1ab9a..163b02ac124b8 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -1578,7 +1578,7 @@ struct GenericPluginTy { /// Given a pointer previously allocated by GenericDeviceTy::Allocate, return /// information about that allocation - virtual Expected<MemoryInfoTy> get_memory_info(const void *TgtPtr) = 0; + virtual Expected<MemoryInfoTy> getMemoryInfo(const void *TgtPtr) = 0; private: /// Indicates if the platform runtime has been fully initialized. diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index f04049cd8bdfd..2765453504ae9 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -1534,7 +1534,7 @@ struct CUDAPluginTy final : public GenericPluginTy { return Major == ImageMajor && Minor >= ImageMinor; } - Expected<MemoryInfoTy> get_memory_info(const void *TgtPtr) override { + Expected<MemoryInfoTy> getMemoryInfo(const void *TgtPtr) override { std::lock_guard<std::mutex> Lock(MemoryInfoMutex); // TODO: Is it possible to query this information using CUDA directly? diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index 0f57043da448c..aad537e355872 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -517,9 +517,9 @@ struct GenELF64PluginTy final : public GenericPluginTy { #endif } - Expected<MemoryInfoTy> get_memory_info(const void *TgtPtr) override { + Expected<MemoryInfoTy> getMemoryInfo(const void *TgtPtr) override { return Plugin::error(ErrorCode::UNIMPLEMENTED, - "get_memory_info not implemented for host device"); + "getMemoryInfo not implemented for host device"); } const char *getName() const override { return GETNAME(TARGET_NAME); } >From d2a99ef7081ed92ae4ed334ede1d8bec193daa8d Mon Sep 17 00:00:00 2001 From: Ross Brunton <r...@codeplay.com> Date: Tue, 9 Sep 2025 12:32:59 +0100 Subject: [PATCH 5/6] Properly support "NOT_FOUND" in amdgpu --- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index b4eb55506927f..e06e4c63c9cda 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -3570,6 +3570,8 @@ struct AMDGPUPluginTy final : public GenericPluginTy { hsa_amd_pointer_info(TgtPtr, &Info, nullptr, nullptr, nullptr), "error in hsa_amd_pointer_info: %s")) return Err; + if (Info.type == HSA_EXT_POINTER_TYPE_UNKNOWN) + return Plugin::error(ErrorCode::NOT_FOUND, "could not find allocation"); // The pointer info struct contains an "agent" field, but that doesn't // necessarily map to the device that created it >From e3678fc63d13698153d8448eabb4421bc2703ebc Mon Sep 17 00:00:00 2001 From: Ross Brunton <bruntonr...@protonmail.com> Date: Fri, 12 Sep 2025 11:25:50 +0100 Subject: [PATCH 6/6] Respond to PR feedback --- offload/include/omptarget.h | 2 +- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 12 ++++-------- .../common/include/PluginInterface.h | 4 ++++ offload/plugins-nextgen/cuda/src/rtl.cpp | 19 +++++++++---------- 4 files changed, 18 insertions(+), 19 deletions(-) diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h index 197cbd3806d91..4818ff67afefa 100644 --- a/offload/include/omptarget.h +++ b/offload/include/omptarget.h @@ -98,7 +98,7 @@ enum OpenMPOffloadingDeclareTargetFlags { // Note: This type should be no larger than 3 bits, as the amdgpu platform uses // the lower 3 bits of a pointer to store it -enum TargetAllocTy : int32_t { +enum TargetAllocTy : uint8_t { TARGET_ALLOC_DEVICE = 0, TARGET_ALLOC_HOST, TARGET_ALLOC_SHARED, diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index e06e4c63c9cda..7e6078d834d25 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -3575,15 +3575,11 @@ struct AMDGPUPluginTy final : public GenericPluginTy { // The pointer info struct contains an "agent" field, but that doesn't // necessarily map to the device that created it - MemoryInfoTy ToReturn; - ToReturn.Base = Info.agentBaseAddress; - ToReturn.Size = Info.sizeInBytes; auto UserData = hsa_utils::UserDataPair::getFromOpaqueValue(Info.userData); - ToReturn.Type = static_cast<TargetAllocTy>(UserData.getInt()); - ToReturn.Device = - reinterpret_cast<GenericDeviceTy *>(UserData.getPointer()); - - return ToReturn; + return MemoryInfoTy( + /*Base=*/Info.agentBaseAddress, /*Size=*/Info.sizeInBytes, + /*Type=*/static_cast<TargetAllocTy>(UserData.getInt()), + /*Device=*/reinterpret_cast<GenericDeviceTy *>(UserData.getPointer())); } private: diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 163b02ac124b8..5f9d125d8fe74 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -1277,6 +1277,10 @@ struct MemoryInfoTy { TargetAllocTy Type; GenericDeviceTy *Device; + MemoryInfoTy(void *Base, size_t Size, TargetAllocTy Type, + GenericDeviceTy *Device) + : Base(Base), Size(Size), Type(Type), Device(Device) {} + void *limit() const { return reinterpret_cast<char *>(Base) + Size; } }; diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index 2765453504ae9..f0343f5e79af1 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -1541,17 +1541,17 @@ struct CUDAPluginTy final : public GenericPluginTy { // Fast case, we have been given the base pointer directly if (MemoryInfo.contains(TgtPtr)) - return MemoryInfo[TgtPtr]; + return MemoryInfo.at(TgtPtr); // Slower case, we need to look up the base pointer first auto Loc = std::lower_bound(MemoryBases.begin(), MemoryBases.end(), TgtPtr, [&](const void *Iter, const void *Val) { - return MemoryInfo[Iter].limit() < Val; + return MemoryInfo.at(Iter).limit() < Val; }); - if (Loc == MemoryBases.end() || TgtPtr > MemoryInfo[*Loc].limit()) + if (Loc == MemoryBases.end() || TgtPtr > MemoryInfo.at(*Loc).limit()) return Plugin::error(ErrorCode::NOT_FOUND, "allocated memory information not found"); - return MemoryInfo[*Loc]; + return MemoryInfo.at(*Loc); } private: @@ -1612,12 +1612,11 @@ void *CUDADeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) { std::lower_bound(CudaPlugin->MemoryBases.begin(), CudaPlugin->MemoryBases.end(), MemAlloc), MemAlloc); - CudaPlugin->MemoryInfo[MemAlloc] = MemoryInfoTy{ - /*Base=*/MemAlloc, - /*Size=*/Size, - /*Type=*/Kind, - /*Device=*/this, - }; + CudaPlugin->MemoryInfo.emplace_or_assign(MemAlloc, + /*Base=*/MemAlloc, + /*Size=*/Size, + /*Type=*/Kind, + /*Device=*/this); } return MemAlloc; } _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits