Re: [PATCH v5 4/6] drm/amdkfd: Evict BO itself for contiguous allocation

2024-04-24 Thread Philip Yang

  


On 2024-04-23 18:15, Felix Kuehling
  wrote:

On
  2024-04-23 11:28, Philip Yang wrote:
  
  If the BO pages pinned for RDMA is not
contiguous on VRAM, evict it to

system memory first to free the VRAM space, then allocate
contiguous

VRAM space, and then move it from system memory back to VRAM.


Signed-off-by: Philip Yang 

---

  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16
+++-

  1 file changed, 15 insertions(+), 1 deletion(-)


diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c

index ef9154043757..5d118e5580ce 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c

@@ -1470,13 +1470,27 @@ static int
amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain)

  if (unlikely(ret))

  return ret;

  +    if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
{

+    /*

+ * If bo is not contiguous on VRAM, move to system
memory first to ensure

+ * we can get contiguous VRAM space after evicting
other BOs.

+ */

+    if (!(bo->tbo.resource->placement &
TTM_PL_FLAG_CONTIGUOUS)) {

+    ret = amdgpu_amdkfd_bo_validate(bo,
AMDGPU_GEM_DOMAIN_GTT, false);

  
  
  amdgpu_amdkfd_bo_validate is meant for use in kernel threads. It
  always runs uninterruptible. I believe pin_bo runs in the context
  of ioctls from user mode. So it should be interruptible.
  

yes, pin_bo is in the context of user mode, from KFD alloc memory
  or from rdma driver get pages, should use interruptible wait.
amdgpu_amdkfd_bo_validate is currently used by kernel threads and
  ioctl amdgpu_amdkfd_add_gws_to_process (this seems bug), does it
  make sense to add parameter interruptible, then we can remove many
  duplicate code amdgpu_bo_placement_from_domain + ttm_bo_validate
  or I can fix it here and leave the cleanup and bug fix in the
  future?
Regards,
Philip


  
  Regards,
  
    Felix
  
  
  
  +    if (unlikely(ret)) {

+    pr_debug("validate bo 0x%p to GTT failed %d\n",
&bo->tbo, ret);

+    goto out;

+    }

+    }

+    }

+

  ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0);

  if (ret)

  pr_err("Error in Pinning BO to domain: %d\n", domain);

    amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);

+out:

  amdgpu_bo_unreserve(bo);

-

  return ret;

  }

  

  



Re: [PATCH v5 4/6] drm/amdkfd: Evict BO itself for contiguous allocation

2024-04-23 Thread Felix Kuehling

On 2024-04-23 11:28, Philip Yang wrote:

If the BO pages pinned for RDMA is not contiguous on VRAM, evict it to
system memory first to free the VRAM space, then allocate contiguous
VRAM space, and then move it from system memory back to VRAM.

Signed-off-by: Philip Yang 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++-
  1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index ef9154043757..5d118e5580ce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1470,13 +1470,27 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo 
*bo, u32 domain)
if (unlikely(ret))
return ret;
  
+	if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {

+   /*
+* If bo is not contiguous on VRAM, move to system memory first 
to ensure
+* we can get contiguous VRAM space after evicting other BOs.
+*/
+   if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) {
+   ret = amdgpu_amdkfd_bo_validate(bo, 
AMDGPU_GEM_DOMAIN_GTT, false);


amdgpu_amdkfd_bo_validate is meant for use in kernel threads. It always 
runs uninterruptible. I believe pin_bo runs in the context of ioctls 
from user mode. So it should be interruptible.


Regards,
  Felix



+   if (unlikely(ret)) {
+   pr_debug("validate bo 0x%p to GTT failed %d\n", 
&bo->tbo, ret);
+   goto out;
+   }
+   }
+   }
+
ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0);
if (ret)
pr_err("Error in Pinning BO to domain: %d\n", domain);
  
  	amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);

+out:
amdgpu_bo_unreserve(bo);
-
return ret;
  }
  


[PATCH v5 4/6] drm/amdkfd: Evict BO itself for contiguous allocation

2024-04-23 Thread Philip Yang
If the BO pages pinned for RDMA is not contiguous on VRAM, evict it to
system memory first to free the VRAM space, then allocate contiguous
VRAM space, and then move it from system memory back to VRAM.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index ef9154043757..5d118e5580ce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1470,13 +1470,27 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo 
*bo, u32 domain)
if (unlikely(ret))
return ret;
 
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
+   /*
+* If bo is not contiguous on VRAM, move to system memory first 
to ensure
+* we can get contiguous VRAM space after evicting other BOs.
+*/
+   if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) {
+   ret = amdgpu_amdkfd_bo_validate(bo, 
AMDGPU_GEM_DOMAIN_GTT, false);
+   if (unlikely(ret)) {
+   pr_debug("validate bo 0x%p to GTT failed %d\n", 
&bo->tbo, ret);
+   goto out;
+   }
+   }
+   }
+
ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0);
if (ret)
pr_err("Error in Pinning BO to domain: %d\n", domain);
 
amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);
+out:
amdgpu_bo_unreserve(bo);
-
return ret;
 }
 
-- 
2.43.2