[PATCH] drm/amdgpu: Show retry fault message if process xnack on

2024-05-07 Thread Philip Yang
If vm_context_cntl sets xnack on, the GPU vm fault has the retry_fault
bit set, but the driver selects the xnack-on or xnack-off path depending
on the per-process xnack setting, which is also used to set the qpd
mem_config xnack on or off if KFD_SUPPORT_XNACK_PER_PROCESS.

If the process is xnack on, show the retry page fault message for a GPU
page fault; otherwise show the no-retry page fault message, to avoid
misleading output when debugging application page fault issues.

The process lookup from pasid is done inside the retry fault handler
svm_range_restore_pages, so add an xnack_on parameter to pass the
process xnack setting back to amdgpu_vm_handle_fault and then to the
gmc interrupt handler to show the vm fault message.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 5 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 7 ---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c   | 4 +++-
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h   | 2 +-
 6 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 64ddc87f7fb6..58f7ab193027 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2757,13 +2757,14 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
  *   GFX 9.4.3.
  * @addr: Address of the fault
  * @write_fault: true is write fault, false is read fault
+ * @xnack_on: return value, true if the process sets xnack on
  *
 * Try to gracefully handle a VM fault. Return true if the fault was handled
 * and shouldn't be reported any more.
  */
 bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
u32 vmid, u32 node_id, uint64_t addr,
-   bool write_fault)
+   bool write_fault, bool *xnack_on)
 {
bool is_compute_context = false;
struct amdgpu_bo *root;
@@ -2788,7 +2789,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, 
u32 pasid,
addr /= AMDGPU_GPU_PAGE_SIZE;
 
if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid,
-   node_id, addr, write_fault)) {
+   node_id, addr, write_fault, xnack_on)) {
		amdgpu_bo_unref(&root);
return true;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index bc71b44387b2..7f364f0b9a60 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -549,7 +549,7 @@ void amdgpu_vm_put_task_info(struct amdgpu_task_info 
*task_info);
 
 bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
u32 vmid, u32 node_id, uint64_t addr,
-   bool write_fault);
+   bool write_fault, bool *xnack_on);
 
 void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index d933e19e0cf5..2f0752376236 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -132,7 +132,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device 
*adev,
/* Try to handle the recoverable page faults by filling page
 * tables
 */
-		if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, write_fault))
+		if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, write_fault, NULL))
return 1;
}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 671a6766df5b..3db0f2304b6a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -558,6 +558,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
uint32_t cam_index = 0;
int ret, xcc_id = 0;
uint32_t node_id;
+   bool xnack_on = false;
 
node_id = entry->node_id;
 
@@ -595,7 +596,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
cam_index = entry->src_data[2] & 0x3ff;
 
			ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
-						     addr, write_fault);
+						     addr, write_fault, &xnack_on);
			WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index);
if (ret)
return 1;
@@ -618,7 +619,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
		 * tables
		 */
		if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
-
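
For illustration, a hedged sketch of the message selection the commit message describes (not the actual gmc_v9_0 code; only amdgpu_vm_handle_fault() and the xnack_on out-parameter come from the patch, the helper and message strings are placeholders):

	bool xnack_on = false;

	/* svm_range_restore_pages() fills xnack_on with the per-process
	 * xnack setting via amdgpu_vm_handle_fault(). If the fault was
	 * handled, no message is printed at all.
	 */
	if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
				   addr, write_fault, &xnack_on))
		return 1;

	if (xnack_on)
		dev_err(adev->dev, "[retry] page fault at address 0x%llx\n", addr);
	else
		dev_err(adev->dev, "[no-retry] page fault at address 0x%llx\n", addr);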

Re: [PATCH] drm/amdkfd: Remove arbitrary timeout for hmm_range_fault

2024-05-02 Thread Philip Yang

On 2024-05-02 08:42, James Zhu wrote:
> On 2024-05-01 18:56, Philip Yang wrote:
>> On a system with khugepaged enabled and user cases with THP buffers,
>> hmm_range_fault may take more than 15 seconds to return -EBUSY; the
>> arbitrary timeout value is not accurate and causes memory allocation
>> failure.
>>
>> Remove the arbitrary timeout value and return EAGAIN to the
>> application if hmm_range_fault returns EBUSY; userspace libdrm and
>> Thunk will then call the ioctl again.
>>
>> Change the EAGAIN message to a debug message as this is not an error.
>>
>> [...]
>> -	/* Assuming 64MB takes maximum 1 second to fault page address */
>> -	timeout = max((hmm_range->end - hmm_range->start) >> 26, 1UL);
>> -	timeout *= HMM_RANGE_DEFAULT_TIMEOUT;
>> -	timeout = jiffies + msecs_to_jiffies(timeout);
>> +	timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
>
> [JZ] should we reduce MAX_WALK_BYTE to 64M in the meantime?

From the debug log, the range size is not related; a 64MB range may take
the same long time to return EBUSY too.

>>  retry:
>>  	hmm_range->notifier_seq = mmu_interval_read_begin(notifier);
>>  	r = hmm_range_fault(hmm_range);
>>  	if (unlikely(r)) {
>> -		schedule();
>
> [JZ] the above is for the CPU stall WA, we may still need to keep it.

The 1 second timeout should be long enough for the normal case. If
hmm_range_fault returns EBUSY, we release the mmap_read lock and return
to user space, so we don't need the explicit schedule to fix the CPU
stall warning. I will run the overnight KFDTest LargestSysBufferTest on
a larger memory system to confirm there is no CPU stall message.

Regards,
Philip

>> [...]


Re: [PATCH] drm/amdkfd: Remove arbitrary timeout for hmm_range_fault

2024-05-02 Thread Philip Yang

On 2024-05-02 00:09, Chen, Xiaogang wrote:
> On 5/1/2024 5:56 PM, Philip Yang wrote:
>> On a system with khugepaged enabled and user cases with THP buffers,
>> hmm_range_fault may take more than 15 seconds to return -EBUSY; the
>> arbitrary timeout value is not accurate and causes memory allocation
>> failure.
>>
>> Remove the arbitrary timeout value and return EAGAIN to the
>> application if hmm_range_fault returns EBUSY; userspace libdrm and
>> Thunk will then call the ioctl again.
>
> Wonder why letting user space do the retry is better? It seems this
> issue is caused by hugepage merging, so how can user space avoid it?

The issue is caused by khugepaged + 4 processes + sdma stall test (to
slow down sdma) + small BAR + QPX mode. During the overnight test,
hmm_range_fault of a 180MB buffer may take more than 15 seconds and
return EBUSY, and then the alloc memory ioctl fails. If we return
EAGAIN, Thunk calls the alloc memory ioctl again, and we don't see the
alloc memory failure.

> And applications may not use Thunk or libdrm; instead, they may use
> the ioctl directly.

If the app calls the ioctl directly, it should do the same thing: call
the ioctl again if errno is EINTR or EAGAIN.

Regards,
Philip

> Regards
>
> Xiaogang
>
>> Change the EAGAIN message to a debug message as this is not an error.
>>
>> [...]

[PATCH] drm/amdkfd: Remove arbitrary timeout for hmm_range_fault

2024-05-01 Thread Philip Yang
On a system with khugepaged enabled and user cases with THP buffers,
hmm_range_fault may take more than 15 seconds to return -EBUSY; the
arbitrary timeout value is not accurate and causes memory allocation
failure.

Remove the arbitrary timeout value and return EAGAIN to the application
if hmm_range_fault returns EBUSY; userspace libdrm and Thunk will then
call the ioctl again.

Change the EAGAIN message to a debug message as this is not an error.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c |  5 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c  | 12 +++-
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c |  5 +
 3 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 54198c3928c7..02696c2102f1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1087,7 +1087,10 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr,
 
	ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, &range);
	if (ret) {
-		pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
+		if (ret == -EAGAIN)
+			pr_debug("Failed to get user pages, try again\n");
+		else
+			pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
		goto unregister_out;
	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
index 431ec72655ec..e36fede7f74c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
@@ -202,20 +202,12 @@ int amdgpu_hmm_range_get_pages(struct 
mmu_interval_notifier *notifier,
pr_debug("hmm range: start = 0x%lx, end = 0x%lx",
hmm_range->start, hmm_range->end);
 
-   /* Assuming 64MB takes maximum 1 second to fault page address */
-   timeout = max((hmm_range->end - hmm_range->start) >> 26, 1UL);
-   timeout *= HMM_RANGE_DEFAULT_TIMEOUT;
-   timeout = jiffies + msecs_to_jiffies(timeout);
+   timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
 
 retry:
hmm_range->notifier_seq = mmu_interval_read_begin(notifier);
r = hmm_range_fault(hmm_range);
if (unlikely(r)) {
-   schedule();
-   /*
-* FIXME: This timeout should encompass the retry from
-* mmu_interval_read_retry() as well.
-*/
if (r == -EBUSY && !time_after(jiffies, timeout))
goto retry;
goto out_free_pfns;
@@ -247,6 +239,8 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier 
*notifier,
 out_free_range:
kfree(hmm_range);
 
+   if (r == -EBUSY)
+   r = -EAGAIN;
return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 94f83be2232d..e7040f809f33 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1670,11 +1670,8 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
					       readonly, owner, NULL,
					       &hmm_range);
		WRITE_ONCE(p->svms.faulting_task, NULL);
-		if (r) {
+		if (r)
			pr_debug("failed %d to get svm range pages\n", r);
-			if (r == -EBUSY)
-				r = -EAGAIN;
-		}
	} else {
		r = -EFAULT;
	}
-- 
2.43.2
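
For the userspace side of this contract, a minimal sketch of the retry loop that Thunk/libdrm performs (assumes the standard /dev/kfd fd; AMDKFD_IOC_ALLOC_MEMORY_OF_GPU is the existing alloc ioctl, the wrapper itself is illustrative):

	#include <errno.h>
	#include <sys/ioctl.h>
	#include <linux/kfd_ioctl.h>

	/* Reissue the allocation while the kernel reports a transient
	 * failure (EAGAIN from the hmm_range_fault path above) or the
	 * call was interrupted (EINTR).
	 */
	static int kfd_alloc_retry(int kfd_fd,
				   struct kfd_ioctl_alloc_memory_of_gpu_args *args)
	{
		int ret;

		do {
			ret = ioctl(kfd_fd, AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, args);
		} while (ret == -1 && (errno == EAGAIN || errno == EINTR));

		return ret;
	}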



Re: [PATCH] drm/amd/amdkfd: Fix a resource leak in svm_range_validate_and_map()

2024-05-01 Thread Philip Yang

On 2024-04-30 19:29, Ramesh Errabolu wrote:
> Analysis of the code by Coverity, a static code analyser, has
> identified a resource leak in the symbol hmm_range. This leak occurs
> when one of the prior steps before it is released encounters an error.
>
> Signed-off-by: Ramesh Errabolu 

Reviewed-by: Philip Yang 

> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 386875e6eb96..dcb1d5d3f860 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -1658,7 +1658,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
>  	start = map_start << PAGE_SHIFT;
>  	end = (map_last + 1) << PAGE_SHIFT;
>  	for (addr = start; !r && addr < end; ) {
> -		struct hmm_range *hmm_range;
> +		struct hmm_range *hmm_range = NULL;
>  		unsigned long map_start_vma;
>  		unsigned long map_last_vma;
>  		struct vm_area_struct *vma;
> @@ -1696,7 +1696,9 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
>  		}
>
>  		svm_range_lock(prange);
> -		if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) {
> +
> +		// Free backing memory of hmm_range if it was initialized
> +		if (hmm_range && amdgpu_hmm_range_get_pages_done(hmm_range)) {
>  			pr_debug("hmm update the range, need validate again\n");
>  			r = -EAGAIN;
>  		}
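
The fix relies on the common init-to-NULL cleanup pattern: initializing the pointer lets the shared cleanup path tell "never allocated" apart from "allocated". A tiny standalone sketch of the pattern (hypothetical get_range()/put_range() helpers, not the kernel code):

	struct hmm_range *hmm_range = NULL;

	r = some_step_that_may_fail();		/* hmm_range still NULL here */
	if (!r)
		r = get_range(&hmm_range);	/* sets hmm_range on success */

	/* Cleanup is safe on every path: only touch it if it was set. */
	if (hmm_range)
		put_range(hmm_range);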



[PATCH v6 1/5] drm/amdgpu: Support contiguous VRAM allocation

2024-04-24 Thread Philip Yang
RDMA devices with limited scatter-gather ability require contiguous VRAM
buffer allocation for RDMA peer-direct support.

Add a new KFD alloc memory flag and store it as bo alloc flag
AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS. When pinning this bo to export for
RDMA peer-direct access, this will set the TTM_PL_FLAG_CONTIGUOUS flag
and ask the VRAM buddy allocator to get contiguous VRAM.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 
 include/uapi/linux/kfd_ioctl.h   | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index f672205243e0..02d66faaade5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1712,6 +1712,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
		alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ?
			AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0;
+
+   /* For contiguous VRAM allocation */
+   if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS)
+			alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
}
xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ?
0 : fpriv->xcp_id;
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 2040a470ddb4..d09c4a18e571 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args {
 #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT   (1 << 26)
 #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED   (1 << 25)
 #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT   (1 << 24)
+#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS (1 << 23)
 
 /* Allocate memory for later SVM (shared virtual memory) mapping.
  *
-- 
2.43.2
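
A sketch of how userspace might request a best-effort contiguous VRAM buffer with the new flag (KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS is from this patch; combining it with the existing VRAM and WRITABLE flags mirrors a typical allocation but is illustrative):

	#include <linux/kfd_ioctl.h>

	/* Fill the alloc args for VRAM that the driver should try to make
	 * contiguous, e.g. for buffers later pinned for RDMA peer-direct.
	 */
	static void fill_contiguous_vram_args(struct kfd_ioctl_alloc_memory_of_gpu_args *args,
					      __u32 gpu_id, __u64 va_addr, __u64 size)
	{
		args->gpu_id = gpu_id;
		args->va_addr = va_addr;
		args->size = size;
		args->flags = KFD_IOC_ALLOC_MEM_FLAGS_VRAM |
			      KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
			      KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS;
	}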



[PATCH v6 5/5] drm/amdkfd: Bump kfd version for contiguous VRAM allocation

2024-04-24 Thread Philip Yang
Bump the kfd ioctl minor version to declare the contiguous VRAM
allocation flag support.

Signed-off-by: Philip Yang 
---
 include/uapi/linux/kfd_ioctl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index d09c4a18e571..f8e9d3c1d117 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -41,9 +41,10 @@
  * - 1.13 - Add debugger API
  * - 1.14 - Update kfd_event_data
  * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl
+ * - 1.16 - Add contiguous VRAM allocation flag
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 15
+#define KFD_IOCTL_MINOR_VERSION 16
 
 struct kfd_ioctl_get_version_args {
__u32 major_version;/* from KFD */
-- 
2.43.2
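
Userspace can gate its use of the new flag on the reported ioctl version. A short sketch using the existing AMDKFD_IOC_GET_VERSION ioctl:

	#include <sys/ioctl.h>
	#include <linux/kfd_ioctl.h>

	/* The contiguous VRAM allocation flag was declared in KFD ioctl 1.16. */
	static int kfd_supports_contiguous_flag(int kfd_fd)
	{
		struct kfd_ioctl_get_version_args ver = {0};

		if (ioctl(kfd_fd, AMDKFD_IOC_GET_VERSION, &ver) == -1)
			return 0;
		return ver.major_version == 1 && ver.minor_version >= 16;
	}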



[PATCH v6 4/5] drm/amdkfd: Evict BO itself for contiguous allocation

2024-04-24 Thread Philip Yang
If the BO pages pinned for RDMA are not contiguous on VRAM, evict the BO
to system memory first to free the VRAM space, then allocate contiguous
VRAM space, and then move it from system memory back to VRAM.

v6: user context should use interruptible call (Felix)

Signed-off-by: Philip Yang 
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 19 ++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 02d66faaade5..acc825b84113 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1470,13 +1470,30 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo 
*bo, u32 domain)
if (unlikely(ret))
return ret;
 
+	if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
+		/*
+		 * If bo is not contiguous on VRAM, move to system memory first to ensure
+		 * we can get contiguous VRAM space after evicting other BOs.
+		 */
+		if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) {
+			struct ttm_operation_ctx ctx = { true, false };
+
+			amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT);
+			ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
+			if (unlikely(ret)) {
+				pr_debug("validate bo 0x%p to GTT failed %d\n", &bo->tbo, ret);
+				goto out;
+			}
+		}
+	}
+
ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0);
if (ret)
pr_err("Error in Pinning BO to domain: %d\n", domain);
 
amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);
+out:
amdgpu_bo_unreserve(bo);
-
return ret;
 }
 
-- 
2.43.2



[PATCH v6 0/5] Best effort contiguous VRAM allocation

2024-04-24 Thread Philip Yang
This patch series implements a new KFD memory alloc flag for best-effort
contiguous VRAM allocation, to support peer direct access RDMA devices
with limited scatter-gather dma capability.

v2: rebase on patch ("drm/amdgpu: Modify the contiguous flags behaviour")
    to avoid adding the new GEM flag

v3: add patch 2 to handle sg segment size limit (Christian)

v4: remove the buddy block size limit from vram mgr because sg table
    creation already removes the limit, and resource uses u64 to handle
    block start, size (Christian)

v5: remove patch 7 which is not for upstream, add AMDGPU prefix to the
    macro name.

v6: use shorter flag name, use interruptible wait ctx, drop patch 5/6 (Felix)

Philip Yang (5):
  drm/amdgpu: Support contiguous VRAM allocation
  drm/amdgpu: Handle sg size limit for contiguous allocation
  drm/amdgpu: Evict BOs from same process for contiguous allocation
  drm/amdkfd: Evict BO itself for contiguous allocation
  drm/amdkfd: Bump kfd version for contiguous VRAM allocation

 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 23 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  | 12 +-
 include/uapi/linux/kfd_ioctl.h|  4 +++-
 4 files changed, 33 insertions(+), 9 deletions(-)

-- 
2.43.2



[PATCH v6 3/5] drm/amdgpu: Evict BOs from same process for contiguous allocation

2024-04-24 Thread Philip Yang
When TTM fails to alloc VRAM, TTM tries to evict BOs from VRAM to system
memory and then retries the allocation. This skips the KFD BOs from the
same process because KFD requires all BOs to be resident for user
queues.

If TTM allocates contiguous VRAM with the TTM_PL_FLAG_CONTIGUOUS flag,
allow TTM to evict KFD BOs from the same process; this will evict the
user queues first, and restore the queues later after the contiguous
VRAM allocation.

Signed-off-by: Philip Yang 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 64f5001a7dc5..c21ea808f931 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1403,7 +1403,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
	 */
	dma_resv_for_each_fence(&resv_cursor, bo->base.resv,
DMA_RESV_USAGE_BOOKKEEP, f) {
-   if (amdkfd_fence_check_mm(f, current->mm))
+   if (amdkfd_fence_check_mm(f, current->mm) &&
+   !(place->flags & TTM_PL_FLAG_CONTIGUOUS))
return false;
}
 
-- 
2.43.2



[PATCH v6 2/5] drm/amdgpu: Handle sg size limit for contiguous allocation

2024-04-24 Thread Philip Yang
Define macro AMDGPU_MAX_SG_SEGMENT_SIZE as 2GB, because struct
scatterlist length is unsigned int, and some users of it cast to a
signed int, so every segment of an sg table is limited to 2GB maximum.

For contiguous VRAM allocation, don't limit the max buddy block size in
order to get contiguous VRAM memory. To work around the sg table segment
size limit, allocate multiple segments if the contiguous size is bigger
than AMDGPU_MAX_SG_SEGMENT_SIZE.

Signed-off-by: Philip Yang 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 4be8b091099a..ebffb58ea53a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -31,6 +31,8 @@
 #include "amdgpu_atomfirmware.h"
 #include "atom.h"
 
+#define AMDGPU_MAX_SG_SEGMENT_SIZE (2UL << 30)
+
 struct amdgpu_vram_reservation {
u64 start;
u64 size;
@@ -532,9 +534,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
 
BUG_ON(min_block_size < mm->chunk_size);
 
-   /* Limit maximum size to 2GiB due to SG table limitations */
-   size = min(remaining_size, 2ULL << 30);
-
+   size = remaining_size;
		if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
		    !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))
min_block_size = (u64)pages_per_block << PAGE_SHIFT;
@@ -675,7 +675,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
	amdgpu_res_first(res, offset, length, &cursor);
	while (cursor.remaining) {
		num_entries++;
-		amdgpu_res_next(&cursor, cursor.size);
+		amdgpu_res_next(&cursor, min(cursor.size, AMDGPU_MAX_SG_SEGMENT_SIZE));
	}
 
	r = sg_alloc_table(*sgt, num_entries, GFP_KERNEL);
@@ -695,7 +695,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
	amdgpu_res_first(res, offset, length, &cursor);
	for_each_sgtable_sg((*sgt), sg, i) {
		phys_addr_t phys = cursor.start + adev->gmc.aper_base;
-		size_t size = cursor.size;
+		unsigned long size = min(cursor.size, AMDGPU_MAX_SG_SEGMENT_SIZE);
		dma_addr_t addr;
 
		addr = dma_map_resource(dev, phys, size, dir,
@@ -708,7 +708,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
		sg_dma_address(sg) = addr;
		sg_dma_len(sg) = size;
 
-		amdgpu_res_next(&cursor, cursor.size);
+		amdgpu_res_next(&cursor, size);
	}
 
return 0;
-- 
2.43.2
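
The entry-count walk above splits one contiguous block into as many sg entries as the 2GiB cap requires. The same arithmetic as a standalone sketch (plain C, illustrative helper name, not the vram mgr code):

	#define AMDGPU_MAX_SG_SEGMENT_SIZE (2UL << 30)	/* 2 GiB, as in the patch */

	/* Number of scatterlist entries needed for 'size' contiguous bytes
	 * when each entry is capped at AMDGPU_MAX_SG_SEGMENT_SIZE.
	 */
	static unsigned int sg_entries_for(unsigned long long size)
	{
		unsigned int n = 0;

		while (size) {
			unsigned long long seg = size < AMDGPU_MAX_SG_SEGMENT_SIZE ?
						 size : AMDGPU_MAX_SG_SEGMENT_SIZE;
			size -= seg;
			n++;
		}
		return n;
	}

	/* e.g. a 5 GiB contiguous allocation needs 3 entries: 2G + 2G + 1G. */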



Re: [PATCH v5 1/6] drm/amdgpu: Support contiguous VRAM allocation

2024-04-24 Thread Philip Yang

On 2024-04-23 18:17, Felix Kuehling wrote:
> On 2024-04-23 11:28, Philip Yang wrote:
>> RDMA devices with limited scatter-gather ability require contiguous
>> VRAM buffer allocation for RDMA peer-direct support.
>>
>> Add a new KFD alloc memory flag and store it as bo alloc flag
>> AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS. When pinning this bo to export for
>> RDMA peer-direct access, this will set the TTM_PL_FLAG_CONTIGUOUS
>> flag and ask the VRAM buddy allocator to get contiguous VRAM.
>>
>> Signed-off-by: Philip Yang 
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4
>>  include/uapi/linux/kfd_ioctl.h                   | 1 +
>>  2 files changed, 5 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index 0ae9fd844623..ef9154043757 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -1712,6 +1712,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
>>  		alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
>>  		alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ?
>>  			AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0;
>> +
>> +		/* For contiguous VRAM allocation */
>> +		if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT)
>> +			alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>>  	}
>>  	xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ?
>>  		0 : fpriv->xcp_id;
>> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
>> index 2040a470ddb4..c1394c162d4e 100644
>> --- a/include/uapi/linux/kfd_ioctl.h
>> +++ b/include/uapi/linux/kfd_ioctl.h
>> @@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args {
>>  #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT	(1 << 26)
>>  #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED	(1 << 25)
>>  #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT	(1 << 24)
>> +#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT	(1 << 23)
>
> If I understand it correctly, AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS was
> redefined to mean "best effort". Maybe we can drop the explicit
> "BEST_EFFORT" from this flag as well to keep the name to a reasonable
> length.

Yes, AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS is redefined to implement "best
effort" without adding a new upstream GEM flag, so we may get a
scattered allocation if the contiguous allocation fails. If we drop the
"BEST_EFFORT" from the flag name, this may mislead the users.

Regards,
Philip

> Regards,
>   Felix
>
>>  /* Allocate memory for later SVM (shared virtual memory) mapping.
>>   *


Re: [PATCH v5 4/6] drm/amdkfd: Evict BO itself for contiguous allocation

2024-04-24 Thread Philip Yang

On 2024-04-23 18:15, Felix Kuehling wrote:
> On 2024-04-23 11:28, Philip Yang wrote:
>> If the BO pages pinned for RDMA are not contiguous on VRAM, evict the
>> BO to system memory first to free the VRAM space, then allocate
>> contiguous VRAM space, and then move it from system memory back to
>> VRAM.
>>
>> Signed-off-by: Philip Yang 
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++-
>>  1 file changed, 15 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index ef9154043757..5d118e5580ce 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -1470,13 +1470,27 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain)
>>  	if (unlikely(ret))
>>  		return ret;
>>
>> +	if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
>> +		/*
>> +		 * If bo is not contiguous on VRAM, move to system memory first to ensure
>> +		 * we can get contiguous VRAM space after evicting other BOs.
>> +		 */
>> +		if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) {
>> +			ret = amdgpu_amdkfd_bo_validate(bo, AMDGPU_GEM_DOMAIN_GTT, false);
>
> amdgpu_amdkfd_bo_validate is meant for use in kernel threads. It
> always runs uninterruptible. I believe pin_bo runs in the context of
> ioctls from user mode. So it should be interruptible.

Yes, pin_bo is in the context of user mode, from KFD alloc memory or
from the rdma driver get pages, so it should use an interruptible wait.

amdgpu_amdkfd_bo_validate is currently used by kernel threads and by
the ioctl amdgpu_amdkfd_add_gws_to_process (this seems to be a bug).
Does it make sense to add an interruptible parameter, so we can remove
a lot of duplicated amdgpu_bo_placement_from_domain + ttm_bo_validate
code, or should I fix it here and leave the cleanup and bug fix for the
future?

Regards,
Philip

> Regards,
>   Felix
>
>> +			if (unlikely(ret)) {
>> +				pr_debug("validate bo 0x%p to GTT failed %d\n", &bo->tbo, ret);
>> +				goto out;
>> +			}
>> +		}
>> +	}
>> +
>>  	ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0);
>>  	if (ret)
>>  		pr_err("Error in Pinning BO to domain: %d\n", domain);
>>
>>  	amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);
>> +out:
>>  	amdgpu_bo_unreserve(bo);
>> -
>>  	return ret;
>>  }



[PATCH v5 6/6] drm/amdkfd: Bump kfd version for contiguous VRAM allocation

2024-04-23 Thread Philip Yang
Bump the kfd ioctl minor version to declare the contiguous VRAM
allocation flag support.

Signed-off-by: Philip Yang 
---
 include/uapi/linux/kfd_ioctl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index c1394c162d4e..a5ebbe98ff7f 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -41,9 +41,10 @@
  * - 1.13 - Add debugger API
  * - 1.14 - Update kfd_event_data
  * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl
+ * - 1.16 - Add contiguous VRAM allocation flag
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 15
+#define KFD_IOCTL_MINOR_VERSION 16
 
 struct kfd_ioctl_get_version_args {
__u32 major_version;/* from KFD */
-- 
2.43.2



[PATCH v5 4/6] drm/amdkfd: Evict BO itself for contiguous allocation

2024-04-23 Thread Philip Yang
If the BO pages pinned for RDMA are not contiguous on VRAM, evict the BO
to system memory first to free the VRAM space, then allocate contiguous
VRAM space, and then move it from system memory back to VRAM.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index ef9154043757..5d118e5580ce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1470,13 +1470,27 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo 
*bo, u32 domain)
if (unlikely(ret))
return ret;
 
+	if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
+		/*
+		 * If bo is not contiguous on VRAM, move to system memory first to ensure
+		 * we can get contiguous VRAM space after evicting other BOs.
+		 */
+		if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) {
+			ret = amdgpu_amdkfd_bo_validate(bo, AMDGPU_GEM_DOMAIN_GTT, false);
+			if (unlikely(ret)) {
+				pr_debug("validate bo 0x%p to GTT failed %d\n", &bo->tbo, ret);
+				goto out;
+			}
+		}
+	}
+
ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0);
if (ret)
pr_err("Error in Pinning BO to domain: %d\n", domain);
 
amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);
+out:
amdgpu_bo_unreserve(bo);
-
return ret;
 }
 
-- 
2.43.2



[PATCH v5 2/6] drm/amdgpu: Handle sg size limit for contiguous allocation

2024-04-23 Thread Philip Yang
Define macro AMDGPU_MAX_SG_SEGMENT_SIZE as 2GB, because struct
scatterlist length is unsigned int, and some users of it cast to a
signed int, so every segment of an sg table is limited to 2GB maximum.

For contiguous VRAM allocation, don't limit the max buddy block size in
order to get contiguous VRAM memory. To work around the sg table segment
size limit, allocate multiple segments if the contiguous size is bigger
than AMDGPU_MAX_SG_SEGMENT_SIZE.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 4be8b091099a..ebffb58ea53a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -31,6 +31,8 @@
 #include "amdgpu_atomfirmware.h"
 #include "atom.h"
 
+#define AMDGPU_MAX_SG_SEGMENT_SIZE (2UL << 30)
+
 struct amdgpu_vram_reservation {
u64 start;
u64 size;
@@ -532,9 +534,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
 
BUG_ON(min_block_size < mm->chunk_size);
 
-   /* Limit maximum size to 2GiB due to SG table limitations */
-   size = min(remaining_size, 2ULL << 30);
-
+   size = remaining_size;
		if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
		    !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))
min_block_size = (u64)pages_per_block << PAGE_SHIFT;
@@ -675,7 +675,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
	amdgpu_res_first(res, offset, length, &cursor);
while (cursor.remaining) {
num_entries++;
-		amdgpu_res_next(&cursor, cursor.size);
+		amdgpu_res_next(&cursor, min(cursor.size, AMDGPU_MAX_SG_SEGMENT_SIZE));
}
 
r = sg_alloc_table(*sgt, num_entries, GFP_KERNEL);
@@ -695,7 +695,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
	amdgpu_res_first(res, offset, length, &cursor);
for_each_sgtable_sg((*sgt), sg, i) {
phys_addr_t phys = cursor.start + adev->gmc.aper_base;
-   size_t size = cursor.size;
+		unsigned long size = min(cursor.size, AMDGPU_MAX_SG_SEGMENT_SIZE);
dma_addr_t addr;
 
addr = dma_map_resource(dev, phys, size, dir,
@@ -708,7 +708,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
sg_dma_address(sg) = addr;
sg_dma_len(sg) = size;
 
-		amdgpu_res_next(&cursor, cursor.size);
+		amdgpu_res_next(&cursor, size);
}
 
return 0;
-- 
2.43.2



[PATCH v5 3/6] drm/amdgpu: Evict BOs from same process for contiguous allocation

2024-04-23 Thread Philip Yang
When TTM fails to alloc VRAM, TTM tries to evict BOs from VRAM to system
memory and then retries the allocation. This skips the KFD BOs from the
same process because KFD requires all BOs to be resident for user
queues.

If TTM allocates contiguous VRAM with the TTM_PL_FLAG_CONTIGUOUS flag,
allow TTM to evict KFD BOs from the same process; this will evict the
user queues first, and restore the queues later after the contiguous
VRAM allocation.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 851509c6e90e..c907d6005641 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1398,7 +1398,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct 
ttm_buffer_object *bo,
 */
	dma_resv_for_each_fence(&resv_cursor, bo->base.resv,
DMA_RESV_USAGE_BOOKKEEP, f) {
-   if (amdkfd_fence_check_mm(f, current->mm))
+   if (amdkfd_fence_check_mm(f, current->mm) &&
+   !(place->flags & TTM_PL_FLAG_CONTIGUOUS))
return false;
}
 
-- 
2.43.2



[PATCH v5 5/6] drm/amdkfd: Increase KFD bo restore wait time

2024-04-23 Thread Philip Yang
TTM allocating contiguous VRAM may take more than 1 second to evict BOs
for a larger RDMA buffer. Because the KFD restore bo worker reserves all
KFD BOs, TTM cannot then hold the remaining KFD BOs' locks to evict
them, which causes TTM to fail the contiguous VRAM allocation.

Increase the KFD restore BO wait time to 2 seconds, long enough for the
RDMA pin BO path to alloc the contiguous VRAM.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index a81ef232fdef..c205e2d3acf9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -698,7 +698,7 @@ struct qcm_process_device {
 /* KFD Memory Eviction */
 
 /* Approx. wait time before attempting to restore evicted BOs */
-#define PROCESS_RESTORE_TIME_MS 100
+#define PROCESS_RESTORE_TIME_MS 2000
 /* Approx. back off time if restore fails due to lack of memory */
 #define PROCESS_BACK_OFF_TIME_MS 100
 /* Approx. time before evicting the process again */
-- 
2.43.2



[PATCH v5 1/6] drm/amdgpu: Support contiguous VRAM allocation

2024-04-23 Thread Philip Yang
RDMA devices with limited scatter-gather ability require contiguous VRAM
buffer allocation for RDMA peer-direct support.

Add a new KFD alloc memory flag and store it as bo alloc flag
AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS. When pinning this bo to export for
RDMA peer-direct access, this will set the TTM_PL_FLAG_CONTIGUOUS flag
and ask the VRAM buddy allocator to get contiguous VRAM.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 
 include/uapi/linux/kfd_ioctl.h   | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 0ae9fd844623..ef9154043757 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1712,6 +1712,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) 
?
AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0;
+
+   /* For contiguous VRAM allocation */
+		if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT)
+			alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
}
xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ?
0 : fpriv->xcp_id;
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 2040a470ddb4..c1394c162d4e 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args {
 #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT   (1 << 26)
 #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED   (1 << 25)
 #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT   (1 << 24)
+#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT (1 << 23)
 
 /* Allocate memory for later SVM (shared virtual memory) mapping.
  *
-- 
2.43.2



[PATCH v5 0/6] Best effort contiguous VRAM allocation

2024-04-23 Thread Philip Yang
This patch series implements a new KFD memory alloc flag for best-effort
contiguous VRAM allocation, to support peer direct access RDMA devices
with limited scatter-gather dma capability.

v2: rebase on patch ("drm/amdgpu: Modify the contiguous flags behaviour")
    to avoid adding the new GEM flag

v3: add patch 2 to handle sg segment size limit (Christian)

v4: remove the buddy block size limit from vram mgr because sg table
    creation already removes the limit, and resource uses u64 to handle
    block start, size (Christian)

v5: remove patch 7 which is not for upstream, add AMDGPU prefix to the
    macro name.

Philip Yang (6):
  drm/amdgpu: Support contiguous VRAM allocation
  drm/amdgpu: Handle sg size limit for contiguous allocation
  drm/amdgpu: Evict BOs from same process for contiguous allocation
  drm/amdkfd: Evict BO itself for contiguous allocation
  drm/amdkfd: Increase KFD bo restore wait time
  drm/amdkfd: Bump kfd version for contiguous VRAM allocation

 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 20 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  | 12 +--
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  2 +-
 include/uapi/linux/kfd_ioctl.h|  4 +++-
 5 files changed, 31 insertions(+), 10 deletions(-)

-- 
2.43.2



Re: [PATCH v4 6/7] drm/amdgpu: Skip dma map resource for null RDMA device

2024-04-23 Thread Philip Yang

On 2024-04-23 09:32, Christian König wrote:
> On 23.04.24 at 15:04, Philip Yang wrote:
>> To test RDMA using a dummy driver on a system without a NIC/RDMA
>> device, the get/put dma pages pass in a null device pointer; skip the
>> dma map/unmap of the resource and sg table to avoid a null pointer
>> access.
>
> Well just to make it clear this patch is really a no-go for
> upstreaming.
>
> The RDMA code isn't upstream as far as I know and doing this here is
> really not a good idea even internally.

Right, this change is not needed for and not related to upstream, just
to minimize the difference with upstream. I will not upstream this
patch.

Regards,
Philip

> Regards,
> Christian.
>
>> Signed-off-by: Philip Yang 
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 33 +++-
>>  1 file changed, 19 insertions(+), 14 deletions(-)
>>
>> [...]



[PATCH v4 1/7] drm/amdgpu: Support contiguous VRAM allocation

2024-04-23 Thread Philip Yang
RDMA devices with limited scatter-gather ability require contiguous VRAM
buffer allocation for RDMA peer-direct support.

Add a new KFD alloc memory flag and store it as bo alloc flag
AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS. When pinning this bo to export for
RDMA peer-direct access, this will set the TTM_PL_FLAG_CONTIGUOUS flag
and ask the VRAM buddy allocator to get contiguous VRAM.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 
 include/uapi/linux/kfd_ioctl.h   | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 0ae9fd844623..ef9154043757 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1712,6 +1712,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) 
?
AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0;
+
+   /* For contiguous VRAM allocation */
+		if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT)
+			alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
}
xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ?
0 : fpriv->xcp_id;
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 2040a470ddb4..c1394c162d4e 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args {
 #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT   (1 << 26)
 #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED   (1 << 25)
 #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT   (1 << 24)
+#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT (1 << 23)
 
 /* Allocate memory for later SVM (shared virtual memory) mapping.
  *
-- 
2.43.2



[PATCH v4 3/7] drm/amdgpu: Evict BOs from same process for contiguous allocation

2024-04-23 Thread Philip Yang
When TTM fails to alloc VRAM, TTM tries to evict BOs from VRAM to system
memory and then retries the allocation. This skips the KFD BOs from the
same process because KFD requires all BOs to be resident for user
queues.

If TTM allocates contiguous VRAM with the TTM_PL_FLAG_CONTIGUOUS flag,
allow TTM to evict KFD BOs from the same process; this will evict the
user queues first, and restore the queues later after the contiguous
VRAM allocation.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 851509c6e90e..c907d6005641 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1398,7 +1398,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct 
ttm_buffer_object *bo,
 */
	dma_resv_for_each_fence(&resv_cursor, bo->base.resv,
DMA_RESV_USAGE_BOOKKEEP, f) {
-   if (amdkfd_fence_check_mm(f, current->mm))
+   if (amdkfd_fence_check_mm(f, current->mm) &&
+   !(place->flags & TTM_PL_FLAG_CONTIGUOUS))
return false;
}
 
-- 
2.43.2



[PATCH v4 7/7] drm/amdkfd: Bump kfd version for contiguous VRAM allocation

2024-04-23 Thread Philip Yang
Bump the kfd ioctl minor version to declare the contiguous VRAM
allocation flag support.

Signed-off-by: Philip Yang 
---
 include/uapi/linux/kfd_ioctl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index c1394c162d4e..a5ebbe98ff7f 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -41,9 +41,10 @@
  * - 1.13 - Add debugger API
  * - 1.14 - Update kfd_event_data
  * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl
+ * - 1.16 - Add contiguous VRAM allocation flag
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 15
+#define KFD_IOCTL_MINOR_VERSION 16
 
 struct kfd_ioctl_get_version_args {
__u32 major_version;/* from KFD */
-- 
2.43.2



[PATCH v4 4/7] drm/amdkfd: Evict BO itself for contiguous allocation

2024-04-23 Thread Philip Yang
If the BO pages pinned for RDMA are not contiguous on VRAM, evict the BO
to system memory first to free the VRAM space, then allocate contiguous
VRAM space, and then move it from system memory back to VRAM.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index ef9154043757..5d118e5580ce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1470,13 +1470,27 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo 
*bo, u32 domain)
if (unlikely(ret))
return ret;
 
+	if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
+		/*
+		 * If bo is not contiguous on VRAM, move to system memory first to ensure
+		 * we can get contiguous VRAM space after evicting other BOs.
+		 */
+		if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) {
+			ret = amdgpu_amdkfd_bo_validate(bo, AMDGPU_GEM_DOMAIN_GTT, false);
+			if (unlikely(ret)) {
+				pr_debug("validate bo 0x%p to GTT failed %d\n", &bo->tbo, ret);
+				goto out;
+			}
+		}
+	}
+
ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0);
if (ret)
pr_err("Error in Pinning BO to domain: %d\n", domain);
 
amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);
+out:
amdgpu_bo_unreserve(bo);
-
return ret;
 }
 
-- 
2.43.2



[PATCH v4 6/7] drm/amdgpu: Skip dma map resource for null RDMA device

2024-04-23 Thread Philip Yang
To test RDMA using a dummy driver on a system without a NIC/RDMA
device, the get/put dma pages pass in a null device pointer; skip the
dma map/unmap of the resource and sg table to avoid a null pointer
access.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 33 +++-
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 6c7133bf51d8..101a85263b53 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -698,12 +698,15 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
unsigned long size = min(cursor.size, MAX_SG_SEGMENT_SIZE);
dma_addr_t addr;
 
-   addr = dma_map_resource(dev, phys, size, dir,
-   DMA_ATTR_SKIP_CPU_SYNC);
-   r = dma_mapping_error(dev, addr);
-   if (r)
-   goto error_unmap;
-
+   if (dev) {
+   addr = dma_map_resource(dev, phys, size, dir,
+   DMA_ATTR_SKIP_CPU_SYNC);
+   r = dma_mapping_error(dev, addr);
+   if (r)
+   goto error_unmap;
+   } else {
+   addr = phys;
+   }
sg_set_page(sg, NULL, size, 0);
sg_dma_address(sg) = addr;
sg_dma_len(sg) = size;
@@ -717,10 +720,10 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
for_each_sgtable_sg((*sgt), sg, i) {
if (!sg->length)
continue;
-
-   dma_unmap_resource(dev, sg->dma_address,
-  sg->length, dir,
-  DMA_ATTR_SKIP_CPU_SYNC);
+   if (dev)
+   dma_unmap_resource(dev, sg->dma_address,
+  sg->length, dir,
+  DMA_ATTR_SKIP_CPU_SYNC);
}
sg_free_table(*sgt);
 
@@ -745,10 +748,12 @@ void amdgpu_vram_mgr_free_sgt(struct device *dev,
struct scatterlist *sg;
int i;
 
-   for_each_sgtable_sg(sgt, sg, i)
-   dma_unmap_resource(dev, sg->dma_address,
-  sg->length, dir,
-  DMA_ATTR_SKIP_CPU_SYNC);
+   if (dev) {
+   for_each_sgtable_sg(sgt, sg, i)
+   dma_unmap_resource(dev, sg->dma_address,
+  sg->length, dir,
+  DMA_ATTR_SKIP_CPU_SYNC);
+   }
sg_free_table(sgt);
kfree(sgt);
 }
-- 
2.43.2



[PATCH v4 5/7] drm/amdkfd: Increase KFD bo restore wait time

2024-04-23 Thread Philip Yang
TTM allocating contiguous VRAM may take more than 1 second to evict BOs
for a larger RDMA buffer. Because the KFD restore bo worker reserves all
KFD BOs, TTM cannot then hold the remaining KFD BOs' locks to evict
them, which causes TTM to fail the contiguous VRAM allocation.

Increase the KFD restore BO wait time to 2 seconds, long enough for the
RDMA pin BO path to alloc the contiguous VRAM.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index a81ef232fdef..c205e2d3acf9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -698,7 +698,7 @@ struct qcm_process_device {
 /* KFD Memory Eviction */
 
 /* Approx. wait time before attempting to restore evicted BOs */
-#define PROCESS_RESTORE_TIME_MS 100
+#define PROCESS_RESTORE_TIME_MS 2000
 /* Approx. back off time if restore fails due to lack of memory */
 #define PROCESS_BACK_OFF_TIME_MS 100
 /* Approx. time before evicting the process again */
-- 
2.43.2



[PATCH v4 2/7] drm/amdgpu: Handle sg size limit for contiguous allocation

2024-04-23 Thread Philip Yang
Define macro MAX_SG_SEGMENT_SIZE as 2GB, because struct scatterlist
length is unsigned int, and some users of it cast to a signed int, so
every segment of an sg table is limited to 2GB maximum.

For contiguous VRAM allocation, don't limit the max buddy block size in
order to get contiguous VRAM memory. To work around the sg table segment
size limit, allocate multiple segments if the contiguous size is bigger
than MAX_SG_SEGMENT_SIZE.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 4be8b091099a..6c7133bf51d8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -31,6 +31,8 @@
 #include "amdgpu_atomfirmware.h"
 #include "atom.h"
 
+#define MAX_SG_SEGMENT_SIZE	(2UL << 30)
+
 struct amdgpu_vram_reservation {
u64 start;
u64 size;
@@ -532,9 +534,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
 
BUG_ON(min_block_size < mm->chunk_size);
 
-   /* Limit maximum size to 2GiB due to SG table limitations */
-   size = min(remaining_size, 2ULL << 30);
-
+   size = remaining_size;
		if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
		    !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))
min_block_size = (u64)pages_per_block << PAGE_SHIFT;
@@ -675,7 +675,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
	amdgpu_res_first(res, offset, length, &cursor);
while (cursor.remaining) {
num_entries++;
-		amdgpu_res_next(&cursor, cursor.size);
+		amdgpu_res_next(&cursor, min(cursor.size, MAX_SG_SEGMENT_SIZE));
}
 
r = sg_alloc_table(*sgt, num_entries, GFP_KERNEL);
@@ -695,7 +695,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
	amdgpu_res_first(res, offset, length, &cursor);
for_each_sgtable_sg((*sgt), sg, i) {
phys_addr_t phys = cursor.start + adev->gmc.aper_base;
-   size_t size = cursor.size;
+   unsigned long size = min(cursor.size, MAX_SG_SEGMENT_SIZE);
dma_addr_t addr;
 
addr = dma_map_resource(dev, phys, size, dir,
@@ -708,7 +708,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
sg_dma_address(sg) = addr;
sg_dma_len(sg) = size;
 
-		amdgpu_res_next(&cursor, cursor.size);
+		amdgpu_res_next(&cursor, size);
}
 
return 0;
-- 
2.43.2



[PATCH v4 0/7] Best effort contiguous VRAM allocation

2024-04-23 Thread Philip Yang
This patch series implements a new KFD memory alloc flag for best-effort
contiguous VRAM allocation, to support peer direct access RDMA devices
with limited scatter-gather dma capability.

v2: rebase on patch ("drm/amdgpu: Modify the contiguous flags behaviour")
    to avoid adding the new GEM flag

v3: add patch 2 to handle sg segment size limit (Christian)

v4: remove the buddy block size limit from vram mgr because sg table
    creation already removes the limit, and resource uses u64 to handle
    block start, size (Christian)

Philip Yang (7):
  drm/amdgpu: Support contiguous VRAM allocation
  drm/amdgpu: Handle sg size limit for contiguous allocation
  drm/amdgpu: Evict BOs from same process for contiguous allocation
  drm/amdkfd: Evict BO itself for contiguous allocation
  drm/amdkfd: Increase KFD bo restore wait time
  drm/amdgpu: Skip dma map resource for null RDMA device
  drm/amdkfd: Bump kfd version for contiguous VRAM allocation

 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 20 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  | 45 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  2 +-
 include/uapi/linux/kfd_ioctl.h|  4 +-
 5 files changed, 50 insertions(+), 24 deletions(-)

-- 
2.43.2



Re: [PATCH v3 6/7] drm/amdgpu: Skip dma map resource for null RDMA device

2024-04-22 Thread Philip Yang

On 2024-04-22 10:56, Christian König wrote:
> On 22.04.24 at 15:57, Philip Yang wrote:
>> To test RDMA using a dummy driver on a system without a NIC/RDMA
>> device, the get/put dma pages pass in a null device pointer; skip the
>> dma map/unmap of the resource and sg table to avoid a null pointer
>> access.
>
> Well that is completely illegal and would break IOMMU.
>
> Why does the RDMA driver do that in the first place?

That is the amdp2ptest driver, part of the KFDTest rdma test. The
simple rdma test app and driver are used to test the driver path,
without actually transferring data between machines.

Regards,
Philip

> Regards,
> Christian.
>
>> Signed-off-by: Philip Yang 
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 33 +++-
>>  1 file changed, 19 insertions(+), 14 deletions(-)
>>
>> [...]



Re: [PATCH v3 2/7] drm/amdgpu: Handle sg size limit for contiguous allocation

2024-04-22 Thread Philip Yang

On 2024-04-22 10:40, Christian König wrote:
> On 22.04.24 at 15:57, Philip Yang wrote:
>> Define macro MAX_SG_SEGMENT_SIZE 2GB, because struct scatterlist length
>> is unsigned int, and some users of it cast to a signed int, so every
>> segment of the sg table is limited to a maximum size of 2GB.
>>
>> For contiguous VRAM allocation, don't limit the max buddy block size in
>> order to get contiguous VRAM memory. To work around the sg table segment
>> size limit, allocate multiple segments if the contiguous size is bigger
>> than MAX_SG_SEGMENT_SIZE.
>>
>> Signed-off-by: Philip Yang 
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 17 -
>>  1 file changed, 12 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> index 4be8b091099a..9fe56a21ef88 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> @@ -31,6 +31,8 @@
>>  #include "amdgpu_atomfirmware.h"
>>  #include "atom.h"
>>  
>> +#define MAX_SG_SEGMENT_SIZE	(2UL << 30)
>> +
>>  struct amdgpu_vram_reservation {
>>  	u64 start;
>>  	u64 size;
>> @@ -532,8 +534,13 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>  
>>  	BUG_ON(min_block_size < mm->chunk_size);
>>  
>> -	/* Limit maximum size to 2GiB due to SG table limitations */
>> -	size = min(remaining_size, 2ULL << 30);
>> +	if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
>> +		size = remaining_size;
>> +	else
>> +		/* Limit maximum size to 2GiB due to SG table limitations
>> +		 * for non-contiguous allocation.
>> +		 */
>> +		size = min(remaining_size, MAX_SG_SEGMENT_SIZE);
>
> Well that doesn't make sense, either fix the creation of the sg
> tables or limit the segment size. Not both.

yes, right. We don't need to limit the segment size for non-contiguous
allocation either, as this is handled by min_block_size. I will send a
v4 patch to fix this. Then we could have another patch to remove the
while loop, size and remaining_size to simplify the code in future.

Regards,
Philip

>>  	if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
>>  	    !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))
>> @@ -675,7 +682,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
>>  	amdgpu_res_first(res, offset, length, &cursor);
>>  	while (cursor.remaining) {
>>  		num_entries++;
>> -		amdgpu_res_next(&cursor, cursor.size);
>> +		amdgpu_res_next(&cursor, min(cursor.size, MAX_SG_SEGMENT_SIZE));
>>  	}
>>  
>>  	r = sg_alloc_table(*sgt, num_entries, GFP_KERNEL);
>> @@ -695,7 +702,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
>>  	amdgpu_res_first(res, offset, length, &cursor);
>>  	for_each_sgtable_sg((*sgt), sg, i) {
>>  		phys_addr_t phys = cursor.start + adev->gmc.aper_base;
>> -		size_t size = cursor.size;
>> +		unsigned long size = min(cursor.size, MAX_SG_SEGMENT_SIZE);
>
> Please keep size_t here or use unsigned int, using unsigned long
> just looks like trying to hide the problem.
>
> And I wouldn't use a separate define but rather just INT_MAX instead.
>
> Regards,
> Christian.
>
>>  		dma_addr_t addr;
>>  
>>  		addr = dma_map_resource(dev, phys, size, dir,
>> @@ -708,7 +715,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
>>  		sg_dma_address(sg) = addr;
>>  		sg_dma_len(sg) = size;
>>  
>> -		amdgpu_res_next(&cursor, cursor.size);
>> +		amdgpu_res_next(&cursor, size);
>>  	}
>>  
>>  	return 0;

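The 2GB cap discussed above falls out of the scatterlist types; here is a
small userspace sketch of the overflow concern (sg_like is an abbreviated
stand-in for struct scatterlist, not the kernel definition):

/* scatterlist length is unsigned int, and some users cast it to a
 * signed int, so any segment over INT_MAX is misread as negative;
 * hence the suggestion to simply cap segments at INT_MAX.
 */
#include <stdio.h>
#include <limits.h>

struct sg_like {
	unsigned int length;	/* as in struct scatterlist */
};

int main(void)
{
	struct sg_like sg = { .length = 3U << 30 };	/* 3GB segment */

	printf("unsigned: %u, as signed: %d (INT_MAX = %d)\n",
	       sg.length, (int)sg.length, INT_MAX);
	return 0;
}
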


[PATCH v3 6/7] drm/amdgpu: Skip dma map resource for null RDMA device

2024-04-22 Thread Philip Yang
To test RDMA using a dummy driver on a system without a NIC/RDMA
device, the get/put dma pages calls pass in a null device pointer; skip
the dma map/unmap of the resource and sg table to avoid a null pointer
access.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 33 +++-
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 9fe56a21ef88..0caf2c89ef1d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -705,12 +705,15 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
unsigned long size = min(cursor.size, MAX_SG_SEGMENT_SIZE);
dma_addr_t addr;
 
-   addr = dma_map_resource(dev, phys, size, dir,
-   DMA_ATTR_SKIP_CPU_SYNC);
-   r = dma_mapping_error(dev, addr);
-   if (r)
-   goto error_unmap;
-
+   if (dev) {
+   addr = dma_map_resource(dev, phys, size, dir,
+   DMA_ATTR_SKIP_CPU_SYNC);
+   r = dma_mapping_error(dev, addr);
+   if (r)
+   goto error_unmap;
+   } else {
+   addr = phys;
+   }
sg_set_page(sg, NULL, size, 0);
sg_dma_address(sg) = addr;
sg_dma_len(sg) = size;
@@ -724,10 +727,10 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
for_each_sgtable_sg((*sgt), sg, i) {
if (!sg->length)
continue;
-
-   dma_unmap_resource(dev, sg->dma_address,
-  sg->length, dir,
-  DMA_ATTR_SKIP_CPU_SYNC);
+   if (dev)
+   dma_unmap_resource(dev, sg->dma_address,
+  sg->length, dir,
+  DMA_ATTR_SKIP_CPU_SYNC);
}
sg_free_table(*sgt);
 
@@ -752,10 +755,12 @@ void amdgpu_vram_mgr_free_sgt(struct device *dev,
struct scatterlist *sg;
int i;
 
-   for_each_sgtable_sg(sgt, sg, i)
-   dma_unmap_resource(dev, sg->dma_address,
-  sg->length, dir,
-  DMA_ATTR_SKIP_CPU_SYNC);
+   if (dev) {
+   for_each_sgtable_sg(sgt, sg, i)
+   dma_unmap_resource(dev, sg->dma_address,
+  sg->length, dir,
+  DMA_ATTR_SKIP_CPU_SYNC);
+   }
sg_free_table(sgt);
kfree(sgt);
 }
-- 
2.43.2
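
A hedged sketch of the caller's view after this patch: a dummy RDMA driver
with no struct device can pass dev == NULL and gets back an sg table whose
dma addresses are raw aperture physical addresses. This is a fragment for
illustration, not compilable standalone, and the prototypes are assumed
from the upstream code of this era:

	struct sg_table *sgt;
	int r;

	/* NULL device: dma_map_resource()/dma_unmap_resource() are
	 * skipped and sg_dma_address() holds the physical address.
	 */
	r = amdgpu_vram_mgr_alloc_sgt(adev, bo->tbo.resource, 0,
				      bo->tbo.base.size, NULL,
				      DMA_BIDIRECTIONAL, &sgt);
	if (!r) {
		/* ... hand sgt to the amdp2ptest path ... */
		amdgpu_vram_mgr_free_sgt(NULL, DMA_BIDIRECTIONAL, sgt);
	}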



[PATCH v3 0/7] Best effort contiguous VRAM allocation

2024-04-22 Thread Philip Yang
This patch series implements a new KFD memory alloc flag for best effort
contiguous VRAM allocation, to support peer direct access RDMA devices with
limited scatter-gather dma capability.

v2: rebase on patch ("drm/amdgpu: Modify the contiguous flags behaviour")
to avoid adding the new GEM flag

v3: add patch 2 to handle sg segment size limit (Christian)

Philip Yang (7):
  drm/amdgpu: Support contiguous VRAM allocation
  drm/amdgpu: Handle sg size limit for contiguous allocation
  drm/amdgpu: Evict BOs from same process for contiguous allocation
  drm/amdkfd: Evict BO itself for contiguous allocation
  drm/amdkfd: Increase KFD bo restore wait time
  drm/amdgpu: Skip dma map resource for null RDMA device
  drm/amdkfd: Bump kfd version for contiguous VRAM allocation

 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 20 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  | 50 ---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  2 +-
 include/uapi/linux/kfd_ioctl.h|  4 +-
 5 files changed, 56 insertions(+), 23 deletions(-)

-- 
2.43.2



[PATCH v3 5/7] drm/amdkfd: Increase KFD bo restore wait time

2024-04-22 Thread Philip Yang
TTM allocating contiguous VRAM may take more than 1 second to evict BOs
for a larger RDMA buffer, because the KFD restore BO worker reserves all
KFD BOs, so TTM cannot take the remaining KFD BOs' locks to evict them;
this causes TTM to fail the contiguous VRAM allocation.

Increase the KFD restore BO wait time to 2 seconds, long enough for the
RDMA pin BO path to alloc the contiguous VRAM.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index a81ef232fdef..c205e2d3acf9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -698,7 +698,7 @@ struct qcm_process_device {
 /* KFD Memory Eviction */
 
 /* Approx. wait time before attempting to restore evicted BOs */
-#define PROCESS_RESTORE_TIME_MS 100
+#define PROCESS_RESTORE_TIME_MS 2000
 /* Approx. back off time if restore fails due to lack of memory */
 #define PROCESS_BACK_OFF_TIME_MS 100
 /* Approx. time before evicting the process again */
-- 
2.43.2
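
For context, the constant feeds the delayed-work reschedule in the restore
path; a hedged fragment of how it is wired up (condensed, not the exact
kfd_process.c code):

	/* the restore worker re-runs PROCESS_RESTORE_TIME_MS after an
	 * eviction; with 2000 ms, TTM gets a 2 second window to reserve
	 * and evict the remaining KFD BOs before the worker re-reserves
	 * them all.
	 */
	queue_delayed_work(kfd_restore_wq, &p->restore_work,
			   msecs_to_jiffies(PROCESS_RESTORE_TIME_MS));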



[PATCH v3 7/7] drm/amdkfd: Bump kfd version for contiguous VRAM allocation

2024-04-22 Thread Philip Yang
Bump the kfd ioctl minor version to declare the contiguous VRAM
allocation flag support.

Signed-off-by: Philip Yang 
---
 include/uapi/linux/kfd_ioctl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index c1394c162d4e..a5ebbe98ff7f 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -41,9 +41,10 @@
  * - 1.13 - Add debugger API
  * - 1.14 - Update kfd_event_data
  * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl
+ * - 1.16 - Add contiguous VRAM allocation flag
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 15
+#define KFD_IOCTL_MINOR_VERSION 16
 
 struct kfd_ioctl_get_version_args {
__u32 major_version;/* from KFD */
-- 
2.43.2
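
A userspace consumer would gate use of the new flag on this minor version.
A sketch, assuming a kfd_ioctl.h that already carries this series (error
handling trimmed):

#include <stdio.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

int main(void)
{
	struct kfd_ioctl_get_version_args args = {0};
	unsigned int alloc_flags = 0;
	int fd = open("/dev/kfd", O_RDWR);

	if (fd < 0 || ioctl(fd, AMDKFD_IOC_GET_VERSION, &args))
		return 1;

	if (args.minor_version >= 16)	/* contiguous flag supported */
		alloc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT;

	printf("KFD %u.%u, alloc_flags 0x%x\n",
	       args.major_version, args.minor_version, alloc_flags);
	return 0;
}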



[PATCH v3 3/7] drm/amdgpu: Evict BOs from same process for contiguous allocation

2024-04-22 Thread Philip Yang
When TTM fails to alloc VRAM, TTM tries to evict BOs from VRAM to system
memory and then retries the allocation. This skips the KFD BOs from the
same process because KFD requires all BOs to be resident for user queues.

If TTM allocates contiguous VRAM with the TTM_PL_FLAG_CONTIGUOUS flag,
allow TTM to evict KFD BOs from the same process; this will evict the
user queues first, and restore the queues later, after the contiguous
VRAM allocation.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 851509c6e90e..c907d6005641 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1398,7 +1398,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct 
ttm_buffer_object *bo,
 */
	dma_resv_for_each_fence(&resv_cursor, bo->base.resv,
DMA_RESV_USAGE_BOOKKEEP, f) {
-   if (amdkfd_fence_check_mm(f, current->mm))
+   if (amdkfd_fence_check_mm(f, current->mm) &&
+   !(place->flags & TTM_PL_FLAG_CONTIGUOUS))
return false;
}
 
-- 
2.43.2



[PATCH v3 2/7] drm/amdgpu: Handle sg size limit for contiguous allocation

2024-04-22 Thread Philip Yang
Define macro MAX_SG_SEGMENT_SIZE 2GB, because struct scatterlist length
is unsigned int, and some users of it cast to a signed int, so every
segment of the sg table is limited to a maximum size of 2GB.

For contiguous VRAM allocation, don't limit the max buddy block size in
order to get contiguous VRAM memory. To work around the sg table segment
size limit, allocate multiple segments if the contiguous size is bigger
than MAX_SG_SEGMENT_SIZE.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 4be8b091099a..9fe56a21ef88 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -31,6 +31,8 @@
 #include "amdgpu_atomfirmware.h"
 #include "atom.h"
 
+#define MAX_SG_SEGMENT_SIZE	(2UL << 30)
+
 struct amdgpu_vram_reservation {
u64 start;
u64 size;
@@ -532,8 +534,13 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
 
BUG_ON(min_block_size < mm->chunk_size);
 
-   /* Limit maximum size to 2GiB due to SG table limitations */
-   size = min(remaining_size, 2ULL << 30);
+   if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
+   size = remaining_size;
+   else
+   /* Limit maximum size to 2GiB due to SG table limitations
+    * for non-contiguous allocation.
+    */
+   size = min(remaining_size, MAX_SG_SEGMENT_SIZE);
 
if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
!(size & (((u64)pages_per_block << PAGE_SHIFT) 
- 1)))
@@ -675,7 +682,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
	amdgpu_res_first(res, offset, length, &cursor);
while (cursor.remaining) {
num_entries++;
-   amdgpu_res_next(&cursor, cursor.size);
+   amdgpu_res_next(&cursor, min(cursor.size, MAX_SG_SEGMENT_SIZE));
}
 
r = sg_alloc_table(*sgt, num_entries, GFP_KERNEL);
@@ -695,7 +702,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
	amdgpu_res_first(res, offset, length, &cursor);
for_each_sgtable_sg((*sgt), sg, i) {
phys_addr_t phys = cursor.start + adev->gmc.aper_base;
-   size_t size = cursor.size;
+   unsigned long size = min(cursor.size, MAX_SG_SEGMENT_SIZE);
dma_addr_t addr;
 
addr = dma_map_resource(dev, phys, size, dir,
@@ -708,7 +715,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
sg_dma_address(sg) = addr;
sg_dma_len(sg) = size;
 
-   amdgpu_res_next(&cursor, cursor.size);
+   amdgpu_res_next(&cursor, size);
}
 
return 0;
-- 
2.43.2



[PATCH v3 4/7] drm/amdkfd: Evict BO itself for contiguous allocation

2024-04-22 Thread Philip Yang
If the BO pages pinned for RDMA are not contiguous in VRAM, evict the BO
to system memory first to free the VRAM space, then allocate contiguous
VRAM space, and then move it from system memory back to VRAM.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index ef9154043757..5d118e5580ce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1470,13 +1470,27 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo 
*bo, u32 domain)
if (unlikely(ret))
return ret;
 
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
+   /*
+* If bo is not contiguous on VRAM, move to system memory first 
to ensure
+* we can get contiguous VRAM space after evicting other BOs.
+*/
+   if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) {
+   ret = amdgpu_amdkfd_bo_validate(bo, 
AMDGPU_GEM_DOMAIN_GTT, false);
+   if (unlikely(ret)) {
+   pr_debug("validate bo 0x%p to GTT failed %d\n", 
&bo->tbo, ret);
+   goto out;
+   }
+   }
+   }
+
ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0);
if (ret)
pr_err("Error in Pinning BO to domain: %d\n", domain);
 
amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);
+out:
amdgpu_bo_unreserve(bo);
-
return ret;
 }
 
-- 
2.43.2
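
The resulting pin flow for a best-effort contiguous BO, condensed as a
hedged pseudocode fragment (control flow only, details elided from the
hunk above):

	/* amdgpu_amdkfd_gpuvm_pin_bo(), after this patch:
	 *
	 *   amdgpu_bo_reserve(bo);
	 *   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
	 *       !(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS))
	 *           validate bo to GTT;   // vacate the fragmented VRAM pages
	 *   amdgpu_bo_pin_restricted(bo, domain, 0, 0);
	 *                                 // revalidate: the buddy allocator
	 *                                 // must now satisfy CONTIGUOUS
	 *   amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);
	 *   amdgpu_bo_unreserve(bo);
	 */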



[PATCH v3 1/7] drm/amdgpu: Support contiguous VRAM allocation

2024-04-22 Thread Philip Yang
An RDMA device with limited scatter-gather ability requires contiguous
VRAM buffer allocation for RDMA peer direct support.

Add a new KFD alloc memory flag and store it as the BO alloc flag
AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS. When pinning this BO to export it for
RDMA peerdirect access, this will set the TTM_PL_FLAG_CONTIGUOUS flag
and ask the VRAM buddy allocator for contiguous VRAM.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 
 include/uapi/linux/kfd_ioctl.h   | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 0ae9fd844623..ef9154043757 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1712,6 +1712,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) 
?
AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0;
+
+   /* For contiguous VRAM allocation */
+   if (flags & 
KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT)
+   alloc_flags |= 
AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
}
xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ?
0 : fpriv->xcp_id;
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 2040a470ddb4..c1394c162d4e 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args {
 #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT   (1 << 26)
 #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED   (1 << 25)
 #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT   (1 << 24)
+#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT (1 << 23)
 
 /* Allocate memory for later SVM (shared virtual memory) mapping.
  *
-- 
2.43.2
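
From userspace the new flag rides along the existing alloc ioctl. A hedged
sketch using the raw KFD ioctl; gpu_id, va and size are placeholders that a
real client obtains from the thunk (libhsakmt), and the header is assumed
to carry this series:

#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

static int alloc_contiguous_vram(int kfd_fd, unsigned int gpu_id,
				 unsigned long long va,
				 unsigned long long size)
{
	struct kfd_ioctl_alloc_memory_of_gpu_args args = {
		.va_addr = va,
		.size = size,
		.gpu_id = gpu_id,
		/* best effort: the allocation succeeds even if the VRAM
		 * ends up scattered; only pinning enforces contiguity.
		 */
		.flags = KFD_IOC_ALLOC_MEM_FLAGS_VRAM |
			 KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
			 KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT,
	};

	return ioctl(kfd_fd, AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, &args);
}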



Re: [PATCH] drm/amdkfd: Fix rescheduling of restore worker

2024-04-19 Thread Philip Yang

On 2024-04-19 15:00, Felix Kuehling wrote:

  Handle the case that the restore worker was already scheduled by another
eviction while the restore was in progress.

Fixes: 9a1c1339abf9 ("drm/amdkfd: Run restore_workers on freezable WQs")
Signed-off-by: Felix Kuehling 

Reviewed-by: Philip Yang 

  
---
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index aafdf064651f..58c1fe542193 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -2012,9 +2012,9 @@ static void restore_process_worker(struct work_struct *work)
 	if (ret) {
 		pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d ms\n",
 			 p->pasid, PROCESS_BACK_OFF_TIME_MS);
-		ret = queue_delayed_work(kfd_restore_wq, &p->restore_work,
-msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS));
-		WARN(!ret, "reschedule restore work failed\n");
+		if (mod_delayed_work(kfd_restore_wq, &p->restore_work,
+ msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)))
+			kfd_process_restore_queues(p);
 	}
 }
 


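The fix leans on mod_delayed_work()'s return value; a short annotated
fragment of the pattern (semantics per the workqueue API, surrounding code
condensed from the hunk above):

	/* mod_delayed_work() returns true when the work was already
	 * pending and only its timer was updated, i.e. another eviction
	 * scheduled the restore while this one ran; in that case restart
	 * the queues to undo the extra stop. false means the work was
	 * idle and has now been queued normally.
	 */
	if (mod_delayed_work(kfd_restore_wq, &p->restore_work,
			     msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)))
		kfd_process_restore_queues(p);
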
  



Re: [PATCH v2 1/6] drm/amdgpu: Support contiguous VRAM allocation

2024-04-18 Thread Philip Yang

On 2024-04-18 10:37, Christian König wrote:
> On 18.04.24 at 15:57, Philip Yang wrote:
>> RDMA device with limited scatter-gather ability requires contiguous
>> VRAM buffer allocation for RDMA peer direct support.
>>
>> Add a new KFD alloc memory flag and store it as the BO alloc flag
>> AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS. When pinning this BO to export it
>> for RDMA peerdirect access, this will set the TTM_PL_FLAG_CONTIGUOUS
>> flag and ask the VRAM buddy allocator for contiguous VRAM.
>>
>> Remove the 2GB max memory block size limit for contiguous allocation.
>>
>> Signed-off-by: Philip Yang 
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 9 +++--
>>  include/uapi/linux/kfd_ioctl.h   | 1 +
>>  3 files changed, 12 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index 0ae9fd844623..ef9154043757 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -1712,6 +1712,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
>>  		alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
>>  		alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ?
>>  		AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0;
>> +
>> +		/* For contiguous VRAM allocation */
>> +		if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT)
>> +			alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>>  	}
>>  	xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ?
>>  		0 : fpriv->xcp_id;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> index 4be8b091099a..2f2ae711 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> @@ -532,8 +532,13 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>  
>>  	BUG_ON(min_block_size < mm->chunk_size);
>>  
>> -	/* Limit maximum size to 2GiB due to SG table limitations */
>> -	size = min(remaining_size, 2ULL << 30);
>> +	if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
>> +		size = remaining_size;
>> +	else
>> +		/* Limit maximum size to 2GiB due to SG table limitations
>> +		 * for non-contiguous allocation.
>> +		 */
>> +		size = min(remaining_size, 2ULL << 30);
>
> Oh, I totally missed this in the first review. That won't work
> like that, the sg table limit is still there even if the BO is
> contiguous.
>
> We could only fix up the VRAM P2P support to use multiple segments
> in the sg table.

yes, you are right, I didn't test with buffer size > 4GB, struct
scatterlist->offset, length is unsigned int, this limits each sg_table
entry size < 4GB.

I will do more testing, we should still get >4GB contiguous VRAM, will
add another patch to fix it inside amdgpu_vram_mgr_alloc_sgt, to split
it into multiple sg_table entries, and the RDMA peerdirect app should be
able to handle this case based on sg_table->nents.

Regards,
Philip

> Regards,
> Christian.
>
>>  	if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
>>  	    !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))
>> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
>> index 2040a470ddb4..c1394c162d4e 100644
>> --- a/include/uapi/linux/kfd_ioctl.h
>> +++ b/include/uapi/linux/kfd_ioctl.h
>> @@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args {
>>  #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT	(1 << 26)
>>  #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED	(1 << 25)


Re: [PATCH] drm/amdkfd: Fix eviction fence handling

2024-04-18 Thread Philip Yang
On 2024-04-17 23:14, Felix Kuehling wrote:

Handle the case that dma_fence_get_rcu_safe returns NULL.

If restore work is already scheduled, only update its timer. The same
work item cannot be queued twice, so undo the extra queue eviction.

Fixes: 9a1c1339abf9 ("drm/amdkfd: Run restore_workers on freezable WQs")
Signed-off-by: Felix Kuehling 

Reviewed-by: Philip Yang 

  
---
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index b79986412cd8..aafdf064651f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1922,6 +1922,8 @@ static int signal_eviction_fence(struct kfd_process *p)
 	rcu_read_lock();
	ef = dma_fence_get_rcu_safe(&p->ef);
 	rcu_read_unlock();
+	if (!ef)
+		return -EINVAL;
 
 	ret = dma_fence_signal(ef);
 	dma_fence_put(ef);
@@ -1949,10 +1951,9 @@ static void evict_process_worker(struct work_struct *work)
 		 * they are responsible stopping the queues and scheduling
 		 * the restore work.
 		 */
-		if (!signal_eviction_fence(p))
-			queue_delayed_work(kfd_restore_wq, &p->restore_work,
-msecs_to_jiffies(PROCESS_RESTORE_TIME_MS));
-		else
+		if (signal_eviction_fence(p) ||
+		    mod_delayed_work(kfd_restore_wq, &p->restore_work,
+ msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)))
 			kfd_process_restore_queues(p);
 
 		pr_debug("Finished evicting pasid 0x%x\n", p->pasid);


  

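The NULL check added above follows the usual RCU-safe fence lookup pattern;
a hedged fragment for reference, matching the hunk with the locking spelled
out:

	/* dma_fence_get_rcu_safe() re-reads p->ef until it takes a
	 * consistent reference, and returns NULL if the fence was
	 * already replaced or cleared, which the caller now handles.
	 */
	rcu_read_lock();
	ef = dma_fence_get_rcu_safe(&p->ef);
	rcu_read_unlock();
	if (!ef)
		return -EINVAL;

	ret = dma_fence_signal(ef);
	dma_fence_put(ef);
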


[PATCH v2 3/6] drm/amdkfd: Evict BO itself for contiguous allocation

2024-04-18 Thread Philip Yang
If the BO pages pinned for RDMA are not contiguous in VRAM, evict the BO
to system memory first to free the VRAM space, then allocate contiguous
VRAM space, and then move it from system memory back to VRAM.

Signed-off-by: Philip Yang 
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c| 17 -
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index ef9154043757..ff7f54741661 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1470,13 +1470,28 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo 
*bo, u32 domain)
if (unlikely(ret))
return ret;
 
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
+   /*
+* If bo is not contiguous on VRAM, move to system memory first 
to ensure
+* we can get contiguous VRAM space after evicting other BOs.
+*/
+   if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) {
+   ret = amdgpu_amdkfd_bo_validate(bo, 
AMDGPU_GEM_DOMAIN_GTT, false);
+   if (unlikely(ret)) {
+   pr_debug("validate bo 0x%p to GTT failed %d\n", 
&bo->tbo, ret);
+   goto out;
+   }
+   }
+   }
+
ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0);
if (ret)
pr_err("Error in Pinning BO to domain: %d\n", domain);
 
amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);
-   amdgpu_bo_unreserve(bo);
 
+out:
+   amdgpu_bo_unreserve(bo);
return ret;
 }
 
-- 
2.43.2



[PATCH v2 1/6] drm/amdgpu: Support contiguous VRAM allocation

2024-04-18 Thread Philip Yang
An RDMA device with limited scatter-gather ability requires contiguous
VRAM buffer allocation for RDMA peer direct support.

Add a new KFD alloc memory flag and store it as the BO alloc flag
AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS. When pinning this BO to export it for
RDMA peerdirect access, this will set the TTM_PL_FLAG_CONTIGUOUS flag
and ask the VRAM buddy allocator for contiguous VRAM.

Remove the 2GB max memory block size limit for contiguous allocation.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 9 +++--
 include/uapi/linux/kfd_ioctl.h   | 1 +
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 0ae9fd844623..ef9154043757 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1712,6 +1712,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) 
?
AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0;
+
+   /* For contiguous VRAM allocation */
+   if (flags & 
KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT)
+   alloc_flags |= 
AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
}
xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ?
0 : fpriv->xcp_id;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 4be8b091099a..2f2ae711 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -532,8 +532,13 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager 
*man,
 
BUG_ON(min_block_size < mm->chunk_size);
 
-   /* Limit maximum size to 2GiB due to SG table limitations */
-   size = min(remaining_size, 2ULL << 30);
+   if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
+   size = remaining_size;
+   else
+   /* Limit maximum size to 2GiB due to SG table limitations
+    * for non-contiguous allocation.
+    */
+   size = min(remaining_size, 2ULL << 30);
 
if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
!(size & (((u64)pages_per_block << PAGE_SHIFT) 
- 1)))
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 2040a470ddb4..c1394c162d4e 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args {
 #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT   (1 << 26)
 #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED   (1 << 25)
 #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT   (1 << 24)
+#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT (1 << 23)
 
 /* Allocate memory for later SVM (shared virtual memory) mapping.
  *
-- 
2.43.2



[PATCH v2 5/6] drm/amdgpu: Skip dma map resource for null RDMA device

2024-04-18 Thread Philip Yang
To test RDMA using a dummy driver on a system without a NIC/RDMA
device, the get/put dma pages calls pass in a null device pointer; skip
the dma map/unmap of the resource to avoid a null pointer access.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 33 +++-
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 2f2ae711..4c512a372ec7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -703,12 +703,15 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
size_t size = cursor.size;
dma_addr_t addr;
 
-   addr = dma_map_resource(dev, phys, size, dir,
-   DMA_ATTR_SKIP_CPU_SYNC);
-   r = dma_mapping_error(dev, addr);
-   if (r)
-   goto error_unmap;
-
+   if (dev) {
+   addr = dma_map_resource(dev, phys, size, dir,
+   DMA_ATTR_SKIP_CPU_SYNC);
+   r = dma_mapping_error(dev, addr);
+   if (r)
+   goto error_unmap;
+   } else {
+   addr = phys;
+   }
sg_set_page(sg, NULL, size, 0);
sg_dma_address(sg) = addr;
sg_dma_len(sg) = size;
@@ -722,10 +725,10 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
for_each_sgtable_sg((*sgt), sg, i) {
if (!sg->length)
continue;
-
-   dma_unmap_resource(dev, sg->dma_address,
-  sg->length, dir,
-  DMA_ATTR_SKIP_CPU_SYNC);
+   if (dev)
+   dma_unmap_resource(dev, sg->dma_address,
+  sg->length, dir,
+  DMA_ATTR_SKIP_CPU_SYNC);
}
sg_free_table(*sgt);
 
@@ -750,10 +753,12 @@ void amdgpu_vram_mgr_free_sgt(struct device *dev,
struct scatterlist *sg;
int i;
 
-   for_each_sgtable_sg(sgt, sg, i)
-   dma_unmap_resource(dev, sg->dma_address,
-  sg->length, dir,
-  DMA_ATTR_SKIP_CPU_SYNC);
+   if (dev) {
+   for_each_sgtable_sg(sgt, sg, i)
+   dma_unmap_resource(dev, sg->dma_address,
+  sg->length, dir,
+  DMA_ATTR_SKIP_CPU_SYNC);
+   }
sg_free_table(sgt);
kfree(sgt);
 }
-- 
2.43.2



[PATCH v2 6/6] drm/amdkfd: Bump kfd version for contiguous VRAM allocation

2024-04-18 Thread Philip Yang
Bump the kfd ioctl minor version to declare the contiguous VRAM
allocation flag support.

Signed-off-by: Philip Yang 
---
 include/uapi/linux/kfd_ioctl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index c1394c162d4e..a0af2ef696ea 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -41,9 +41,10 @@
  * - 1.13 - Add debugger API
  * - 1.14 - Update kfd_event_data
  * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl
+ * - 1.16 - Add contiguous VRAM allocation flag for RDMA
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 15
+#define KFD_IOCTL_MINOR_VERSION 16
 
 struct kfd_ioctl_get_version_args {
__u32 major_version;/* from KFD */
-- 
2.43.2



[PATCH v2 4/6] drm/amdkfd: Increase KFD bo restore wait time

2024-04-18 Thread Philip Yang
TTM allocating contiguous VRAM may take more than 1 second to evict BOs
for a larger RDMA buffer, because the KFD restore BO worker reserves all
KFD BOs, so TTM cannot take the remaining KFD BOs' locks to evict them;
this causes TTM to fail the contiguous VRAM allocation.

Increase the KFD restore BO wait time to 2 seconds, long enough for the
RDMA pin BO path to alloc the contiguous VRAM.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index a81ef232fdef..c205e2d3acf9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -698,7 +698,7 @@ struct qcm_process_device {
 /* KFD Memory Eviction */
 
 /* Approx. wait time before attempting to restore evicted BOs */
-#define PROCESS_RESTORE_TIME_MS 100
+#define PROCESS_RESTORE_TIME_MS 2000
 /* Approx. back off time if restore fails due to lack of memory */
 #define PROCESS_BACK_OFF_TIME_MS 100
 /* Approx. time before evicting the process again */
-- 
2.43.2



[PATCH v2 2/6] drm/amdgpu: Evict BOs from same process for contiguous allocation

2024-04-18 Thread Philip Yang
When TTM fails to alloc VRAM, TTM tries to evict BOs from VRAM to system
memory and then retries the allocation. This skips the KFD BOs from the
same process because KFD requires all BOs to be resident for user queues.

If TTM allocates contiguous VRAM with the TTM_PL_FLAG_CONTIGUOUS flag,
allow TTM to evict KFD BOs from the same process; this will evict the
user queues first, and restore the queues later, after the contiguous
VRAM allocation.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 851509c6e90e..c907d6005641 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1398,7 +1398,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct 
ttm_buffer_object *bo,
 */
	dma_resv_for_each_fence(&resv_cursor, bo->base.resv,
DMA_RESV_USAGE_BOOKKEEP, f) {
-   if (amdkfd_fence_check_mm(f, current->mm))
+   if (amdkfd_fence_check_mm(f, current->mm) &&
+   !(place->flags & TTM_PL_FLAG_CONTIGUOUS))
return false;
}
 
-- 
2.43.2



[PATCH v2 0/6] Best effort contiguous VRAM allocation

2024-04-18 Thread Philip Yang
This patch series implements a new KFD memory alloc flag for best effort
contiguous VRAM allocation, to support peer direct access RDMA devices with
limited scatter-gather dma capability.

v2: rebase on patch ("drm/amdgpu: Modify the contiguous flags behaviour")
to avoid adding the new GEM flag

Philip Yang (6):
  drm/amdgpu: Support contiguous VRAM allocation
  drm/amdgpu: Evict BOs from same process for contiguous allocation
  drm/amdkfd: Evict BO itself for contiguous allocation
  drm/amdkfd: Increase KFD bo restore wait time
  drm/amdgpu: Skip dma map resource for null RDMA device
  drm/amdkfd: Bump kfd version for contiguous VRAM allocation

 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 21 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  | 42 ---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  2 +-
 include/uapi/linux/kfd_ioctl.h|  4 +-
 5 files changed, 52 insertions(+), 20 deletions(-)

-- 
2.43.2



Re: [PATCH v2] drm/amdgpu: Modify the contiguous flags behaviour

2024-04-17 Thread Philip Yang

On 2024-04-17 10:32, Paneer Selvam, Arunpravin wrote:
> Hi Christian,
>
> On 4/17/2024 6:57 PM, Paneer Selvam, Arunpravin wrote:
>> Hi Christian,
>>
>> On 4/17/2024 12:19 PM, Christian König wrote:
>>> On 17.04.24 at 08:21, Arunpravin Paneer Selvam wrote:
>>>> Now we have two flags for contiguous VRAM buffer allocation.
>>>> If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
>>>> it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
>>>> buffer's placement function.
>>>>
>>>> This patch will change the default behaviour of the two flags.
>>>>
>>>> When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
>>>> - This means contiguous is not mandatory.
>>>> - we will try to allocate the contiguous buffer. Say if the
>>>>   allocation fails, we fallback to allocate the individual pages.
>>>>
>>>> When we set TTM_PL_FLAG_CONTIGUOUS
>>>> - This means contiguous allocation is mandatory.
>>>> - we are setting this in amdgpu_bo_pin_restricted() before bo validation
>>>>   and check this flag in the vram manager file.
>>>> - if this is set, we should allocate the buffer pages contiguously.
>>>>   the allocation fails, we return -ENOSPC.
>>>>
>>>> v2:
>>>>   - keep the mem_flags and bo->flags check as is (Christian)
>>>>   - place the TTM_PL_FLAG_CONTIGUOUS flag setting into the
>>>>     amdgpu_bo_pin_restricted function placement range iteration
>>>>     loop (Christian)
>>>>   - rename find_pages with amdgpu_vram_mgr_calculate_pages_per_block
>>>>     (Christian)
>>>>   - Keep the kernel BO allocation as is (Christian)
>>>>   - If BO pin vram allocation failed, we need to return -ENOSPC as
>>>>     RDMA cannot work with scattered VRAM pages (Philip)
>>>>
>>>> Signed-off-by: Arunpravin Paneer Selvam 
>>>> Suggested-by: Christian König 
>>>> ---
>>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   |  8 ++-
>>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++-
>>>>  2 files changed, 50 insertions(+), 15 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>>> index 8bc79924d171..caaef7b1df49 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>>> @@ -153,8 +153,10 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain)
>>>>  		else
>>>>  			places[c].flags |= TTM_PL_FLAG_TOPDOWN;
>>>>  
>>>> -		if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
>>>> +		if (abo->tbo.type == ttm_bo_type_kernel &&
>>>> +		    flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
>>>>  			places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
>>>> +
>>>>  		c++;
>>>>  	}
>>>>  
>>>> @@ -966,6 +968,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
>>>>  			if (!bo->placements[i].lpfn ||
>>>>  			    (lpfn && lpfn < bo->placements[i].lpfn))
>>>>  				bo->placements[i].lpfn = lpfn;
>>>> +
>>>> +		if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
>>>> +		    bo->placements[i].mem_type == TTM_PL_VRAM)
>>>> +			bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
>>>>  	}
>>>>  
>>>>  	r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>>>
>>> Nice work, up till here that looks exactly right as far as I can see.
>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c

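The split of responsibilities this thread converges on can be summarized in
a hedged fragment (names as in the patch under review; a summary, not the
final upstream code):

	/* AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS (GEM flag, allocation time):
	 * best effort only; the buddy allocator tries for one block but
	 * may fall back to scattered pages.
	 *
	 * TTM_PL_FLAG_CONTIGUOUS (TTM place flag, set while pinning):
	 * mandatory; applied in amdgpu_bo_pin_restricted() so the
	 * revalidate either yields contiguous VRAM or fails (-ENOSPC).
	 */
	if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
	    bo->placements[i].mem_type == TTM_PL_VRAM)
		bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
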
Re: [PATCH 1/6] drm/amdgpu: Support contiguous VRAM allocation

2024-04-16 Thread Philip Yang

On 2024-04-15 08:02, Christian König wrote:
> On 12.04.24 at 22:12, Philip Yang wrote:
>> RDMA device with limited scatter-gather capability requires physical
>> address contiguous VRAM buffer for RDMA peer direct access.
>>
>> Add a new KFD alloc memory flag and store it as a new GEM BO alloc
>> flag. When pinning this buffer object to export it for RDMA peerdirect
>> access, set the AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS flag, and then
>> vram_mgr will set the TTM_PL_FLAG_CONTIGUOUS flag to ask the VRAM
>> buddy allocator for contiguous VRAM.
>>
>> Remove the 2GB max memory block size limit for contiguous allocation.
>
> I'm going to sync up with Arun on this once more, but I think we
> won't even need the new flag.
>
> We will just downgrade the existing flag to be a best effort
> allocation for contiguous buffers and only use the TTM flag
> internally to signal that we need to alter it while pinning.

sure, I will rebase this patch series to "[PATCH] drm/amdgpu: Modify
the contiguous flags behaviour", this will remove the new flag.

Will send the v2 patch series after Arun's v2 patch.

Regards,
Philip

> Regards,
> Christian.
>
>> Signed-off-by: Philip Yang 
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 7 +++
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 9 +++--
>>  include/uapi/drm/amdgpu_drm.h    | 5 +
>>  include/uapi/linux/kfd_ioctl.h   | 1 +
>>  4 files changed, 20 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index 0ae9fd844623..3523b91f8add 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -1470,6 +1470,9 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain)
>>  	if (unlikely(ret))
>>  		return ret;
>>  
>> +	if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS_BEST_EFFORT)
>> +		bo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>> +
>>  	ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0);
>>  	if (ret)
>>  		pr_err("Error in Pinning BO to domain: %d\n", domain);
>> @@ -1712,6 +1715,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
>>  		alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
>>  		alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ?
>>  		AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0;
>> +
>> +		/* For contiguous VRAM allocation */
>> +		if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT)
>> +			alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS_BEST_EFFORT;
>>  	}
>>  	xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ?
>>  		0 : fpriv->xcp_id;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> index 8db880244324..1d6e45e238e1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> @@ -516,8 +516,13 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>  
>>  	BUG_ON(min_block_size < mm->chunk_size);
>>  
>> -	/* Limit maximum size to 2GiB due to SG table limitations */
>> -	size = min(remaining_size, 2ULL << 30);
>> +	if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
>> +		size = remaining_size;
>> +	else
>> +		/* Limit maximum size to 2GiB due to SG table limitations
>> +		 * for non-contiguous allocation.
>> +		 */
>> +		size = min(remaining_size, 2ULL << 30);
>>  
>>  	if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
>>  	    !(size & (((u64)pages_per_block << PAGE_SHIF

Re: [PATCH] drm/amdgpu: Modify the contiguous flags behaviour

2024-04-16 Thread Philip Yang

On 2024-04-16 02:50, Paneer Selvam, Arunpravin wrote:
> On 4/16/2024 3:32 AM, Philip Yang wrote:
>> On 2024-04-14 10:57, Arunpravin Paneer Selvam wrote:
>>> Now we have two flags for contiguous VRAM buffer allocation.
>>> If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
>>> it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
>>> buffer's placement function.
>>>
>>> This patch will change the default behaviour of the two flags.
>>
>> This change will simplify the KFD best effort contiguous VRAM
>> allocation, because KFD doesn't need to set a new GEM_ flag.
>>
>>> When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
>>> - This means contiguous is not mandatory.
>>
>> AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS is used in a couple of places. For
>> page table BOs, it is fine as the BO size is the page size 4K. For
>> 64KB reserved BOs and F/W size related BOs, do all allocations happen
>> at driver initialization, before the VRAM is fragmented?
>>
>>> - we will try to allocate the contiguous buffer. Say if the
>>>   allocation fails, we fallback to allocate the individual pages.
>>>
>>> When we set TTM_PL_FLAG_CONTIGUOUS
>>> - This means contiguous allocation is mandatory.
>>> - we are setting this in amdgpu_bo_pin_restricted() before bo validation
>>>   and check this flag in the vram manager file.
>>> - if this is set, we should allocate the buffer pages contiguously.
>>>   the allocation fails, we return -ENOSPC.
>>>
>>> Signed-off-by: Arunpravin Paneer Selvam 
>>> Suggested-by: Christian König 
>>> ---
>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   | 14 +++--
>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++-
>>>  2 files changed, 49 insertions(+), 22 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> index 8bc79924d171..41926d631563 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> @@ -153,8 +153,6 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain)
>>>  		else
>>>  			places[c].flags |= TTM_PL_FLAG_TOPDOWN;
>>>  
>>> -		if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
>>> -			places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
>>>  		c++;
>>>  	}
>>>  
>>> @@ -899,6 +897,8 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
>>>  {
>>>  	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
>>>  	struct ttm_operation_ctx ctx = { false, false };
>>> +	struct ttm_place *places = bo->placements;
>>> +	u32 c = 0;
>>>  	int r, i;
>>>  
>>>  	if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm))
>>> @@ -921,16 +921,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
>>>  
>>>  	if (bo->tbo.pin_count) {
>>>  		uint32_t mem_type = bo->tbo.resource->mem_type;
>>> -		uint32_t mem_flags = bo->tbo.resource->placement;
>>>  
>>>  		if (!(domain & amdgpu_mem_type_to_domain(mem_type)))
>>>  			return -EINVAL;
>>>  
>>> -		if ((mem_type == TTM_PL_VRAM) &&
>>> -		    (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) &&
>>> -		    !(mem_flags & TTM_PL_FLAG_CONTIGUOUS))
>>> -			return -EINVAL;
>>> -
>>
>> This looks like a bug before, but with this patch, the check makes
>> sense and is needed.
>>
>>>  		ttm_bo_pin(&bo->tbo);
>>>  
>>>  		if (max_offset != 0) {
>>> @@ -968,6 +962,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
>>>  			bo->placements[i].lpfn = lpfn;

Re: [PATCH] drm/amdgpu: Modify the contiguous flags behaviour

2024-04-15 Thread Philip Yang

On 2024-04-14 10:57, Arunpravin Paneer Selvam wrote:

Now we have two flags for contiguous VRAM buffer allocation.
If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
buffer's placement function.

This patch will change the default behaviour of the two flags.

This change will simplify the KFD best effort contiguous VRAM
allocation, because KFD doesn't need to set a new GEM_ flag.

When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
- This means contiguous is not mandatory.

AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS is used in a couple of places. For
page table BOs, it is fine as the BO size is the page size 4K. For 64KB
reserved BOs and F/W size related BOs, do all allocations happen at
driver initialization, before the VRAM is fragmented?

- we will try to allocate the contiguous buffer. Say if the
  allocation fails, we fallback to allocate the individual pages.

When we set TTM_PL_FLAG_CONTIGUOUS
- This means contiguous allocation is mandatory.
- we are setting this in amdgpu_bo_pin_restricted() before bo validation
  and check this flag in the vram manager file.
- if this is set, we should allocate the buffer pages contiguously.
  the allocation fails, we return -ENOSPC.

Signed-off-by: Arunpravin Paneer Selvam 
Suggested-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   | 14 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++-
 2 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 8bc79924d171..41926d631563 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -153,8 +153,6 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain)
 		else
 			places[c].flags |= TTM_PL_FLAG_TOPDOWN;
 
-		if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
-			places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
 		c++;
 	}
 
@@ -899,6 +897,8 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
 {
 	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
 	struct ttm_operation_ctx ctx = { false, false };
+	struct ttm_place *places = bo->placements;
+	u32 c = 0;
 	int r, i;
 
 	if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm))
@@ -921,16 +921,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
 
 	if (bo->tbo.pin_count) {
 		uint32_t mem_type = bo->tbo.resource->mem_type;
-		uint32_t mem_flags = bo->tbo.resource->placement;
 
 		if (!(domain & amdgpu_mem_type_to_domain(mem_type)))
 			return -EINVAL;
 
-		if ((mem_type == TTM_PL_VRAM) &&
-		(bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) &&
-		!(mem_flags & TTM_PL_FLAG_CONTIGUOUS))
-			return -EINVAL;
-

This looks like a bug before, but with this patch, the check makes
sense and is needed.

  
 		ttm_bo_pin(&bo->tbo);
 
 		if (max_offset != 0) {
@@ -968,6 +962,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
 			bo->placements[i].lpfn = lpfn;
 	}
 
+	if (domain & AMDGPU_GEM_DOMAIN_VRAM &&
+	!WARN_ON(places[c].mem_type != TTM_PL_VRAM))
+		places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
+

If the pinned BO is not allocated with AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
we should pin and return scattered pages, because RDMA supports scattered
dmabuf. Christian also pointed this out.

    if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
        bo->placements[i].mem_type == TTM_PL_VRAM)
            bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;

  
 	r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
 	if (unlikely(r)) {
 		dev_err(adev->dev, "%p pin failed\n", bo);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 8db880244324..ddbf302878f6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -88,6 +88,30 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct list_head *head)
 	return size;
 }
 
+static inline unsigned long
+amdgpu_vram_find_pages_per_block(struct ttm_buffer_object *tbo,
+ const struct ttm_place *place,
+ unsigned long bo_flags)
+{
+	unsigned long pages_per_block;
+
+	if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS ||
+	place->flags & TTM_PL_FLAG_CONTIGUOUS) {
+		pages_per_block = ~0ul;
+	} else {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+		pages_per_block = HPAGE_PMD_NR;
+#else
+		/* default to 2MB */
+		pages_per_block = 2UL << (20UL - PAGE_SHIFT);
+#endif
+		pages_per_block = max_t(uint32_t, pages_per_block,
+	tbo->page_alignment);
+	}
+
+	return pages_per_block;
+}
+
 /**
  * DOC: mem_info_vram_total
  *
@@ -451,8 +475,10 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 	struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
 	struct amdgpu_device *adev 

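The pages_per_block selection quoted above is the knob that implements
"best effort"; here is a standalone model of that logic, with literal
values standing in for PAGE_SHIFT and HPAGE_PMD_NR:

#include <stdio.h>

#define MODEL_PAGE_SHIFT 12

/* Model of amdgpu_vram_mgr_calculate_pages_per_block() from the quoted
 * patch: contiguous requests lift the block size cap entirely (~0ul),
 * everything else defaults to 2MB blocks, so the buddy allocator only
 * builds one giant block when explicitly asked to.
 */
static unsigned long pages_per_block(int contiguous, unsigned long alignment)
{
	unsigned long pages;

	if (contiguous)
		return ~0ul;	/* one block, whatever the size */

	pages = 2UL << (20UL - MODEL_PAGE_SHIFT);	/* default to 2MB */
	return pages > alignment ? pages : alignment;
}

int main(void)
{
	printf("scattered: %lu pages/block, contiguous: %lu\n",
	       pages_per_block(0, 1), pages_per_block(1, 1));
	return 0;
}
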
[PATCH 5/6] drm/amdgpu: Skip dma map resource for null RDMA device

2024-04-12 Thread Philip Yang
To test RDMA using a dummy driver on a system without a NIC/RDMA
device, the get dma pages call passes in a null device pointer; skip
the dma map resource to avoid a null device pointer access.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 33 +++-
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 1d6e45e238e1..93fb63f4dae5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -674,12 +674,15 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
size_t size = cursor.size;
dma_addr_t addr;
 
-   addr = dma_map_resource(dev, phys, size, dir,
-   DMA_ATTR_SKIP_CPU_SYNC);
-   r = dma_mapping_error(dev, addr);
-   if (r)
-   goto error_unmap;
-
+   if (dev) {
+   addr = dma_map_resource(dev, phys, size, dir,
+   DMA_ATTR_SKIP_CPU_SYNC);
+   r = dma_mapping_error(dev, addr);
+   if (r)
+   goto error_unmap;
+   } else {
+   addr = phys;
+   }
sg_set_page(sg, NULL, size, 0);
sg_dma_address(sg) = addr;
sg_dma_len(sg) = size;
@@ -693,10 +696,10 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
for_each_sgtable_sg((*sgt), sg, i) {
if (!sg->length)
continue;
-
-   dma_unmap_resource(dev, sg->dma_address,
-  sg->length, dir,
-  DMA_ATTR_SKIP_CPU_SYNC);
+   if (dev)
+   dma_unmap_resource(dev, sg->dma_address,
+  sg->length, dir,
+  DMA_ATTR_SKIP_CPU_SYNC);
}
sg_free_table(*sgt);
 
@@ -721,10 +724,12 @@ void amdgpu_vram_mgr_free_sgt(struct device *dev,
struct scatterlist *sg;
int i;
 
-   for_each_sgtable_sg(sgt, sg, i)
-   dma_unmap_resource(dev, sg->dma_address,
-  sg->length, dir,
-  DMA_ATTR_SKIP_CPU_SYNC);
+   if (dev) {
+   for_each_sgtable_sg(sgt, sg, i)
+   dma_unmap_resource(dev, sg->dma_address,
+  sg->length, dir,
+  DMA_ATTR_SKIP_CPU_SYNC);
+   }
sg_free_table(sgt);
kfree(sgt);
 }
-- 
2.43.2



[PATCH 2/6] drm/amdgpu: Evict BOs from same process for contiguous allocation

2024-04-12 Thread Philip Yang
When TTM fails to alloc VRAM, TTM evicts BOs from VRAM to system memory
and then retries the allocation. This currently skips the KFD BOs from
the same process because KFD requires all BOs to be resident for user
queues.

If a TTM BO is allocated contiguous VRAM with the TTM_PL_FLAG_CONTIGUOUS
flag, allow TTM to evict KFD BOs from the same process; this will evict
the user queues first, and restore the queues later, after the
contiguous VRAM allocation.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index a5ceec7820cf..00b8603d73e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1383,7 +1383,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct 
ttm_buffer_object *bo,
 */
	dma_resv_for_each_fence(&resv_cursor, bo->base.resv,
DMA_RESV_USAGE_BOOKKEEP, f) {
-   if (amdkfd_fence_check_mm(f, current->mm))
+   if (amdkfd_fence_check_mm(f, current->mm) &&
+   !(place->flags & TTM_PL_FLAG_CONTIGUOUS))
return false;
}
 
-- 
2.43.2



[PATCH 6/6] drm/amdkfd: Bump kfd version for contiguous VRAM allocation

2024-04-12 Thread Philip Yang
Bump the kfd ioctl minor version to declare the contiguous VRAM
allocation flag support.

Signed-off-by: Philip Yang 
---
 include/uapi/linux/kfd_ioctl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index c1394c162d4e..a0af2ef696ea 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -41,9 +41,10 @@
  * - 1.13 - Add debugger API
  * - 1.14 - Update kfd_event_data
  * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl
+ * - 1.16 - Add contiguous VRAM allocation flag for RDMA
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 15
+#define KFD_IOCTL_MINOR_VERSION 16
 
 struct kfd_ioctl_get_version_args {
__u32 major_version;/* from KFD */
-- 
2.43.2



[PATCH 4/6] drm/amdkfd: Increase KFD bo restore wait time

2024-04-12 Thread Philip Yang
TTM allocating contiguous VRAM may take more than 1 second to evict BOs
for a larger RDMA buffer, because the KFD restore BO worker reserves all
KFD BOs, so TTM cannot take the remaining KFD BOs' locks to evict them;
this may cause TTM to fail the contiguous VRAM allocation.

Increase the KFD restore BO wait time to 2 seconds, long enough for the
RDMA pin BO path to finish the contiguous VRAM allocation.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index a81ef232fdef..c205e2d3acf9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -698,7 +698,7 @@ struct qcm_process_device {
 /* KFD Memory Eviction */
 
 /* Approx. wait time before attempting to restore evicted BOs */
-#define PROCESS_RESTORE_TIME_MS 100
+#define PROCESS_RESTORE_TIME_MS 2000
 /* Approx. back off time if restore fails due to lack of memory */
 #define PROCESS_BACK_OFF_TIME_MS 100
 /* Approx. time before evicting the process again */
-- 
2.43.2



[PATCH 3/6] drm/amdkfd: Evict BO itself for contiguous allocation

2024-04-12 Thread Philip Yang
If the BO pages pinned for RDMA are not contiguous in VRAM, evict the BO
to system memory first to free the VRAM space, then allocate contiguous
VRAM and move it from system memory back to VRAM.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 3523b91f8add..9506de1094ec 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1470,8 +1470,21 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain)
if (unlikely(ret))
return ret;
 
-   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS_BEST_EFFORT)
+	if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS_BEST_EFFORT) {
+		/*
+		 * If bo is not contiguous on VRAM, move to system memory first
+		 * to ensure we can get contiguous VRAM space after evicting
+		 * other BOs.
+		 */
+		if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) {
+			ret = amdgpu_amdkfd_bo_validate(bo, AMDGPU_GEM_DOMAIN_GTT, false);
+			if (unlikely(ret)) {
+				pr_debug("validate bo 0x%p to GTT failed %d\n", &bo->tbo, ret);
+				return ret;
+			}
+		}
+
 		bo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
+	}
 
ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0);
if (ret)
-- 
2.43.2



[PATCH 1/6] drm/amdgpu: Support contiguous VRAM allocation

2024-04-12 Thread Philip Yang
An RDMA device with limited scatter-gather capability requires a
physically contiguous VRAM buffer for RDMA peer direct access.

Add a new KFD alloc memory flag and store it as a new GEM BO alloc
flag. When pinning this buffer object to export it for RDMA peerdirect
access, set the AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS flag, and vram_mgr
will then set the TTM_PL_FLAG_CONTIGUOUS flag to ask the VRAM buddy
allocator for contiguous VRAM.

Remove the 2GB max memory block size limit for contiguous allocations.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 7 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 9 +++--
 include/uapi/drm/amdgpu_drm.h| 5 +
 include/uapi/linux/kfd_ioctl.h   | 1 +
 4 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 0ae9fd844623..3523b91f8add 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1470,6 +1470,9 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain)
if (unlikely(ret))
return ret;
 
+   if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS_BEST_EFFORT)
+   bo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
+
ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0);
if (ret)
pr_err("Error in Pinning BO to domain: %d\n", domain);
@@ -1712,6 +1715,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
 		alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
 		alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ?
 			AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0;
+
+		/* For contiguous VRAM allocation */
+		if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT)
+			alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS_BEST_EFFORT;
 	}
xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ?
0 : fpriv->xcp_id;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 8db880244324..1d6e45e238e1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -516,8 +516,13 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 
BUG_ON(min_block_size < mm->chunk_size);
 
-		/* Limit maximum size to 2GiB due to SG table limitations */
-		size = min(remaining_size, 2ULL << 30);
+		if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
+			size = remaining_size;
+		else
+			/* Limit maximum size to 2GiB due to SG table
+			 * limitations for non-contiguous allocations.
+			 */
+			size = min(remaining_size, 2ULL << 30);
 
 		if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
 		    !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index ad21c613fec8..13645abb8e46 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -171,6 +171,11 @@ extern "C" {
  * may override the MTYPE selected in AMDGPU_VA_OP_MAP.
  */
 #define AMDGPU_GEM_CREATE_EXT_COHERENT (1 << 15)
/* Flag to allocate the BO with best effort for contiguous VRAM.
 * If no contiguous VRAM is available, fall back to a scattered allocation.
 * Pinning the BO for peerdirect RDMA triggers VRAM defragmentation.
 */
+#define AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS_BEST_EFFORT  (1 << 16)
 
 struct drm_amdgpu_gem_create_in  {
/** the requested memory size */
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 2040a470ddb4..c1394c162d4e 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args {
 #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT   (1 << 26)
 #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED   (1 << 25)
 #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT   (1 << 24)
+#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT (1 << 23)
 
 /* Allocate memory for later SVM (shared virtual memory) mapping.
  *
-- 
2.43.2
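
From user space the new flag combines with the existing VRAM allocation
flags. A hedged sketch of the ioctl call, where the caller-supplied
gpu_id, va_addr and size are placeholders and error handling is
trimmed:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

/* Ask for best-effort physically contiguous VRAM for an RDMA buffer */
static int alloc_contiguous_vram(int kfd_fd, __u32 gpu_id,
				 __u64 va_addr, __u64 size, __u64 *handle)
{
	struct kfd_ioctl_alloc_memory_of_gpu_args args;

	memset(&args, 0, sizeof(args));
	args.va_addr = va_addr;
	args.size = size;
	args.gpu_id = gpu_id;
	args.flags = KFD_IOC_ALLOC_MEM_FLAGS_VRAM |
		     KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
		     KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT;

	if (ioctl(kfd_fd, AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, &args))
		return -1;

	*handle = args.handle;
	return 0;
}

Since the flag is best effort, the allocation still succeeds scattered
when contiguous space cannot be freed up; the contiguity only becomes a
hard requirement at pin time.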



[PATCH 0/6] Best effort contiguous VRAM allocation

2024-04-12 Thread Philip Yang
This patch series implements a new KFD memory alloc flag for best-effort
contiguous VRAM allocation, to support peer direct access RDMA devices
with limited scatter-gather DMA capability.

Philip Yang (6):
  drm/amdgpu: Support contiguous VRAM allocation
  drm/amdgpu: Evict BOs from same process for contiguous allocation
  drm/amdkfd: Evict BO itself for contiguous allocation
  drm/amdkfd: Increase KFD bo restore wait time
  drm/amdgpu: Skip dma map resource for null RDMA device
  drm/amdkfd: Bump kfd version for contiguous VRAM allocation

 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 20 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  | 42 ---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  2 +-
 include/uapi/drm/amdgpu_drm.h |  5 +++
 include/uapi/linux/kfd_ioctl.h|  4 +-
 6 files changed, 57 insertions(+), 19 deletions(-)

-- 
2.43.2



[PATCH] drm/amdgpu: Fix tlb_cb memory leaking

2024-04-08 Thread Philip Yang
After updating the GPU page table via CPU on a large-BAR system there is
no fence to attach a callback to, so call amdgpu_vm_tlb_seq_cb directly
after the command is committed to free tlb_cb.

memory leaking backtrace from kmemleakd:
  unreferenced object 0xa036816b00c0 (size 32):
  backtrace:
 __kmem_cache_alloc_node+0x3fe/0x4d0
 kmalloc_trace+0x2a/0xb0
 amdgpu_vm_update_range+0x9b/0x8d0 [amdgpu]
 amdgpu_vm_clear_freed+0xc1/0x210 [amdgpu]
 unmap_bo_from_gpuvm.isra.36+0x37/0x50 [amdgpu]
 amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu+0x118/0x1b0 [amdgpu]
 kfd_process_device_free_bos+0x7c/0xe0 [amdgpu]
 kfd_process_wq_release+0x273/0x3c0 [amdgpu]
 process_scheduled_works+0x2a7/0x500
 worker_thread+0x186/0x340

Fixes: 220ecde84bc8 ("drm/amdgpu: implement TLB flush fence")
Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 8af3f0fd3073..d0ef727cd7e1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -901,12 +901,9 @@ amdgpu_vm_tlb_flush(struct amdgpu_vm_update_params *params,
 {
struct amdgpu_vm *vm = params->vm;
 
-   if (!fence || !*fence)
-   return;
-
tlb_cb->vm = vm;
-	if (!dma_fence_add_callback(*fence, &tlb_cb->cb,
-				    amdgpu_vm_tlb_seq_cb)) {
+	if (fence && *fence &&
+	    !dma_fence_add_callback(*fence, &tlb_cb->cb, amdgpu_vm_tlb_seq_cb)) {
dma_fence_put(vm->last_tlb_flush);
vm->last_tlb_flush = dma_fence_get(*fence);
} else {
-- 
2.43.2
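
The shape of the fix, as a stand-alone model; the fence token, the
callback machinery and the function names here are stand-ins, and the
point is only that every path now consumes tlb_cb:

#include <stdbool.h>
#include <stdlib.h>

struct tlb_cb { void *vm; };

/* Stand-in for dma_fence_add_callback(): returns false when the
 * callback was attached successfully. */
static bool add_callback(void *fence, struct tlb_cb *cb)
{
	(void)cb;
	return fence == NULL;
}

static void tlb_seq_cb(struct tlb_cb *cb)
{
	free(cb);	/* this is where tlb_cb gets freed */
}

static void commit_flush(void *fence, struct tlb_cb *tlb_cb)
{
	if (fence && !add_callback(fence, tlb_cb)) {
		/* in the driver the callback runs when the fence
		 * signals; run it here so the model stays leak-free */
		tlb_seq_cb(tlb_cb);
	} else {
		/* CPU page-table update on a large-BAR system: no
		 * fence, so run the callback directly -- the path the
		 * fix adds */
		tlb_seq_cb(tlb_cb);
	}
}

int main(void)
{
	commit_flush(NULL, malloc(sizeof(struct tlb_cb)));	 /* CPU path */
	commit_flush((void *)1, malloc(sizeof(struct tlb_cb))); /* GPU path */
	return 0;
}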



Re: [PATCH 2/3] amd/amdgpu: wait no process running in kfd before resuming device

2024-03-26 Thread Philip Yang

  


On 2024-03-26 11:01, Felix Kuehling wrote:
> On 2024-03-26 10:53, Philip Yang wrote:
>> On 2024-03-25 14:45, Felix Kuehling wrote:
>>> On 2024-03-22 15:57, Zhigang Luo wrote:
>>>> it will cause page fault after device recovered if there is a
>>>> process running.
>>>>
>>>> Signed-off-by: Zhigang Luo
>>>> Change-Id: Ib1eddb56b69ecd41fe703abd169944154f48b0cd
>>>> ---
>>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++
>>>>  1 file changed, 2 insertions(+)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index 70261eb9b0bb..2867e9186e44 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -4974,6 +4974,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
>>>>  retry:
>>>>  	amdgpu_amdkfd_pre_reset(adev);
>>>> +	amdgpu_amdkfd_wait_no_process_running(adev);
>>>> +
>>>>  	amdgpu_device_stop_pending_resets(adev);
>>>>
>>>>  	if (from_hypervisor)
>>>
>>> This waits for the processes to be terminated. What would cause the
>>> processes to be terminated? Why do the processes need to be
>>> terminated? Isn't it enough if the processes are removed from the
>>> runlist in pre-reset, so they can no longer execute on the GPU?
>>
>> Mode 1 reset on SRIOV is much faster than on BM; kgd2kfd_pre_reset
>> sends the GPU reset event to user space and doesn't remove queues
>> from the runlist. After the mode 1 reset is done there are still
>> queues running, and they generate vm faults because the GPU page
>> table is gone.
>
> I think seeing a page fault during the reset is not a problem. Seeing
> a page fault after the reset would be a bug. The process should not
> be on the runlist after the reset is done.
>
> Waiting for the process to terminate first looks like a workaround,
> when the real bug is maybe that we're not updating the process state
> correctly in pre-reset. All currently running processes should be put
> into evicted state, so they are not put back on the runlist after the
> reset.

Forgot to mention it is an F/W hang issue that triggers the GPU reset;
there is also an error message when kgd2kfd_pre_reset ->
kgd2kfd_suspend evicts queues from the runlist. Yes, this seems a W/A
for the real issue related to mode 1 reset.

Regards,
Philip


Re: [PATCH 2/3] amd/amdgpu: wait no process running in kfd before resuming device

2024-03-26 Thread Philip Yang

  


On 2024-03-25 14:45, Felix Kuehling wrote:
> On 2024-03-22 15:57, Zhigang Luo wrote:
>> it will cause page fault after device recovered if there is a
>> process running.
>>
>> Signed-off-by: Zhigang Luo
>> Change-Id: Ib1eddb56b69ecd41fe703abd169944154f48b0cd
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++
>>  1 file changed, 2 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 70261eb9b0bb..2867e9186e44 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -4974,6 +4974,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
>>  retry:
>>  	amdgpu_amdkfd_pre_reset(adev);
>> +	amdgpu_amdkfd_wait_no_process_running(adev);
>> +
>>  	amdgpu_device_stop_pending_resets(adev);
>>
>>  	if (from_hypervisor)
>
> This waits for the processes to be terminated. What would cause the
> processes to be terminated? Why do the processes need to be
> terminated? Isn't it enough if the processes are removed from the
> runlist in pre-reset, so they can no longer execute on the GPU?

Mode 1 reset on SRIOV is much faster than on BM; kgd2kfd_pre_reset
sends the GPU reset event to user space and doesn't remove queues from
the runlist. After the mode 1 reset is done there are still queues
running, and they generate vm faults because the GPU page table is
gone.

Regards,
Philip



Re: [PATCH] drm/amdkfd: return negative error code in svm_ioctl()

2024-03-25 Thread Philip Yang

  


On 2024-03-25 02:31, Su Hui wrote:
> svm_ioctl() should return negative error code in default case.
>
> Fixes: 42de677f7999 ("drm/amdkfd: register svm range")
> Signed-off-by: Su Hui

Good catch, an ioctl should return -errno. I will apply it to drm-next.

Reviewed-by: Philip Yang

> ---
> Ps: When I try to compile this file, there is an error:
> drivers/gpu/drm/amd/amdkfd/kfd_migrate.c:28:10: fatal error:
> amdgpu_sync.h: No such file or directory.
>
> Maybe there are some steps I missed or this place need to be corrected?

I don't know how you compile the driver; amdgpu_sync.h is located under
the amdgpu folder, and amdkfd/Makefile is included from amdgpu/Makefile,
which sets ccflags-y -I correctly.

Regards,
Philip

>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index f0f7f48af413..41c376f3fd27 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -4147,7 +4147,7 @@ svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start,
>  		r = svm_range_get_attr(p, mm, start, size, nattrs, attrs);
>  		break;
>  	default:
> -		r = EINVAL;
> +		r = -EINVAL;
>  		break;
>  	}



[PATCH] drm/amdgpu: amdgpu_ttm_gart_bind set gtt bound flag

2024-03-11 Thread Philip Yang
Otherwise amdgpu_ttm_backend_unbind will not clear the GART page table
and will leave a valid mapping entry pointing to the stale system page.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 75c9fd2c6c2a..b0ed10f4de60 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -869,6 +869,7 @@ static void amdgpu_ttm_gart_bind(struct amdgpu_device *adev,
amdgpu_gart_bind(adev, gtt->offset, ttm->num_pages,
 gtt->ttm.dma_address, flags);
}
+   gtt->bound = true;
 }
 
 /*
-- 
2.35.1



Re: [PATCH v5 1/2] drm/amdgpu: implement TLB flush fence

2024-03-07 Thread Philip Yang

  


On 2024-03-06 09:41, Shashank Sharma wrote:


  From: Christian König 

The problem is that when (for example) 4k pages are replaced
with a single 2M page we need to wait for change to be flushed
out by invalidating the TLB before the PT can be freed.

Solve this by moving the TLB flush into a DMA-fence object which
can be used to delay the freeing of the PT BOs until it is signaled.

V2: (Shashank)
- rebase
- set dma_fence_error only in case of error
- add tlb_flush fence only when PT/PD BO is locked (Felix)
- use vm->pasid when f is NULL (Mukul)

V4: - add a wait for (f->dependency) in tlb_fence_work (Christian)
- move the misplaced fence_create call to the end (Philip)

V5: - free the f->dependency properly (Christian)

Cc: Christian Koenig 
Cc: Felix Kuehling 
Cc: Rajneesh Bhardwaj 
Cc: Alex Deucher 
Reviewed-by: Shashank Sharma 
Signed-off-by: Christian König 
Signed-off-by: Shashank Sharma 
---
 drivers/gpu/drm/amd/amdgpu/Makefile   |   3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c|  10 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h|   4 +
 .../gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c  | 112 ++
 4 files changed, 128 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
index fa26a4e3a99d..91ab4cf29b5b 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -70,7 +70,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \
 	amdgpu_cs.o amdgpu_bios.o amdgpu_benchmark.o \
 	atombios_dp.o amdgpu_afmt.o amdgpu_trace_points.o \
 	atombios_encoders.o amdgpu_sa.o atombios_i2c.o \
-	amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_ib.o amdgpu_pll.o \
+	amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_vm_tlb_fence.o \
+	amdgpu_ib.o amdgpu_pll.o \
 	amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
 	amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \
 	amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 0960e0a665d3..310aae6fb49b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -988,6 +988,15 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 
 	r = vm->update_funcs->commit(, fence);
 
+	/* Prepare a TLB flush fence to be attached to PTs */
+	if (!unlocked && params.needs_flush && vm->is_compute_context) {
+		amdgpu_vm_tlb_fence_create(adev, vm, fence);
+
+		/* Makes sure no PD/PT is freed before the flush */
+		dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence,
+   DMA_RESV_USAGE_BOOKKEEP);
+	}
+
 error_unlock:
 	amdgpu_vm_eviction_unlock(vm);
 	drm_dev_exit(idx);
@@ -2237,6 +2246,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 
 	mutex_init(>eviction_lock);
 	vm->evicting = false;
+	vm->tlb_fence_context = dma_fence_context_alloc(1);
 
 	r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level,
 false, , xcp_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 64b3f69efa57..298f604b8e5f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -341,6 +341,7 @@ struct amdgpu_vm {
 	atomic64_t		tlb_seq;
 	uint64_t		tlb_seq_va;
 	uint64_t		*tlb_seq_cpu_addr;
+	uint64_t		tlb_fence_context;
 
 	atomic64_t		kfd_last_flushed_seq;
 
@@ -594,5 +595,8 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
   uint64_t addr,
   uint32_t status,
   unsigned int vmhub);
+void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev,
+ struct amdgpu_vm *vm,
+ struct dma_fence **fence);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
new file mode 100644
index ..51cddfa3f1e8
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 

Re: [PATCH v4 2/2] drm/amdgpu: sync page table freeing with tlb flush

2024-03-01 Thread Philip Yang

  


On 2024-03-01 06:07, Shashank Sharma wrote:


  The idea behind this patch is to delay the freeing of PT entry objects
until the TLB flush is done.

This patch:
- Adds a tlb_flush_waitlist which will keep the objects that need to be
  freed after tlb_flush
- Adds PT entries in this list in amdgpu_vm_pt_free_dfs, instead of freeing
  them immediately.
- Exports function amdgpu_vm_pt_free to be called directly.
- Adds a 'force' input bool to amdgpu_vm_pt_free_dfs to differentiate
  between immediate freeing of the BOs (like from
  amdgpu_vm_pt_free_root) vs delayed freeing.

V2: rebase
V4: (Christian)
- add only locked PTEs entries in TLB flush waitlist.
- do not create a separate function for list flush.
- do not create a new lock for TLB flush.
- there is no need to wait on tlb_flush_fence exclusively.

Cc: Christian König 
Cc: Alex Deucher 
Cc: Felix Kuehling 
Cc: Rajneesh Bhardwaj 
Signed-off-by: Shashank Sharma 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c| 10 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h|  4 
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 21 ++---
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 310aae6fb49b..94581a1fe34f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -990,11 +990,20 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 
 	/* Prepare a TLB flush fence to be attached to PTs */
 	if (!unlocked && params.needs_flush && vm->is_compute_context) {
+		struct amdgpu_vm_bo_base *entry, *next;
+
 		amdgpu_vm_tlb_fence_create(adev, vm, fence);
 
 		/* Makes sure no PD/PT is freed before the flush */
 		dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence,
    DMA_RESV_USAGE_BOOKKEEP);
+
+		if (list_empty(>tlb_flush_waitlist))
+			goto error_unlock;
+
+		/* Now actually free the waitlist */
+		list_for_each_entry_safe(entry, next, >tlb_flush_waitlist, vm_status)
+			amdgpu_vm_pt_free(entry);
 	}
 
 error_unlock:
@@ -2214,6 +2223,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	INIT_LIST_HEAD(>pt_freed);
 	INIT_WORK(>pt_free_work, amdgpu_vm_pt_free_work);
 	INIT_KFIFO(vm->faults);
+	INIT_LIST_HEAD(>tlb_flush_waitlist);
 
 	r = amdgpu_seq64_alloc(adev, >tlb_seq_va, >tlb_seq_cpu_addr);
 	if (r)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 298f604b8e5f..ba374c2c61bd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -343,6 +343,9 @@ struct amdgpu_vm {
 	uint64_t		*tlb_seq_cpu_addr;
 	uint64_t		tlb_fence_context;
 
+	/* temporary storage of PT BOs until the TLB flush */
+	struct list_head	tlb_flush_waitlist;
+
 	atomic64_t		kfd_last_flushed_seq;
 
 	/* How many times we had to re-generate the page tables */
@@ -545,6 +548,7 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
 			  uint64_t start, uint64_t end,
 			  uint64_t dst, uint64_t flags);
 void amdgpu_vm_pt_free_work(struct work_struct *work);
+void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry);
 
 #if defined(CONFIG_DEBUG_FS)
 void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index 95dc0afdaffb..cb14e5686c0f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -636,7 +636,7 @@ static int amdgpu_vm_pt_alloc(struct amdgpu_device *adev,
  *
  * @entry: PDE to free
  */
-static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry)
+void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry)
 {
 	struct amdgpu_bo *shadow;
 
@@ -685,13 +685,15 @@ void amdgpu_vm_pt_free_work(struct work_struct *work)
  * @vm: amdgpu vm structure
  * @start: optional cursor where to start freeing PDs/PTs
  * @unlocked: vm resv unlock status
+ * @force: force free all PDs/PTs without waiting for TLB flush
  *
  * Free the page directory or page table level and all sub levels.
  */
 static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev,
   struct amdgpu_vm *vm,
   struct amdgpu_vm_pt_cursor *start,
-  bool unlocked)
+  bool unlocked,
+  bool force)
 {
 	struct amdgpu_vm_pt_cursor cursor;
 	struct amdgpu_vm_bo_base *entry;
@@ -708,11 +710,15 @@ static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev,
 		return;
 	}
 
-	for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
-		amdgpu_vm_pt_free(entry);

I feel like if we attach the TLB flush fence before freeing the PT BOs,
then we don't need the tlb_flush_waitlist.

Regards,
Philip


  
+	for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry) {
+		if (!force)
+			list_move(>vm_status, >tlb_flush_waitlist);
+		else
+			amdgpu_vm_pt_free(entry);
+	}
 
 	if 

Re: [PATCH v4 1/2] drm/amdgpu: implement TLB flush fence

2024-03-01 Thread Philip Yang

  


On 2024-03-01 06:07, Shashank Sharma wrote:


  From: Christian König 

The problem is that when (for example) 4k pages are replaced
with a single 2M page we need to wait for change to be flushed
out by invalidating the TLB before the PT can be freed.

Solve this by moving the TLB flush into a DMA-fence object which
can be used to delay the freeing of the PT BOs until it is signaled.

V2: (Shashank)
- rebase
- set dma_fence_error only in case of error
- add tlb_flush fence only when PT/PD BO is locked (Felix)
- use vm->pasid when f is NULL (Mukul)

V4: - add a wait for (f->dependency) in tlb_fence_work (Christian)
- move the misplaced fence_create call to the end (Philip)

Cc: Christian Koenig 
Cc: Felix Kuehling 
Cc: Rajneesh Bhardwaj 
Cc: Alex Deucher 
Signed-off-by: Christian König 
Signed-off-by: Shashank Sharma 
---
 drivers/gpu/drm/amd/amdgpu/Makefile   |   3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c|  10 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h|   4 +
 .../gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c  | 111 ++
 4 files changed, 127 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
index fa26a4e3a99d..91ab4cf29b5b 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -70,7 +70,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \
 	amdgpu_cs.o amdgpu_bios.o amdgpu_benchmark.o \
 	atombios_dp.o amdgpu_afmt.o amdgpu_trace_points.o \
 	atombios_encoders.o amdgpu_sa.o atombios_i2c.o \
-	amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_ib.o amdgpu_pll.o \
+	amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_vm_tlb_fence.o \
+	amdgpu_ib.o amdgpu_pll.o \
 	amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
 	amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \
 	amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 0960e0a665d3..310aae6fb49b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -988,6 +988,15 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 
 	r = vm->update_funcs->commit(, fence);
 
+	/* Prepare a TLB flush fence to be attached to PTs */
+	if (!unlocked && params.needs_flush && vm->is_compute_context) {
+		amdgpu_vm_tlb_fence_create(adev, vm, fence);
+
+		/* Makes sure no PD/PT is freed before the flush */
+		dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence,
+   DMA_RESV_USAGE_BOOKKEEP);
+	}
+

Adding the fence here seems too late; the fence has to be added before
calling amdgpu_vm_pt_free_dfs inside amdgpu_vm_ptes_update.

  
 error_unlock:
 	amdgpu_vm_eviction_unlock(vm);
 	drm_dev_exit(idx);
@@ -2237,6 +2246,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 
 	mutex_init(>eviction_lock);
 	vm->evicting = false;
+	vm->tlb_fence_context = dma_fence_context_alloc(1);
 
 	r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level,
 false, , xcp_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 64b3f69efa57..298f604b8e5f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -341,6 +341,7 @@ struct amdgpu_vm {
 	atomic64_t		tlb_seq;
 	uint64_t		tlb_seq_va;
 	uint64_t		*tlb_seq_cpu_addr;
+	uint64_t		tlb_fence_context;
 
 	atomic64_t		kfd_last_flushed_seq;
 
@@ -594,5 +595,8 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
   uint64_t addr,
   uint32_t status,
   unsigned int vmhub);
+void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev,
+ struct amdgpu_vm *vm,
+ struct dma_fence **fence);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
new file mode 100644
index ..54c33c24fa46
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ 

Re: [PATCH v3 3/3] drm/amdgpu: sync page table freeing with tlb flush

2024-02-26 Thread Philip Yang

  


On 2024-02-23 08:42, Shashank Sharma wrote:


  This patch:
- adds a new list in amdgou_vm to hold the VM PT entries being freed
- waits for the TLB flush using the vm->tlb_flush_fence
- actually frees the PT BOs

V2: rebase
V3: Do not attach the tlb_fence to the entries, rather add the entries
to a list and delay their freeing (Christian)

Cc: Christian König 
Cc: Alex Deucher 
Cc: Felix Kuehling 
Cc: Rajneesh Bhardwaj 
Signed-off-by: Shashank Sharma 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c|  6 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h|  6 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 51 ---
 3 files changed, 58 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 67c690044b97..eebb73f2c2ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -939,6 +939,10 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 		/* Makes sure no PD/PT is freed before the flush */
 		dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence,
    DMA_RESV_USAGE_BOOKKEEP);
+
+		mutex_lock(>tlb_fence_lock);
+		vm->tlb_fence_last = *fence;
+		mutex_unlock(>tlb_fence_lock);
 	}
 
 	amdgpu_res_first(pages_addr ? NULL : res, offset,
@@ -2212,6 +2216,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	INIT_LIST_HEAD(>freed);
 	INIT_LIST_HEAD(>done);
 	INIT_LIST_HEAD(>pt_freed);
+	INIT_LIST_HEAD(>tlb_flush_waitlist);
 	INIT_WORK(>pt_free_work, amdgpu_vm_pt_free_work);
 	INIT_KFIFO(vm->faults);
 
@@ -2244,6 +2249,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	vm->last_unlocked = dma_fence_get_stub();
 	vm->generation = 0;
 
+	mutex_init(>tlb_fence_lock);
 	mutex_init(>eviction_lock);
 	vm->evicting = false;
 	vm->tlb_fence_context = dma_fence_context_alloc(1);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 8e6fd25d07b7..77f10ed80973 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -334,6 +334,10 @@ struct amdgpu_vm {
 	uint64_t		*tlb_seq_cpu_addr;
 	uint64_t		tlb_fence_context;
 
+	struct mutex 		tlb_fence_lock;
+	struct dma_fence	*tlb_fence_last;
+	struct list_head	tlb_flush_waitlist;
+
 	atomic64_t		kfd_last_flushed_seq;
 
 	/* How many times we had to re-generate the page tables */
@@ -379,6 +383,8 @@ struct amdgpu_vm {
 
 	/* cached fault info */
 	struct amdgpu_vm_fault_info fault_info;
+
+	int count_bos;
 };
 
 struct amdgpu_vm_manager {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index 95dc0afdaffb..57ea95c5c085 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -643,13 +643,13 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry)
 	if (!entry->bo)
 		return;
 
-	entry->bo->vm_bo = NULL;
 	shadow = amdgpu_bo_shadowed(entry->bo);
 	if (shadow) {
 		ttm_bo_set_bulk_move(>tbo, NULL);
 		amdgpu_bo_unref();
 	}
 	ttm_bo_set_bulk_move(>bo->tbo, NULL);
+	entry->bo->vm_bo = NULL;
 
 	spin_lock(>vm->status_lock);
 	list_del(>vm_status);
@@ -657,6 +657,38 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry)
 	amdgpu_bo_unref(>bo);
 }
 
+static void amdgpu_vm_pt_flush_waitlist(struct amdgpu_vm *vm)
+{
+	struct amdgpu_vm_bo_base *entry, *next;
+	LIST_HEAD(tlb_flush_waitlist);
+
+	if (!vm || list_empty(>tlb_flush_waitlist))
+		return;
+
+	/* Wait for pending TLB flush before freeing PT BOs */
+	mutex_lock(>tlb_fence_lock);
+	if (vm->tlb_fence_last && !dma_fence_is_signaled(vm->tlb_fence_last)) {
+		if (dma_fence_wait_timeout(vm->tlb_fence_last, false,
+	   MAX_SCHEDULE_TIMEOUT) <= 0) {
+			DRM_ERROR("Timedout waiting for TLB flush, not freeing PT BOs\n");
+			mutex_unlock(>tlb_fence_lock);
+			return;
+		}
+
+		vm->tlb_fence_last = NULL;
+	}
+
+	/* Save the waitlist locally and reset the flushlist */
+	list_splice_init(>tlb_flush_waitlist, _flush_waitlist);
+	mutex_unlock(>tlb_fence_lock);
+
+	/* Now free the entries */
+	list_for_each_entry_safe(entry, next, _flush_waitlist, vm_status) {
+		if (entry)
+			amdgpu_vm_pt_free(entry);
+	}
+}
+
 void amdgpu_vm_pt_free_work(struct work_struct *work)
 {
 	struct amdgpu_vm_bo_base *entry, *next;
@@ -673,7 +705,7 @@ void amdgpu_vm_pt_free_work(struct work_struct *work)
 	amdgpu_bo_reserve(vm->root.bo, true);
 
 	list_for_each_entry_safe(entry, next, _freed, vm_status)
-		amdgpu_vm_pt_free(entry);
+		list_move(>vm_status, >tlb_flush_waitlist);
 
 	amdgpu_bo_unreserve(vm->root.bo);
 }
@@ -708,11 +740,17 @@ static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev,
 		return;
 	}
 
-	for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
-		amdgpu_vm_pt_free(entry);
+	mutex_lock(>tlb_fence_lock);
+
+	for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry) {
+		if 

Re: [PATCH v3 2/3] drm/amdgpu: implement TLB flush fence

2024-02-26 Thread Philip Yang

  


On 2024-02-23 11:58, Philip Yang wrote:


  
  
  
On 2024-02-23 08:42, Shashank Sharma wrote:
  
  
From: Christian König 

The problem is that when (for example) 4k pages are replaced
with a single 2M page we need to wait for change to be flushed
out by invalidating the TLB before the PT can be freed.

Solve this by moving the TLB flush into a DMA-fence object which
can be used to delay the freeing of the PT BOs until it is signaled.

V2: (Shashank)
- rebase
- set dma_fence_error only in case of error
- add tlb_flush fence only when PT/PD BO is locked (Felix)
- use vm->pasid when f is NULL (Mukul)

Cc: Christian Koenig 
Cc: Felix Kuehling 
Cc: Rajneesh Bhardwaj 
Cc: Alex Deucher 
Signed-off-by: Christian König 
Signed-off-by: Shashank Sharma 
---
 drivers/gpu/drm/amd/amdgpu/Makefile   |   3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c|  10 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h|   4 +
 .../gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c  | 106 ++
 4 files changed, 122 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
index 4c989da4d2f3..fdbb3d770c7b 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -70,7 +70,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \
 	amdgpu_cs.o amdgpu_bios.o amdgpu_benchmark.o \
 	atombios_dp.o amdgpu_afmt.o amdgpu_trace_points.o \
 	atombios_encoders.o amdgpu_sa.o atombios_i2c.o \
-	amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_ib.o amdgpu_pll.o \
+	amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_vm_tlb_fence.o \
+	amdgpu_ib.o amdgpu_pll.o \
 	amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
 	amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \
 	amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 0960e0a665d3..67c690044b97 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -932,6 +932,15 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	if (r)
 		goto error_unlock;
 
+	/* Prepare a TLB flush fence to be attached to PTs */
+	if (!unlocked && params.needs_flush && vm->is_compute_context) {
+		amdgpu_vm_tlb_fence_create(adev, vm, fence);
+
+		/* Makes sure no PD/PT is freed before the flush */
+		dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence,
+   DMA_RESV_USAGE_BOOKKEEP);
+	}
+
 	amdgpu_res_first(pages_addr ? NULL : res, offset,
 			 (last - start + 1) * AMDGPU_GPU_PAGE_SIZE, );
 	while (cursor.remaining) {
@@ -2237,6 +2246,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 
 	mutex_init(>eviction_lock);
 	vm->evicting = false;
+	vm->tlb_fence_context = dma_fence_context_alloc(1);
 
 	r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level,
 false, , xcp_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index ac9380afcb69..8e6fd25d07b7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -332,6 +332,7 @@ struct amdgpu_vm {
 	atomic64_t		tlb_seq;
 	uint64_t		tlb_seq_va;
 	uint64_t		*tlb_seq_cpu_addr;
+	uint64_t		tlb_fence_context;
 
 	atomic64_t		kfd_last_flushed_seq;
 
@@ -585,5 +586,8 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
   uint64_t addr,
   uint32_t status,
   unsigned int vmhub);
+void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev,
+ struct amdgpu_vm *vm,
+ struct dma_fence **fence);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
new file mode 100644
index ..569681badd7c
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILI

Re: [PATCH v3 2/3] drm/amdgpu: implement TLB flush fence

2024-02-23 Thread Philip Yang

  


On 2024-02-23 08:42, Shashank Sharma wrote:


  From: Christian König 

The problem is that when (for example) 4k pages are replaced
with a single 2M page we need to wait for change to be flushed
out by invalidating the TLB before the PT can be freed.

Solve this by moving the TLB flush into a DMA-fence object which
can be used to delay the freeing of the PT BOs until it is signaled.

V2: (Shashank)
- rebase
- set dma_fence_error only in case of error
- add tlb_flush fence only when PT/PD BO is locked (Felix)
- use vm->pasid when f is NULL (Mukul)

Cc: Christian Koenig 
Cc: Felix Kuehling 
Cc: Rajneesh Bhardwaj 
Cc: Alex Deucher 
Signed-off-by: Christian König 
Signed-off-by: Shashank Sharma 
---
 drivers/gpu/drm/amd/amdgpu/Makefile   |   3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c|  10 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h|   4 +
 .../gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c  | 106 ++
 4 files changed, 122 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
index 4c989da4d2f3..fdbb3d770c7b 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -70,7 +70,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \
 	amdgpu_cs.o amdgpu_bios.o amdgpu_benchmark.o \
 	atombios_dp.o amdgpu_afmt.o amdgpu_trace_points.o \
 	atombios_encoders.o amdgpu_sa.o atombios_i2c.o \
-	amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_ib.o amdgpu_pll.o \
+	amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_vm_tlb_fence.o \
+	amdgpu_ib.o amdgpu_pll.o \
 	amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
 	amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \
 	amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 0960e0a665d3..67c690044b97 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -932,6 +932,15 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	if (r)
 		goto error_unlock;
 
+	/* Prepare a TLB flush fence to be attached to PTs */
+	if (!unlocked && params.needs_flush && vm->is_compute_context) {
+		amdgpu_vm_tlb_fence_create(adev, vm, fence);
+
+		/* Makes sure no PD/PT is freed before the flush */
+		dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence,
+   DMA_RESV_USAGE_BOOKKEEP);
+	}
+
 	amdgpu_res_first(pages_addr ? NULL : res, offset,
 			 (last - start + 1) * AMDGPU_GPU_PAGE_SIZE, );
 	while (cursor.remaining) {
@@ -2237,6 +2246,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 
 	mutex_init(>eviction_lock);
 	vm->evicting = false;
+	vm->tlb_fence_context = dma_fence_context_alloc(1);
 
 	r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level,
 false, , xcp_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index ac9380afcb69..8e6fd25d07b7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -332,6 +332,7 @@ struct amdgpu_vm {
 	atomic64_t		tlb_seq;
 	uint64_t		tlb_seq_va;
 	uint64_t		*tlb_seq_cpu_addr;
+	uint64_t		tlb_fence_context;
 
 	atomic64_t		kfd_last_flushed_seq;
 
@@ -585,5 +586,8 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
   uint64_t addr,
   uint32_t status,
   unsigned int vmhub);
+void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev,
+ struct amdgpu_vm *vm,
+ struct dma_fence **fence);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
new file mode 100644
index ..569681badd7c
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES 

Re: [PATCH] drm/amdgpu: break COW for user ptr during fork()

2024-02-22 Thread Philip Yang

  


On 2024-02-21 21:01, Lang Yu wrote:


  This is useful to prevent copy-on-write semantics
from changing the physical location of a page if
the parent writes to it after a fork().

Signed-off-by: Lang Yu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 1 +
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c| 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 75c9fd2c6c2a..2ee0af3c41b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -693,6 +693,7 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages,
 	}
 
 	readonly = amdgpu_ttm_tt_is_readonly(ttm);
+	vm_flags_set(vma, VM_DONTCOPY);

This will break user mode because the forked child process cannot
access this vma/userptr.

This can be set by the application if needed, using
madvise(..., MADV_DONTFORK) to avoid COW after fork.

Regards,
Philip


  
 	r = amdgpu_hmm_range_get_pages(>notifier, start, ttm->num_pages,
    readonly, NULL, pages, range);
 out_unlock:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 6aa032731ddc..607a8f68f26f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1674,6 +1674,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
 			next = min(vma->vm_end, end);
 			npages = (next - addr) >> PAGE_SHIFT;
 			WRITE_ONCE(p->svms.faulting_task, current);
+			vm_flags_set(vma, VM_DONTCOPY);
 			r = amdgpu_hmm_range_get_pages(>notifier, addr, npages,
 		   readonly, owner, NULL,
 		   _range);
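
For reference, the user-space alternative suggested above is a one-line
madvise on the userptr range before fork(); a minimal sketch, where the
buffer and its size are placeholders for a range actually registered
with the GPU:

#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t size = 2 * 1024 * 1024;
	/* stand-in for the userptr buffer registered with the GPU */
	void *buf = aligned_alloc(sysconf(_SC_PAGESIZE), size);

	if (!buf)
		return 1;
	/* Child processes won't inherit this range, so fork() never
	 * breaks COW underneath the GPU mapping. */
	if (madvise(buf, size, MADV_DONTFORK))
		return 1;
	return 0;
}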


  



Re: [PATCH 1/2] drm/amdkfd: Document and define SVM event tracing macro

2024-02-20 Thread Philip Yang

  


On 2024-02-16 15:16, Felix Kuehling wrote:
> On 2024-02-15 10:18, Philip Yang wrote:
>> Document how to use the SMI system management interface to receive
>> SVM events.
>>
>> Define SVM event message string format macros that user mode can use
>> with sscanf to parse the events. Add them to the uAPI header file to
>> make it obvious that changing them changes the uAPI.
>>
>> No functional changes.
>>
>> Signed-off-by: Philip Yang
>> ---
>>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 51 +++---
>>  include/uapi/linux/kfd_ioctl.h              | 77 -
>>  2 files changed, 102 insertions(+), 26 deletions(-)
>>
>> [snip]
Re: [PATCH 1/2] drm/amdkfd: Document and define SVM event tracing macro

2024-02-15 Thread Philip Yang

  


On 2024-02-15 12:54, Chen, Xiaogang wrote:
> On 2/15/2024 9:18 AM, Philip Yang wrote:
>> Document how to use the SMI system management interface to receive
>> SVM events.
>>
>> Define SVM event message string format macros that user mode can use
>> with sscanf to parse the events. Add them to the uAPI header file to
>> make it obvious that changing them changes the uAPI.
>>
>> No functional changes.
>>
>> Signed-off-by: Philip Yang
>> ---
>>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 51 +++---
>>  include/uapi/linux/kfd_ioctl.h              | 77 -
>>  2 files changed, 102 insertions(+), 26 deletions(-)
>>
>> [snip]
[PATCH 2/2] drm/amdkfd: Output migrate end event if migration failed

2024-02-15 Thread Philip Yang
To track the migrate end event in case of a migration failure, always
output the migrate end event, with the failure result added to the
existing migrate end event string.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c| 16 
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c |  5 +++--
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h |  2 +-
 include/uapi/linux/kfd_ioctl.h  |  7 ---
 4 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 480e222364d5..23cf9484331e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -445,15 +445,15 @@ svm_migrate_vma_to_vram(struct kfd_node *node, struct svm_range *prange,
pr_debug("successful/cpages/npages 0x%lx/0x%lx/0x%lx\n",
 mpages, cpages, migrate.npages);
 
-   kfd_smi_event_migration_end(node, p->lead_thread->pid,
-   start >> PAGE_SHIFT, end >> PAGE_SHIFT,
-   0, node->id, trigger);
-
svm_range_dma_unmap_dev(adev->dev, scratch, 0, npages);
 
 out_free:
kvfree(buf);
 out:
+   kfd_smi_event_migration_end(node, p->lead_thread->pid,
+   start >> PAGE_SHIFT, end >> PAGE_SHIFT,
+   0, node->id, trigger, r);
+
if (!r && mpages) {
pdd = svm_range_get_pdd_by_node(prange, node);
if (pdd)
@@ -737,15 +737,15 @@ svm_migrate_vma_to_ram(struct kfd_node *node, struct svm_range *prange,
svm_migrate_copy_done(adev, mfence);
	migrate_vma_finalize(&migrate);
 
-   kfd_smi_event_migration_end(node, p->lead_thread->pid,
-   start >> PAGE_SHIFT, end >> PAGE_SHIFT,
-   node->id, 0, trigger);
-
svm_range_dma_unmap_dev(adev->dev, scratch, 0, npages);
 
 out_free:
kvfree(buf);
 out:
+   kfd_smi_event_migration_end(node, p->lead_thread->pid,
+   start >> PAGE_SHIFT, end >> PAGE_SHIFT,
+   node->id, 0, trigger, r);
+
if (!r && cpages) {
mpages = cpages - upages;
pdd = svm_range_get_pdd_by_node(prange, node);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index 85465eb303a9..d1a567f8a8d9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -282,11 +282,12 @@ void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid,
 
 void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid,
 unsigned long start, unsigned long end,
-uint32_t from, uint32_t to, uint32_t trigger)
+uint32_t from, uint32_t to, uint32_t trigger,
+int result)
 {
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END,
 			  KFD_EVENT_FMT_MIGRATE_END(ktime_get_boottime_ns(), pid,
-			  start, end - start, from, to, trigger));
+			  start, end - start, from, to, trigger, result));
 }
 
 void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
index fa95c2dfd587..6c99eaa39f09 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
@@ -41,7 +41,7 @@ void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid,
 uint32_t trigger);
 void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid,
 unsigned long start, unsigned long end,
-uint32_t from, uint32_t to, uint32_t trigger);
+uint32_t from, uint32_t to, uint32_t trigger, int 
r);
 void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid,
  uint32_t trigger);
 void kfd_smi_event_queue_restore(struct kfd_node *node, pid_t pid);
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 430c01f4148b..5220670a434d 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -601,6 +601,7 @@ struct kfd_ioctl_smi_events_args {
  *migrate_update: the GPU page is recovered by 'M' for migrate, 'U' for update
  *rescheduled: 'R' if the queue restore failed and rescheduled to try again
  *rw: 'W' for write page fault, 'R' for read page fault
+ *result: page migrate result, 0 for success, otherwise error code
  */
 #define KFD_EVENT_FMT_UPDATE_GP

[PATCH 1/2] drm/amdkfd: Document and define SVM event tracing macro

2024-02-15 Thread Philip Yang
Document how to use the SMI system management interface to receive SVM
events.

Define SVM event message string format macros that user mode can use
with sscanf to parse the events. Add them to the uAPI header file to
make it obvious that changing them changes the uAPI.

No functional changes.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 51 +++---
 include/uapi/linux/kfd_ioctl.h  | 77 -
 2 files changed, 102 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index d9953c2b2661..85465eb303a9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -225,15 +225,16 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset)
event = KFD_SMI_EVENT_GPU_PRE_RESET;
++(dev->reset_seq_num);
}
-   kfd_smi_event_add(0, dev, event, "%x\n", dev->reset_seq_num);
+   kfd_smi_event_add(0, dev, event,
+ KFD_EVENT_FMT_UPDATE_GPU_RESET(dev->reset_seq_num));
 }
 
 void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
 uint64_t throttle_bitmask)
 {
-   kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, "%llx:%llx\n",
- throttle_bitmask,
- amdgpu_dpm_get_thermal_throttling_counter(dev->adev));
+   kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE,
+ 
KFD_EVENT_FMT_UPDATE_THERMAL_THROTTLING(throttle_bitmask,
+ 
amdgpu_dpm_get_thermal_throttling_counter(dev->adev)));
 }
 
 void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid)
@@ -246,8 +247,8 @@ void kfd_smi_event_update_vmfault(struct kfd_node *dev, 
uint16_t pasid)
if (!task_info.pid)
return;
 
-   kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n",
- task_info.pid, task_info.task_name);
+   kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT,
+ KFD_EVENT_FMT_VMFAULT(task_info.pid, 
task_info.task_name));
 }
 
 void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid,
@@ -255,16 +256,16 @@ void kfd_smi_event_page_fault_start(struct kfd_node 
*node, pid_t pid,
ktime_t ts)
 {
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_START,
- "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid,
- address, node->id, write_fault ? 'W' : 'R');
+ KFD_EVENT_FMT_PAGEFAULT_START(ktime_to_ns(ts), pid,
+ address, node->id, write_fault ? 'W' : 'R'));
 }
 
 void kfd_smi_event_page_fault_end(struct kfd_node *node, pid_t pid,
  unsigned long address, bool migration)
 {
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_END,
- "%lld -%d @%lx(%x) %c\n", ktime_get_boottime_ns(),
- pid, address, node->id, migration ? 'M' : 'U');
+ KFD_EVENT_FMT_PAGEFAULT_END(ktime_get_boottime_ns(),
+ pid, address, node->id, migration ? 'M' : 'U'));
 }
 
 void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid,
@@ -274,9 +275,9 @@ void kfd_smi_event_migration_start(struct kfd_node *node, 
pid_t pid,
   uint32_t trigger)
 {
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_START,
- "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n",
- ktime_get_boottime_ns(), pid, start, end - start,
- from, to, prefetch_loc, preferred_loc, trigger);
+ KFD_EVENT_FMT_MIGRATE_START(ktime_get_boottime_ns(),
+ pid, start, end - start, from, to, prefetch_loc,
+ preferred_loc, trigger));
 }
 
 void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid,
@@ -284,24 +285,23 @@ void kfd_smi_event_migration_end(struct kfd_node *node, 
pid_t pid,
 uint32_t from, uint32_t to, uint32_t trigger)
 {
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END,
- "%lld -%d @%lx(%lx) %x->%x %d\n",
- ktime_get_boottime_ns(), pid, start, end - start,
- from, to, trigger);
+ KFD_EVENT_FMT_MIGRATE_END(ktime_get_boottime_ns(), 
pid,
+ start, end - start, from, to, trigger));
 }
 
 void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid,
  uint32_t trigger)
 {
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_EVICTION,
-  

[PATCH 2/2] drm/amdgpu: Improve huge page mapping update

2024-02-01 Thread Philip Yang
Updating a huge page mapping, e.g. one that is 2MB address and size
aligned, currently allocates a PTB bo and then frees the PTB bo after
updating PDE0 as PTE.

If the fragment size >= parent_shift, don't allocate the PT bo, because
we will update the PDE entry instead. This improves the huge page
mapping update by removing the extra PTB bo alloc and free.
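
As a side note, a standalone sketch of the condition this adds (names
are illustrative, not driver code; the shift values assume the usual
9-bit page table block):

#include <stdbool.h>
#include <stdio.h>

/* Allocate a PTB only when the update cannot be written as a huge-page
 * PDE: pre-GFX9 ASICs (< CHIP_VEGA10) have no huge-page PDE0 support,
 * and a fragment smaller than the parent entry still needs PTEs. */
static bool need_pt_alloc(bool unlocked, bool gfx9_plus,
			  unsigned int frag, unsigned int parent_shift)
{
	return !unlocked && (!gfx9_plus || frag < parent_shift);
}

int main(void)
{
	/* 2MB aligned mapping: fragment 9 (4KB << 9 = 2MB) covers a whole
	 * PDE0 entry (parent_shift 9), so the PTB bo alloc is skipped. */
	printf("alloc PTB: %s\n",
	       need_pt_alloc(false, true, 9, 9) ? "yes" : "no");
	return 0;
}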

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index a3d609655ce3..ef3ef03e50ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -916,7 +916,11 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params 
*params,
uint64_t incr, entry_end, pe_start;
struct amdgpu_bo *pt;
 
-   if (!params->unlocked) {
+   shift = amdgpu_vm_pt_level_shift(adev, cursor.level);
+   parent_shift = amdgpu_vm_pt_level_shift(adev, cursor.level - 1);
+
+   if (!params->unlocked &&
+   (adev->asic_type < CHIP_VEGA10 || frag < parent_shift)) {
/* make sure that the page tables covering the
 * address range are actually allocated
 */
@@ -926,8 +930,6 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params 
*params,
return r;
}
 
-   shift = amdgpu_vm_pt_level_shift(adev, cursor.level);
-   parent_shift = amdgpu_vm_pt_level_shift(adev, cursor.level - 1);
if (params->unlocked) {
/* Unlocked updates are only allowed on the leaves */
if (amdgpu_vm_pt_descendant(adev, &cursor))
-- 
2.35.1



[PATCH 1/2] drm/amdgpu: Unmap only clear the page table leaves

2024-02-01 Thread Philip Yang
SVM migration unmaps pages from the GPU and then updates the GPU
mapping to recover the page fault. Currently unmap clears the PDE entry
for range length >= huge page and frees the PTB bo, and the mapping
update allocates a new PT bo. There is a race: the freed entry bo may
still be on the pt_free list, get reused while updating the mapping and
then freed, leaving an invalid PDE entry that causes a GPU page fault.

Set the update to clear only one PDE entry or clear the PTB, so that
unmap does not free the PTE bo. This fixes the race and improves the
unmap and map-to-GPU performance. Updating the mapping to a huge page
will still free the PTB bo.

With this change, the vm->pt_freed list and work are not needed. Add
WARN_ON(unlocked) in amdgpu_vm_pt_free_dfs to catch unmap trying to
free a PTB.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c|  4 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h|  4 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 43 ++-
 3 files changed, 10 insertions(+), 41 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 82e5fd66a10d..3bde77dfc63f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2256,8 +2256,6 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
amdgpu_vm *vm,
spin_lock_init(&vm->status_lock);
INIT_LIST_HEAD(&vm->freed);
INIT_LIST_HEAD(&vm->done);
-   INIT_LIST_HEAD(&vm->pt_freed);
-   INIT_WORK(&vm->pt_free_work, amdgpu_vm_pt_free_work);
INIT_KFIFO(vm->faults);
 
r = amdgpu_vm_init_entities(adev, vm);
@@ -2446,8 +2444,6 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct 
amdgpu_vm *vm)
 
amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm);
 
-   flush_work(&vm->pt_free_work);
-
root = amdgpu_bo_ref(vm->root.bo);
amdgpu_bo_reserve(root, true);
amdgpu_vm_set_pasid(adev, vm, 0);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index cdb61f1e7c35..74fe211b9ecd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -316,10 +316,6 @@ struct amdgpu_vm {
/* BOs which are invalidated, has been updated in the PTs */
struct list_headdone;
 
-   /* PT BOs scheduled to free and fill with zero if vm_resv is not hold */
-   struct list_headpt_freed;
-   struct work_struct  pt_free_work;
-
/* contains the page directory */
struct amdgpu_vm_bo_base root;
struct dma_fence*last_update;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index a160265ddc07..a3d609655ce3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -657,27 +657,6 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base 
*entry)
amdgpu_bo_unref(&entry->bo);
 }
 
-void amdgpu_vm_pt_free_work(struct work_struct *work)
-{
-   struct amdgpu_vm_bo_base *entry, *next;
-   struct amdgpu_vm *vm;
-   LIST_HEAD(pt_freed);
-
-   vm = container_of(work, struct amdgpu_vm, pt_free_work);
-
-   spin_lock(&vm->status_lock);
-   list_splice_init(&vm->pt_freed, &pt_freed);
-   spin_unlock(&vm->status_lock);
-
-   /* flush_work in amdgpu_vm_fini ensure vm->root.bo is valid. */
-   amdgpu_bo_reserve(vm->root.bo, true);
-
-   list_for_each_entry_safe(entry, next, &pt_freed, vm_status)
-   amdgpu_vm_pt_free(entry);
-
-   amdgpu_bo_unreserve(vm->root.bo);
-}
-
 /**
  * amdgpu_vm_pt_free_dfs - free PD/PT levels
  *
@@ -696,17 +675,7 @@ static void amdgpu_vm_pt_free_dfs(struct amdgpu_device 
*adev,
struct amdgpu_vm_pt_cursor cursor;
struct amdgpu_vm_bo_base *entry;
 
-   if (unlocked) {
-   spin_lock(&vm->status_lock);
-   for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
-   list_move(&entry->vm_status, &vm->pt_freed);
-
-   if (start)
-   list_move(&start->entry->vm_status, &vm->pt_freed);
-   spin_unlock(&vm->status_lock);
-   schedule_work(&vm->pt_free_work);
-   return;
-   }
+   WARN_ON(unlocked);
 
for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
amdgpu_vm_pt_free(entry);
@@ -1009,7 +978,15 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params 
*params,
incr = (uint64_t)AMDGPU_GPU_PAGE_SIZE << shift;
mask = amdgpu_vm_pt_entries_mask(adev, cursor.level);
pe_start = ((cursor.pfn >> shift) & mask) * 8;
-   entry_end = ((uint64_t)mask + 1) << shift;
+
+   if (cursor.level < AMDGPU_VM_PTB && params->unlocked)
+   /*
+* Unmap to clear o

Re: [PATCH] drm/amdgpu: Support >=4GB GTT memory mapping

2024-01-29 Thread Philip Yang

  


On 2024-01-29 11:30, Christian König wrote:
> On 2024-01-29 17:25, Philip Yang wrote:
>> On 2024-01-29 05:06, Christian König wrote:
>>> On 2024-01-26 20:47, Philip Yang wrote:
>>>> This is to work around a bug in function drm_prime_pages_to_sg when
>>>> the pages covered by nr_pages are >= 4GB in total, by doing the
>>>> same check for max_segment and then calling
>>>> sg_alloc_table_from_pages_segment directly instead.
>>>>
>>>> This issue shows up on APUs because VRAM is allocated as GTT
>>>> memory. It also fixes >=4GB GTT memory mapping for mGPUs with
>>>> IOMMU isolation mode.
>>>
>>> Well that was talked about before and rejected. If we really want
>>> more than 4GiB in DMA-bufs we need to fix drm_prime_pages_to_sg()
>>> instead.
>>
>> I sent a patch to fix drm_prime_pages_to_sg but the patch was
>> rejected.
>
> Why was that rejected? If this isn't something we want for DRM we
> probably don't want it for AMDGPU either.

The reason is the same as your concern: to check whether we want more
than 4GB dmabuf support, which may need fixes for other drm functions.

I am not familiar with the drm layer; amdgpu needs more than 4GB dmabuf
on mGPU APUs. Do you want me to resend that drm patch to fix only the
drm_prime_pages_to_sg function?

Anything like this will cause size to become 0 if nr_pages covers more
than 4GB:

    unsigned int nr_pages;
    unsigned long size = nr_pages << PAGE_SHIFT;

Regards,
Philip

>> This issue happens on APU, as VRAM is allocated as GTT memory; we get
>> to this patch only if IOMMU is in isolation mode. With IOMMU off or
>> in pt mode, multiple GPUs share the same dma mapping.
>>
>> Even with the fix patch accepted by drm, we still need this patch to
>> work around the issue on old kernel versions.
>
> Yeah, but that's then just a function fixup for our backporting team
> and shouldn't be worked around like this.
>
> Regards,
> Christian.

Re: [PATCH] drm/amdgpu: Support >=4GB GTT memory mapping

2024-01-29 Thread Philip Yang

  


On 2024-01-29 05:06, Christian König wrote:
> On 2024-01-26 20:47, Philip Yang wrote:
>> This is to work around a bug in function drm_prime_pages_to_sg when
>> the pages covered by nr_pages are >= 4GB in total, by doing the same
>> check for max_segment and then calling
>> sg_alloc_table_from_pages_segment directly instead.
>>
>> This issue shows up on APUs because VRAM is allocated as GTT memory.
>> It also fixes >=4GB GTT memory mapping for mGPUs with IOMMU isolation
>> mode.
>
> Well that was talked about before and rejected. If we really want more
> than 4GiB in DMA-bufs we need to fix drm_prime_pages_to_sg() instead.

I sent a patch to fix drm_prime_pages_to_sg but the patch was rejected.

This issue happens on APU, as VRAM is allocated as GTT memory; we get to
this patch only if IOMMU is in isolation mode. With IOMMU off or in pt
mode, multiple GPUs share the same dma mapping.

Even with the fix patch accepted by drm, we still need this patch to
work around the issue on old kernel versions.

Regards,
Philip

> Regards,
> Christian.
  

[PATCH] drm/amdgpu: Support >=4GB GTT memory mapping

2024-01-26 Thread Philip Yang
This is to work around a bug in function drm_prime_pages_to_sg when the
pages covered by nr_pages are >= 4GB in total, by doing the same check
for max_segment and then calling sg_alloc_table_from_pages_segment
directly instead.

This issue shows up on APUs because VRAM is allocated as GTT memory. It
also fixes >=4GB GTT memory mapping for mGPUs with IOMMU isolation mode.
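
For reference, a minimal standalone demo of the overflow being worked
around: with a 32-bit nr_pages the shift is evaluated in 32 bits before
any widening, so the computed size wraps to 0 at 4GB.

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned int nr_pages = 1u << 20;	/* 1M pages * 4KB = 4GB */

	/* shift happens in 32 bits and wraps before the assignment widens */
	unsigned long bad = nr_pages << PAGE_SHIFT;
	/* widen first, as the patch does by using a u64 num_pages */
	unsigned long good = (unsigned long)nr_pages << PAGE_SHIFT;

	printf("bad=%lu good=%lu\n", bad, good);	/* bad=0 */
	return 0;
}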

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 50 ++---
 1 file changed, 34 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
index 055ba2ea4c12..a203633fd629 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
@@ -171,18 +171,41 @@ static struct sg_table *amdgpu_dma_buf_map(struct 
dma_buf_attachment *attach,
}
 
switch (bo->tbo.resource->mem_type) {
-   case TTM_PL_TT:
-   sgt = drm_prime_pages_to_sg(obj->dev,
-   bo->tbo.ttm->pages,
-   bo->tbo.ttm->num_pages);
-   if (IS_ERR(sgt))
-   return sgt;
-
-   if (dma_map_sgtable(attach->dev, sgt, dir,
-   DMA_ATTR_SKIP_CPU_SYNC))
-   goto error_free;
-   break;
+   case TTM_PL_TT: {
+   size_t max_segment = 0;
+   u64 num_pages;
+   int err;
+
+   sgt = kmalloc(sizeof(*sgt), GFP_KERNEL);
+   if (!sgt)
+   return ERR_PTR(-ENOMEM);
+
+   if (obj->dev)
+   max_segment = dma_max_mapping_size(obj->dev->dev);
+   if (max_segment == 0)
+   max_segment = UINT_MAX;
+
+   /*
+* Use u64, otherwise if length of num_pages >= 4GB then size
+* (num_pages << PAGE_SHIFT) becomes 0
+*/
+   num_pages = bo->tbo.ttm->num_pages;
+   err = sg_alloc_table_from_pages_segment(sgt, bo->tbo.ttm->pages,
+   num_pages, 0,
+   num_pages << PAGE_SHIFT,
+   max_segment, 
GFP_KERNEL);
+   if (err) {
+   kfree(sgt);
+   return ERR_PTR(err);
+   }
 
+   if (dma_map_sgtable(attach->dev, sgt, dir, 
DMA_ATTR_SKIP_CPU_SYNC)) {
+   sg_free_table(sgt);
+   kfree(sgt);
+   return ERR_PTR(-EBUSY);
+   }
+   break;
+   }
case TTM_PL_VRAM:
r = amdgpu_vram_mgr_alloc_sgt(adev, bo->tbo.resource, 0,
  bo->tbo.base.size, attach->dev,
@@ -195,11 +218,6 @@ static struct sg_table *amdgpu_dma_buf_map(struct 
dma_buf_attachment *attach,
}
 
return sgt;
-
-error_free:
-   sg_free_table(sgt);
-   kfree(sgt);
-   return ERR_PTR(-EBUSY);
 }
 
 /**
-- 
2.35.1



Re: [PATCH] drm/amdgpu: Limit the maximum fragment to granularity size

2024-01-26 Thread Philip Yang

  


On 2024-01-26 10:35, Christian König wrote:
> On 2024-01-26 16:17, Philip Yang wrote:
>> On 2024-01-26 09:59, Christian König wrote:
>>> On 2024-01-26 15:38, Philip Yang wrote:
>>>> svm range supports partial migration and mapping update. For a 4MB
>>>> size, 4MB aligned virtual address range with physically continuous
>>>> pages mapped to GPU with fs=10, after updating the mapping of the
>>>> first 2MB, if the second 2MB mapping with fs=10 is still cached in
>>>> the TLB, an access to the first 2MB hits the stale mapping.
>>>>
>>>> Limit the maximum fragment size to the granularity size, 2MB by
>>>> default, with mapping and unmapping based on granularity size, to
>>>> solve this issue.
>>>>
>>>> The change is only for SVM map/unmap range, no change for gfx and
>>>> legacy API path.
>>>
>>> Well that sounds fishy. When that happens with (for example) 4MiB
>>> and 2MiB, why doesn't it happen with 8KiB and 4KiB as well?
>>
>> unmap svm range is aligned to granularity size; if the range size is
>> 8KB (all within one 2MB granularity range), it will be
>> mapped/unmapped as 8KB, even if only 4KB is migrated. This is handled
>> in another patch series "amd/amdkfd: Unmap range from GPU based on
>> granularity".
>
> Ok that makes a bit more sense.
>
> But when you have a linear 4MiB mapping and unmap the first 2MiB of it
> you need to flush the TLB anyway.
>
> So why would that cause a stale access?

Yes, unmap does flush the TLB. The issue happens when the GPU accesses
the second 2MB, loads the fs=10 entry into the TLB, and then accesses
the first 2MB.

Originally I thought this could be fixed by using granularity-aligned
address and size to map/unmap to GPU; after debugging, I realized we
still need to limit the max fragment size. We could change this in the
svm map function, but it is more efficient to pass the max fragment
size down to the GPU page table update level.

Regards,
Philip

> Regards,
> Christian.
  


Re: [PATCH] drm/amdgpu: Limit the maximum fragment to granularity size

2024-01-26 Thread Philip Yang

  


On 2024-01-26 09:59, Christian König wrote:
> On 2024-01-26 15:38, Philip Yang wrote:
>> svm range supports partial migration and mapping update. For a 4MB
>> size, 4MB aligned virtual address range with physically continuous
>> pages mapped to GPU with fs=10, after updating the mapping of the
>> first 2MB, if the second 2MB mapping with fs=10 is still cached in
>> the TLB, an access to the first 2MB hits the stale mapping.
>>
>> Limit the maximum fragment size to the granularity size, 2MB by
>> default, with mapping and unmapping based on granularity size, to
>> solve this issue.
>>
>> The change is only for SVM map/unmap range, no change for gfx and
>> legacy API path.
>
> Well that sounds fishy. When that happens with (for example) 4MiB and
> 2MiB, why doesn't it happen with 8KiB and 4KiB as well?

unmap svm range is aligned to granularity size; if the range size is
8KB (all within one 2MB granularity range), it will be mapped/unmapped
as 8KB, even if only 4KB is migrated. This is handled in another patch
series "amd/amdkfd: Unmap range from GPU based on granularity".

Regards,
Philip

> Christian.
  


[PATCH] drm/amdgpu: Limit the maximum fragment to granularity size

2024-01-26 Thread Philip Yang
svm range supports partial migration and mapping update. For a 4MB
size, 4MB aligned virtual address range with physically continuous
pages mapped to GPU with fs=10, after updating the mapping of the first
2MB, if the second 2MB mapping with fs=10 is still cached in the TLB,
an access to the first 2MB hits the stale mapping.

Limit the maximum fragment size to the granularity size, 2MB by
default, with mapping and unmapping based on granularity size, to solve
this issue.

The change is only for SVM map/unmap range, no change for gfx and
legacy API path.
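
A small worked example of the numbers in the commit message; this is an
illustration of the arithmetic, not driver code:

#include <stdio.h>

#define GPU_PAGE_SHIFT 12	/* 4KB GPU pages */

int main(void)
{
	unsigned int fs = 10;	/* fragment field in the PTE */
	unsigned long frag_size = 1UL << (GPU_PAGE_SHIFT + fs);	/* 4MB */
	unsigned long granularity = 2UL << 20;			/* 2MB */

	printf("fragment %luMB, granularity %luMB\n",
	       frag_size >> 20, granularity >> 20);

	/* A 4MB fragment spans two 2MB granules: a TLB entry loaded via
	 * the second granule still translates the first, so remapping only
	 * the first granule can be served from the stale cached entry.
	 * Capping fs at log2(granularity / page size) = 9 keeps fragments
	 * within one granule. */
	return 0;
}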

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c| 12 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h|  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 22 --
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c  |  9 +
 4 files changed, 26 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index ed4a8c5d26d7..a2bef94cb959 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -897,6 +897,7 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence,
  * @res: ttm_resource to map
  * @pages_addr: DMA addresses to use for mapping
  * @fence: optional resulting fence
+ * @frag_size: max map fragment size
  *
  * Fill in the page table entries between @start and @last.
  *
@@ -908,7 +909,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, 
struct amdgpu_vm *vm,
   struct dma_resv *resv, uint64_t start, uint64_t last,
   uint64_t flags, uint64_t offset, uint64_t vram_base,
   struct ttm_resource *res, dma_addr_t *pages_addr,
-  struct dma_fence **fence)
+  struct dma_fence **fence, unsigned int frag_size)
 {
struct amdgpu_vm_update_params params;
struct amdgpu_vm_tlb_seq_struct *tlb_cb;
@@ -1016,7 +1017,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, 
struct amdgpu_vm *vm,
}
 
tmp = start + num_entries;
-   r = amdgpu_vm_ptes_update(&params, start, tmp, addr, flags);
+   r = amdgpu_vm_ptes_update(&params, start, tmp, addr, flags, 
frag_size);
if (r)
goto error_free;
 
@@ -1197,7 +1198,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, 
struct amdgpu_bo_va *bo_va,
   !uncached, resv, mapping->start, 
mapping->last,
   update_flags, mapping->offset,
   vram_base, mem, pages_addr,
-  last_update);
+  last_update, 0);
if (r)
return r;
}
@@ -1392,7 +1393,7 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
r = amdgpu_vm_update_range(adev, vm, false, false, true, false,
   resv, mapping->start, mapping->last,
   init_pte_value, 0, 0, NULL, NULL,
-  &f);
+  &f, 0);
amdgpu_vm_free_mapping(adev, vm, mapping, f);
if (r) {
dma_fence_put(f);
@@ -2733,7 +2734,8 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, 
u32 pasid,
}
 
r = amdgpu_vm_update_range(adev, vm, true, false, false, false,
-  NULL, addr, addr, flags, value, 0, NULL, 
NULL, NULL);
+  NULL, addr, addr, flags, value, 0, NULL, 
NULL,
+  NULL, 0);
if (r)
goto error_unlock;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 98a57192..b34466b5086f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -465,7 +465,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, 
struct amdgpu_vm *vm,
   struct dma_resv *resv, uint64_t start, uint64_t last,
   uint64_t flags, uint64_t offset, uint64_t vram_base,
   struct ttm_resource *res, dma_addr_t *pages_addr,
-  struct dma_fence **fence);
+  struct dma_fence **fence, unsigned int frag_size);
 int amdgpu_vm_bo_update(struct amdgpu_device *adev,
struct amdgpu_bo_va *bo_va,
bool clear);
@@ -531,7 +531,7 @@ int amdgpu_vm_pde_update(struct amdgpu_vm_update_params 
*params,
 struct amdgpu_vm_bo_base *entry);
 int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
  uint64_t start, uint64_t end,
-  

[PATCH v4 7/7] drm/amdkfd: Wait update sdma fence before tlb flush

2024-01-15 Thread Philip Yang
If sdma is used to update the GPU page table, kfd flush tlb does
nothing if the vm update fence callback doesn't update vm->tlb_seq.
This works today only because a retry fault will come, update the page
table again and finally flush the tlb.

With the bitmap_map flag, the retry fault recovery will update the GPU
page table only once, so we have to wait for the sdma update fence and
then flush the tlb.

No change if the CPU updates the GPU page table (large bar), because
there is no vm update fence.

Remove the wait parameter in svm_range_validate_and_map because it is
now always called with true.
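
A toy model of the ordering this enforces; the sequence counter scheme
here only loosely mirrors vm->tlb_seq and all names are illustrative:

#include <stdio.h>

static unsigned long tlb_seq, flushed_seq;

static void flush_tlb_if_needed(void)
{
	if (flushed_seq == tlb_seq) {
		printf("flush skipped, seq %lu unchanged\n", tlb_seq);
		return;
	}
	flushed_seq = tlb_seq;
	printf("flushed at seq %lu\n", tlb_seq);
}

int main(void)
{
	flush_tlb_if_needed();	/* sdma update fence not signaled: no-op */
	tlb_seq++;		/* fence callback bumps the sequence */
	flush_tlb_if_needed();	/* after waiting for the fence, flush works */
	return 0;
}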

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 17 -
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index b36d997e7a3d..9e5f6e12c498 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1677,7 +1677,7 @@ svm_range_map_to_gpu(struct kfd_process_device *pdd, 
struct svm_range *prange,
 static int
 svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset,
  unsigned long npages, bool readonly,
- unsigned long *bitmap, bool wait, bool flush_tlb)
+ unsigned long *bitmap, bool flush_tlb)
 {
struct kfd_process_device *pdd;
struct amdgpu_device *bo_adev = NULL;
@@ -1710,8 +1710,7 @@ svm_range_map_to_gpus(struct svm_range *prange, unsigned 
long offset,
 
r = svm_range_map_to_gpu(pdd, prange, offset, npages, readonly,
 prange->dma_addr[gpuidx],
-bo_adev, wait ? &fence : NULL,
-flush_tlb);
+bo_adev, &fence, flush_tlb);
if (r)
break;
 
@@ -1837,7 +1836,7 @@ static void *kfd_svm_page_owner(struct kfd_process *p, 
int32_t gpuidx)
 static int svm_range_validate_and_map(struct mm_struct *mm,
  unsigned long map_start, unsigned long 
map_last,
  struct svm_range *prange, int32_t gpuidx,
- bool intr, bool wait, bool flush_tlb)
+ bool intr, bool flush_tlb)
 {
struct svm_validate_context *ctx;
unsigned long start, end, addr;
@@ -1950,7 +1949,7 @@ static int svm_range_validate_and_map(struct mm_struct 
*mm,
offset = map_start_vma - prange->start;
npages = map_last_vma - map_start_vma + 1;
r = svm_range_map_to_gpus(prange, offset, 
npages, readonly,
- ctx->bitmap, wait, 
flush_tlb);
+ ctx->bitmap, 
flush_tlb);
}
}
 
@@ -2041,7 +2040,7 @@ static void svm_range_restore_work(struct work_struct 
*work)
mutex_lock(&prange->migrate_mutex);
 
r = svm_range_validate_and_map(mm, prange->start, prange->last, 
prange,
-  MAX_GPU_INSTANCE, false, true, 
false);
+  MAX_GPU_INSTANCE, false, false);
if (r)
pr_debug("failed %d to map 0x%lx to gpus\n", r,
 prange->start);
@@ -3303,7 +3302,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, 
unsigned int pasid,
mmap_read_lock(mm);
 
r = svm_range_validate_and_map(mm, start, last, prange, gpuidx, false,
-  false, false);
+  false);
if (r)
pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n",
 r, svms, start, last);
@@ -3847,7 +3846,7 @@ svm_range_set_attr(struct kfd_process *p, struct 
mm_struct *mm,
flush_tlb = !migrated && update_mapping &&
svm_range_partial_mapped(prange, prange->start, 
prange->last);
r = svm_range_validate_and_map(mm, prange->start, prange->last, 
prange,
-  MAX_GPU_INSTANCE, true, true, 
flush_tlb);
+  MAX_GPU_INSTANCE, true, 
flush_tlb);
if (r)
pr_debug("failed %d to map svm range\n", r);
 
@@ -3863,7 +3862,7 @@ svm_range_set_attr(struct kfd_process *p, struct 
mm_struct *mm,
mutex_lock(&prange->migrate_mutex);
flush_tlb = svm_range_partial_mapped(prange, prange->start, 
prange->last);
r = svm_range_validate_and_map(mm,  prange->start, 
prange->last, prange,
-  MAX_GPU_INSTANCE, true, 

[PATCH v4 2/7] drm/amdkfd: Add helper function align range start last

2024-01-15 Thread Philip Yang
Calculate the range start and last address aligned to the range
granularity size. This removes duplicated code, and the helper
functions will be used in a future patch to handle map and unmap to GPU
based on range granularity. No functional change.
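
A standalone version of the two helpers for illustration; ALIGN and
ALIGN_DOWN are the usual power-of-two macros, and the numbers assume
2MB granularity expressed in 4KB pages:

#include <stdio.h>

#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))
#define ALIGN(x, a)		(((x) + (a) - 1) & ~((a) - 1))

/* clamp a granularity-aligned window around addr to [range_start,
 * range_last], mirroring svm_range_align_start/_last below */
static unsigned long align_start(unsigned long addr,
				 unsigned long range_start,
				 unsigned long align_size)
{
	unsigned long s = ALIGN_DOWN(addr, align_size);

	return s > range_start ? s : range_start;
}

static unsigned long align_last(unsigned long addr,
				unsigned long range_last,
				unsigned long align_size)
{
	unsigned long l = ALIGN(addr + 1, align_size) - 1;

	return l < range_last ? l : range_last;
}

int main(void)
{
	/* 2MB granularity = 512 pages; fault at page 0x5ab within a range
	 * [0x400, 0x9ff] yields the aligned window [0x400, 0x5ff] */
	printf("start 0x%lx last 0x%lx\n",
	       align_start(0x5ab, 0x400, 512),
	       align_last(0x5ab, 0x9ff, 512));
	return 0;
}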

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |  4 ++--
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 10 --
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 10 ++
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index dae05f70257b..64eb9023d66b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -986,8 +986,8 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf)
 
/* Align migration range start and size to granularity size */
size = 1UL << prange->granularity;
-   start = max(ALIGN_DOWN(addr, size), prange->start);
-   last = min(ALIGN(addr + 1, size) - 1, prange->last);
+   start = svm_range_align_start(addr, prange->start, size);
+   last = svm_range_align_last(addr, prange->last, size);
 
r = svm_migrate_vram_to_ram(prange, vmf->vma->vm_mm, start, last,
KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU, 
vmf->page);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 14dbc0fd51a9..a2c96f5760ff 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -2698,10 +2698,8 @@ svm_range_get_range_boundaries(struct kfd_process *p, 
int64_t addr,
 (vma->vm_start <= vma->vm_mm->start_stack &&
  vma->vm_end >= vma->vm_mm->start_stack);
 
-   start_limit = max(vma->vm_start >> PAGE_SHIFT,
- (unsigned long)ALIGN_DOWN(addr, 2UL << 8));
-   end_limit = min(vma->vm_end >> PAGE_SHIFT,
-   (unsigned long)ALIGN(addr + 1, 2UL << 8));
+   start_limit = svm_range_align_start(addr, vma->vm_start >> PAGE_SHIFT, 
2UL << 8);
+   end_limit = svm_range_align_last(addr, (vma->vm_end >> PAGE_SHIFT) - 1, 
2UL << 8) + 1;
/* First range that starts after the fault address */
node = interval_tree_iter_first(&p->svms.objects, addr + 1, ULONG_MAX);
if (node) {
@@ -3043,8 +3041,8 @@ svm_range_restore_pages(struct amdgpu_device *adev, 
unsigned int pasid,
 
/* Align migration range start and size to granularity size */
size = 1UL << prange->granularity;
-   start = max_t(unsigned long, ALIGN_DOWN(addr, size), prange->start);
-   last = min_t(unsigned long, ALIGN(addr + 1, size) - 1, prange->last);
+   start = svm_range_align_start(addr, prange->start, size);
+   last = svm_range_align_last(addr, prange->last, size);
if (prange->actual_loc != 0 || best_loc != 0) {
migration = true;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index 026863a0abcd..806bcac6d101 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -159,6 +159,16 @@ static inline struct svm_range_bo *svm_range_bo_ref(struct 
svm_range_bo *svm_bo)
return svm_bo;
 }
 
+static inline u64 svm_range_align_start(u64 addr, u64 range_start, u64 
align_size)
+{
+   return max(ALIGN_DOWN(addr, align_size), range_start);
+}
+
+static inline u64 svm_range_align_last(u64 addr, u64 range_last, u64 
align_size)
+{
+   return min(ALIGN(addr + 1, align_size) - 1, range_last);
+}
+
 int svm_range_list_init(struct kfd_process *p);
 void svm_range_list_fini(struct kfd_process *p);
 int svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start,
-- 
2.35.1



[PATCH v4 5/7] drm/amdkfd: Change range granularity update bitmap_map

2024-01-15 Thread Philip Yang
When changing the svm range granularity, update the svm range
bitmap_map based on the new range granularity.
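
A toy model of the rescale below, with a plain unsigned integer in
place of the kernel bitmap: halving the granularity fans each old bit
out to n = 2 new bits, matching the bitmap_set(new_bits, i * n, n) case
in the hunk.

#include <stdio.h>

int main(void)
{
	unsigned int old_bits = 0x5;	/* 4 granules, #0 and #2 mapped */
	unsigned int n = 2;		/* granularity halved: 4 -> 8 bits */
	unsigned int new_bits = 0;
	unsigned int i;

	for (i = 0; i < 4; i++)
		if (old_bits & (1u << i))
			new_bits |= ((1u << n) - 1) << (i * n);

	printf("old 0x%x -> new 0x%x\n", old_bits, new_bits); /* 0x5 -> 0x33 */
	return 0;
}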

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 49 +++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 7a30c3e58234..ebc4cce801bf 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -757,6 +757,53 @@ svm_range_check_attr(struct kfd_process *p,
return 0;
 }
 
+static void
+svm_range_change_granularity(struct svm_range *prange, u8 value)
+{
+   struct kfd_process *p = container_of(prange->svms, struct kfd_process, 
svms);
+   u32 new_nbits, old_nbits, i, n;
+   unsigned long *new_bits, *old_bits;
+   u32 gpuidx;
+
+   if (prange->granularity == value)
+   return;
+
+   old_nbits = svm_range_map_nbits(prange->start, prange->last, 
prange->granularity);
+   new_nbits = svm_range_map_nbits(prange->start, prange->last, value);
+   if (new_nbits > old_nbits) {
+   n = new_nbits / old_nbits;
+   if (new_nbits % old_nbits)
+   n++;
+   } else {
+   n = old_nbits / new_nbits;
+   if (old_nbits % new_nbits)
+   n++;
+   }
+
+   pr_debug("prange 0x%p [0x%lx 0x%lx] bitmap_map nbits %d -> %d\n",
+prange, prange->start, prange->last, old_nbits, new_nbits);
+
+   for_each_set_bit(gpuidx, p->svms.bitmap_supported, p->n_pdds) {
+   old_bits = prange->bitmap_map[gpuidx];
+   if (bitmap_empty(old_bits, old_nbits))
+   continue;
+
+   new_bits = bitmap_zalloc(new_nbits, GFP_KERNEL);
+   if (!new_bits)
+   return;
+
+   for_each_set_bit(i, old_bits, old_nbits) {
+   if (new_nbits > old_nbits)
+   bitmap_set(new_bits, i * n, n);
+   else
+   bitmap_set(new_bits, i / n, 1);
+   }
+   prange->bitmap_map[gpuidx] = new_bits;
+   bitmap_free(old_bits);
+   }
+   prange->granularity = value;
+}
+
 static void
 svm_range_apply_attrs(struct kfd_process *p, struct svm_range *prange,
  uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs,
@@ -801,7 +848,7 @@ svm_range_apply_attrs(struct kfd_process *p, struct 
svm_range *prange,
prange->flags &= ~attrs[i].value;
break;
case KFD_IOCTL_SVM_ATTR_GRANULARITY:
-   prange->granularity = min_t(uint32_t, attrs[i].value, 
0x3F);
+   svm_range_change_granularity(prange, min_t(u32, 
attrs[i].value, 0x3F));
break;
default:
WARN_ONCE(1, "svm_range_check_attrs wasn't called?");
-- 
2.35.1



[PATCH v4 4/7] amd/amdkfd: Unmap range from GPU based on granularity

2024-01-15 Thread Philip Yang
When the MMU notifier invalidates a range, align the start and last
address to the range granularity to unmap from GPU and update the
bitmap_map flag. Skip the unmap from GPU if the range is already
unmapped according to the bitmap_map flag. This avoids unmapping a
single page from GPU and flushing the TLB, and also solves the rocgdb
CWSR migration related issue.

Unmapping the range from CPU removes and splits the range, which cannot
align the start and last address to the range granularity. Change to
split the range and bitmap_map flag first, then unmap the range from
GPU. If we unmapped from GPU first, the bitmap_map flag would already
be updated, and the split could compute an incorrect bitmap_map for the
remaining ranges.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 42 +++-
 1 file changed, 29 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index a003406db067..7a30c3e58234 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -2114,6 +2114,13 @@ svm_range_evict(struct svm_range *prange, struct 
mm_struct *mm,
} else {
unsigned long s, l;
uint32_t trigger;
+   u64 size = 1UL << prange->granularity;
+
+   if (!svm_range_partial_mapped(prange, start, last)) {
+   pr_debug("svms 0x%p [0x%lx 0x%lx] unmapped already\n",
+prange->svms, start, last);
+   return 0;
+   }
 
if (event == MMU_NOTIFY_MIGRATE)
trigger = KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE;
@@ -2122,16 +2129,17 @@ svm_range_evict(struct svm_range *prange, struct 
mm_struct *mm,
 
pr_debug("invalidate unmap svms 0x%p [0x%lx 0x%lx] from GPUs\n",
 prange->svms, start, last);
+
list_for_each_entry(pchild, &prange->child_list, child_list) {
mutex_lock_nested(&pchild->lock, 1);
-   s = max(start, pchild->start);
-   l = min(last, pchild->last);
+   s = svm_range_align_start(start, pchild->start, size);
+   l = svm_range_align_last(last, pchild->last, size);
if (l >= s)
svm_range_unmap_from_gpus(pchild, s, l, 
trigger);
mutex_unlock(&pchild->lock);
}
-   s = max(start, prange->start);
-   l = min(last, prange->last);
+   s = svm_range_align_start(start, prange->start, size);
+   l = svm_range_align_last(last, prange->last, size);
if (l >= s)
svm_range_unmap_from_gpus(prange, s, l, trigger);
}
@@ -2645,24 +2653,32 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct 
svm_range *prange,
 
list_for_each_entry(pchild, &prange->child_list, child_list) {
mutex_lock_nested(&pchild->lock, 1);
-   s = max(start, pchild->start);
-   l = min(last, pchild->last);
-   if (l >= s)
-   svm_range_unmap_from_gpus(pchild, s, l, trigger);
svm_range_unmap_split(mm, prange, pchild, start, last);
mutex_unlock(&pchild->lock);
}
-   s = max(start, prange->start);
-   l = min(last, prange->last);
-   if (l >= s)
-   svm_range_unmap_from_gpus(prange, s, l, trigger);
svm_range_unmap_split(mm, prange, prange, start, last);
-
if (unmap_parent)
svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE);
else
svm_range_add_list_work(svms, prange, mm,
SVM_OP_UPDATE_RANGE_NOTIFIER);
+
+   list_for_each_entry(pchild, &prange->child_list, child_list) {
+   if (pchild->work_item.op != SVM_OP_UNMAP_RANGE)
+   continue;
+
+   s = max(start, pchild->start);
+   l = min(last, pchild->last);
+   if (l >= s)
+   svm_range_unmap_from_gpus(pchild, s, l, trigger);
+   }
+   if (prange->work_item.op == SVM_OP_UNMAP_RANGE) {
+   s = max(start, prange->start);
+   l = min(last, prange->last);
+   if (l >= s)
+   svm_range_unmap_from_gpus(prange, s, l, trigger);
+   }
+
schedule_deferred_list_work(svms);
 
kfd_unref_process(p);
-- 
2.35.1



[PATCH v4 6/7] drm/amdkfd: Check bitmap_map flag to skip retry fault

2024-01-15 Thread Philip Yang
Remove the prange validate_timestamp, which is not accurate for
multiple GPUs.

Use the bitmap_map flag instead to skip retry faults from different
pages of the same granularity range if the granularity range is already
mapped on the specific GPU.
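
A sketch of the skip check with illustrative numbers; a single word
stands in for the per-GPU bitmap, and svm_range_partial_mapped_dev
itself is defined elsewhere in the series:

#include <stdio.h>

/* is the granule containing addr already marked mapped on this GPU? */
static int granule_mapped(unsigned long bitmap, unsigned long start,
			  unsigned long addr, unsigned int granularity)
{
	unsigned long bit = (addr >> granularity) - (start >> granularity);

	return (bitmap >> bit) & 1;
}

int main(void)
{
	/* range starts at page 0x400, 2^9-page granules, granule #1 mapped */
	unsigned long bitmap = 0x2;

	printf("fault at 0x6ab: %s\n",
	       granule_mapped(bitmap, 0x400, 0x6ab, 9) ? "skip" : "recover");
	return 0;
}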

Signed-off-by: Philip Yang 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 24 
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h |  1 -
 2 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index ebc4cce801bf..b36d997e7a3d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -45,10 +45,6 @@
 
 #define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1
 
-/* Long enough to ensure no retry fault comes after svm range is restored and
- * page table is updated.
- */
-#define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING   (2UL * NSEC_PER_MSEC)
 #if IS_ENABLED(CONFIG_DYNAMIC_DEBUG)
 #define dynamic_svm_range_dump(svms) \
_dynamic_func_call_no_desc("svm_range_dump", svm_range_debug_dump, svms)
@@ -380,7 +376,6 @@ svm_range *svm_range_new(struct svm_range_list *svms, 
uint64_t start,
INIT_LIST_HEAD(&prange->deferred_list);
INIT_LIST_HEAD(&prange->child_list);
atomic_set(&prange->invalid, 0);
-   prange->validate_timestamp = 0;
mutex_init(&prange->migrate_mutex);
mutex_init(&prange->lock);
 
@@ -1965,8 +1960,6 @@ static int svm_range_validate_and_map(struct mm_struct 
*mm,
}
 
svm_range_unreserve_bos(ctx);
-   if (!r)
-   prange->validate_timestamp = ktime_get_boottime();
 
 free_ctx:
kfree(ctx);
@@ -3226,15 +3219,6 @@ svm_range_restore_pages(struct amdgpu_device *adev, 
unsigned int pasid,
goto out_unlock_mm;
}
 
-   /* skip duplicate vm fault on different pages of same range */
-   if (ktime_before(timestamp, ktime_add_ns(prange->validate_timestamp,
-   AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING))) {
-   pr_debug("svms 0x%p [0x%lx %lx] already restored\n",
-svms, prange->start, prange->last);
-   r = 0;
-   goto out_unlock_mm;
-   }
-
/* __do_munmap removed VMA, return success as we are handling stale
 * retry fault.
 */
@@ -3260,6 +3244,14 @@ svm_range_restore_pages(struct amdgpu_device *adev, 
unsigned int pasid,
goto out_unlock_mm;
}
 
+   /* skip duplicate vm fault on different pages of same granularity range 
*/
+   if (svm_range_partial_mapped_dev(gpuidx, prange, addr, addr)) {
+   pr_debug("svms 0x%p [0x%lx %lx] addr 0x%llx already mapped on 
gpu %d\n",
+svms, prange->start, prange->last, addr, gpuidx);
+   r = 0;
+   goto out_unlock_mm;
+   }
+
mutex_lock(&prange->migrate_mutex);
 
pr_debug("svms %p [0x%lx 0x%lx] best restore 0x%x, actual loc 0x%x\n",
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index a10eeb77f83e..5a9688d5c18c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -129,7 +129,6 @@ struct svm_range {
uint32_tactual_loc;
uint8_t granularity;
atomic_tinvalid;
-   ktime_t validate_timestamp;
struct mmu_interval_notifiernotifier;
struct svm_work_list_item   work_item;
struct list_headdeferred_list;
-- 
2.35.1



[PATCH v4 3/7] drm/amdkfd: Add granularity size based bitmap map flag

2024-01-15 Thread Philip Yang
Replace prange->mapped_to_gpu with prange->bitmap_map[], a per-GPU flag
whose bitmap bits are based on the prange granularity. Align the
map-to-GPU and unmap-from-GPU range size to the granularity size and
update the corresponding bitmap_map flag bits. This optimizes multi-GPU
map, unmap and retry fault recovery.

svm_range_partial_mapped is false only if no part of the range is
mapped on any GPU.
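
For orientation, the assumed shape of svm_range_map_nbits (the helper
is referenced by the hunks below but defined elsewhere in the series):
the number of granularity-sized granules a page range [start, last]
overlaps.

#include <stdio.h>

static unsigned int map_nbits(unsigned long start, unsigned long last,
			      unsigned int granularity)
{
	return (last >> granularity) - (start >> granularity) + 1;
}

int main(void)
{
	/* 2^9-page (2MB) granules: pages [0x1f0, 0x410] touch 3 granules */
	printf("nbits = %u\n", map_nbits(0x1f0, 0x410, 9));
	return 0;
}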

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 258 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h |   7 +-
 2 files changed, 219 insertions(+), 46 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index a2c96f5760ff..a003406db067 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -307,12 +307,12 @@ static void svm_range_free(struct svm_range *prange, bool 
do_unmap)
KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
}
 
-   /* free dma_addr array for each gpu */
+   /* free dma_addr array, bitmap_map for each gpu */
for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) {
-   if (prange->dma_addr[gpuidx]) {
+   if (prange->dma_addr[gpuidx])
kvfree(prange->dma_addr[gpuidx]);
-   prange->dma_addr[gpuidx] = NULL;
-   }
+   if (prange->bitmap_map[gpuidx])
+   bitmap_free(prange->bitmap_map[gpuidx]);
}
 
mutex_destroy(&prange->lock);
@@ -338,19 +338,38 @@ svm_range *svm_range_new(struct svm_range_list *svms, 
uint64_t start,
uint64_t size = last - start + 1;
struct svm_range *prange;
struct kfd_process *p;
-
-   prange = kzalloc(sizeof(*prange), GFP_KERNEL);
-   if (!prange)
-   return NULL;
+   unsigned int nbits;
+   u32 gpuidx;
 
p = container_of(svms, struct kfd_process, svms);
if (!p->xnack_enabled && update_mem_usage &&
amdgpu_amdkfd_reserve_mem_limit(NULL, size << PAGE_SHIFT,
KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0)) {
pr_info("SVM mapping failed, exceeds resident system memory 
limit\n");
-   kfree(prange);
return NULL;
}
+
+   prange = kzalloc(sizeof(*prange), GFP_KERNEL);
+   if (!prange)
+   return NULL;
+
+   svm_range_set_default_attributes(&prange->preferred_loc,
+&prange->prefetch_loc,
+&prange->granularity, &prange->flags);
+
+   nbits = svm_range_map_nbits(start, last, prange->granularity);
+   pr_debug("prange 0x%p [0x%llx 0x%llx] bitmap_map nbits %d\n", prange,
+start, last, nbits);
+   for_each_set_bit(gpuidx, p->svms.bitmap_supported, p->n_pdds) {
+   prange->bitmap_map[gpuidx] = bitmap_zalloc(nbits, GFP_KERNEL);
+   if (!prange->bitmap_map[gpuidx]) {
+   while (gpuidx--)
+   bitmap_free(prange->bitmap_map[gpuidx]);
+   kfree(prange);
+   return NULL;
+   }
+   }
+
prange->npages = size;
prange->svms = svms;
prange->start = start;
@@ -369,10 +388,6 @@ svm_range *svm_range_new(struct svm_range_list *svms, 
uint64_t start,
bitmap_copy(prange->bitmap_access, svms->bitmap_supported,
MAX_GPU_INSTANCE);
 
-   svm_range_set_default_attributes(&prange->preferred_loc,
-&prange->prefetch_loc,
-&prange->granularity, &prange->flags);
-
pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last);
 
return prange;
@@ -1017,6 +1032,51 @@ svm_range_split_nodes(struct svm_range *new, struct 
svm_range *old,
return 0;
 }
 
+static int
+svm_range_split_bitmap_map(struct svm_range *new, struct svm_range *old,
+  u64 start, u64 last)
+{
+   struct kfd_process *p = container_of(new->svms, struct kfd_process, 
svms);
+   u32 new_nbits, old_nbits, old_nbits2;
+   unsigned long *bits;
+   u32 gpuidx;
+
+   new_nbits = svm_range_map_nbits(new->start, new->last, 
new->granularity);
+   old_nbits = svm_range_map_nbits(old->start, old->last, 
old->granularity);
+   old_nbits2 = svm_range_map_nbits(start, last, old->granularity);
+
+   pr_debug("old 0x%p [0x%lx 0x%lx] => [0x%llx 0x%llx] nbits %d => %d\n",
+old, old->start, old->last, start, last, old_nbits, 
old_nbits2);
+   pr_debug("new 0x%p [0x%lx 0x%lx] nbits %d\n", new, new->start, 
new->last,
+new_nbits);
+
for_each_set_bit(gpuidx, p->

[PATCH v4 1/7] drm/amdkfd: Add helper function svm_range_need_access_gpus

2024-01-15 Thread Philip Yang
Add a helper function to get the bitmap of all GPUs that need to access
the svm range. This helper will be used in a following patch to check
if a prange is mapped to all GPUs.

Refactor svm_range_validate_and_map to use the helper function, no
functional change.

Signed-off-by: Philip Yang 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 74 
 1 file changed, 43 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 18f8c82a849c..14dbc0fd51a9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1169,6 +1169,44 @@ svm_range_add_child(struct svm_range *prange, struct 
mm_struct *mm,
list_add_tail(&pchild->child_list, &prange->child_list);
 }
 
+static int
+svm_range_need_access_gpus(unsigned long *bitmap, struct svm_range *prange)
+{
+   struct kfd_process *p = container_of(prange->svms, struct kfd_process, 
svms);
+   u32 gpuidx;
+
+   if (p->xnack_enabled) {
+   bitmap_copy(bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);
+
+   /* If prefetch range to GPU, or GPU retry fault migrate range to
+* GPU, which has ACCESS attribute to the range, create mapping
+* on that GPU.
+*/
+   if (prange->actual_loc) {
+   gpuidx = kfd_process_gpuidx_from_gpuid(p, 
prange->actual_loc);
+   if (gpuidx < 0)
+   return -EINVAL;
+
+   if (test_bit(gpuidx, prange->bitmap_access))
+   bitmap_set(bitmap, gpuidx, 1);
+   }
+
+   /*
+* If prange is already mapped or with always mapped flag,
+* update mapping on GPUs with ACCESS attribute
+*/
+   if (bitmap_empty(bitmap, MAX_GPU_INSTANCE)) {
+   if (prange->mapped_to_gpu ||
+   prange->flags & 
KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)
+   bitmap_copy(bitmap, prange->bitmap_access, 
MAX_GPU_INSTANCE);
+   }
+   } else {
+   bitmap_or(bitmap, prange->bitmap_access,
+ prange->bitmap_aip, MAX_GPU_INSTANCE);
+   }
+   return 0;
+}
+
 static bool
 svm_nodes_in_same_hive(struct kfd_node *node_a, struct kfd_node *node_b)
 {
@@ -1609,38 +1647,12 @@ static int svm_range_validate_and_map(struct mm_struct 
*mm,
if (gpuidx < MAX_GPU_INSTANCE) {
bitmap_zero(ctx->bitmap, MAX_GPU_INSTANCE);
bitmap_set(ctx->bitmap, gpuidx, 1);
-   } else if (ctx->process->xnack_enabled) {
-   bitmap_copy(ctx->bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);
-
-   /* If prefetch range to GPU, or GPU retry fault migrate range to
-* GPU, which has ACCESS attribute to the range, create mapping
-* on that GPU.
-*/
-   if (prange->actual_loc) {
-   gpuidx = kfd_process_gpuidx_from_gpuid(ctx->process,
-   prange->actual_loc);
-   if (gpuidx < 0) {
-   WARN_ONCE(1, "failed get device by id 0x%x\n",
-prange->actual_loc);
-   r = -EINVAL;
-   goto free_ctx;
-   }
-   if (test_bit(gpuidx, prange->bitmap_access))
-   bitmap_set(ctx->bitmap, gpuidx, 1);
-   }
-
-   /*
-* If prange is already mapped or with always mapped flag,
-* update mapping on GPUs with ACCESS attribute
-*/
-   if (bitmap_empty(ctx->bitmap, MAX_GPU_INSTANCE)) {
-   if (prange->mapped_to_gpu ||
-   prange->flags & 
KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)
-   bitmap_copy(ctx->bitmap, prange->bitmap_access, 
MAX_GPU_INSTANCE);
-   }
} else {
-   bitmap_or(ctx->bitmap, prange->bitmap_access,
- prange->bitmap_aip, MAX_GPU_INSTANCE);
+   r = svm_range_need_access_gpus(ctx->bitmap, prange);
+   if (r) {
+   WARN_ONCE(1, "failed get device by id 0x%x\n", 
prange->actual_loc);
+   goto free_ctx;
+   }
}
 
if (bitmap_empty(ctx->bitmap, MAX_GPU_INSTANCE)) {
-- 
2.35.1



[PATCH v4] drm/amdkfd: Set correct svm range actual loc after splitting

2024-01-15 Thread Philip Yang
While partially migrating an svm range to system memory, clear the
dma_addr vram domain flag, otherwise a future split will get incorrect
vram_pages and actual loc.

After range splitting, set the new range and old range actual_loc:
the new range actual_loc is 0 if new->vram_pages is 0.
the old range actual_loc is 0 if old->vram_pages - new->vram_pages == 0.

The new range takes an svm_bo ref only if its vram_pages is not 0.
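
A worked example of the accounting rules above, with illustrative
numbers in a standalone toy struct:

#include <stdio.h>

struct toy_range {
	unsigned long vram_pages;
	unsigned int actual_loc;
};

int main(void)
{
	struct toy_range old = { .vram_pages = 0x300, .actual_loc = 0x1 };
	struct toy_range new = { .vram_pages = 0x300 };	/* pages split off */

	if (old.actual_loc && new.vram_pages) {
		old.vram_pages -= new.vram_pages;
		new.actual_loc = old.actual_loc;
		if (!old.vram_pages)
			old.actual_loc = 0;	/* old piece: system memory only */
	}
	printf("old %lx@%x new %lx@%x\n",
	       old.vram_pages, old.actual_loc, new.vram_pages, new.actual_loc);
	return 0;
}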

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |  8 +
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 42 ++--
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h |  1 +
 3 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index bdc01ca9609a..79baa195ccac 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -564,6 +564,7 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct 
svm_range *prange,
dma_addr_t *scratch, uint64_t npages)
 {
struct device *dev = adev->dev;
+   dma_addr_t *dma_addr;
uint64_t *src;
dma_addr_t *dst;
struct page *dpage;
@@ -575,6 +576,7 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct 
svm_range *prange,
 prange->last);
 
addr = migrate->start;
+   dma_addr = svm_get_dma_addr_for_page_count(prange, addr);
 
src = (uint64_t *)(scratch + npages);
dst = scratch;
@@ -623,6 +625,12 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct 
svm_range *prange,
goto out_oom;
}
 
+   /* Clear VRAM flag when page is migrated to ram, to count vram
+* pages correctly when spliting the range.
+*/
+   if (dma_addr && (dma_addr[i] & SVM_RANGE_VRAM_DOMAIN))
+   dma_addr[i] = 0;
+
pr_debug_ratelimited("dma mapping dst to 0x%llx, pfn 0x%lx\n",
 dst[i] >> PAGE_SHIFT, page_to_pfn(dpage));
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index f84547eccd28..78b4968e4c95 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -362,7 +362,6 @@ svm_range *svm_range_new(struct svm_range_list *svms, 
uint64_t start,
INIT_LIST_HEAD(&prange->child_list);
atomic_set(&prange->invalid, 0);
prange->validate_timestamp = 0;
-   prange->vram_pages = 0;
mutex_init(&prange->migrate_mutex);
mutex_init(&prange->lock);
 
@@ -965,6 +964,24 @@ svm_range_split_array(void *ppnew, void *ppold, size_t 
size,
return 0;
 }
 
+dma_addr_t *
+svm_get_dma_addr_for_page_count(struct svm_range *prange, u64 addr)
+{
+   struct kfd_process *p = container_of(prange->svms, struct kfd_process, 
svms);
+   dma_addr_t *dma_addr;
+   s32 gpuidx;
+
+   gpuidx = kfd_process_gpuidx_from_gpuid(p, prange->actual_loc);
+   if (gpuidx < 0) {
+   pr_debug("no GPU id 0x%x found\n", prange->actual_loc);
+   return NULL;
+   }
+
+   dma_addr = prange->dma_addr[gpuidx];
+   dma_addr += (addr >> PAGE_SHIFT) - prange->start;
+   return dma_addr;
+}
+
 static int
 svm_range_split_pages(struct svm_range *new, struct svm_range *old,
  uint64_t start, uint64_t last)
@@ -980,9 +997,14 @@ svm_range_split_pages(struct svm_range *new, struct 
svm_range *old,
if (r)
return r;
}
-   if (old->actual_loc)
+   if (old->actual_loc && new->vram_pages) {
old->vram_pages -= new->vram_pages;
-
+   new->actual_loc = old->actual_loc;
+   if (!old->vram_pages)
+   old->actual_loc = 0;
+   }
+   pr_debug("new->vram_pages 0x%llx loc 0x%x old->vram_pages 0x%llx loc 
0x%x\n",
+new->vram_pages, new->actual_loc, old->vram_pages, 
old->actual_loc);
return 0;
 }
 
@@ -1002,13 +1024,14 @@ svm_range_split_nodes(struct svm_range *new, struct 
svm_range *old,
new->offset = old->offset + npages;
}
 
-   new->svm_bo = svm_range_bo_ref(old->svm_bo);
-   new->ttm_res = old->ttm_res;
-
-   spin_lock(&new->svm_bo->list_lock);
-   list_add(&new->svm_bo_list, &new->svm_bo->range_list);
-   spin_unlock(&new->svm_bo->list_lock);
+   if (new->vram_pages) {
+   new->svm_bo = svm_range_bo_ref(old->svm_bo);
+   new->ttm_res = old->ttm_res;
 
+   spin_lock(&new->svm_bo->list_lock);
+   list_add(&new->svm_bo_list, &new->svm_bo->range_list);
+   spin_unlock(&new->svm_bo->list_lock);
+   }
return 0;
 

[PATCH] drm/amdkfd: Correct partial migration virtual addr

2024-01-15 Thread Philip Yang
Partial migration to system memory should use migrate.addr, not
prange->start, as the virtual address to allocate the system memory
page.

Fixes: 18eb61bd5a6a ("drm/amdkfd: Use partial migrations/mapping for GPU/CPU 
page faults in SVM")
Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index f856901055d3..bdc01ca9609a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -574,7 +574,7 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, prange->start,
 prange->last);
 
-   addr = prange->start << PAGE_SHIFT;
+   addr = migrate->start;
 
src = (uint64_t *)(scratch + npages);
dst = scratch;
-- 
2.35.1

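A quick user-space sketch (made-up numbers, assuming 4K pages; nothing
below is driver code) of why this one-liner matters: a partial migration
window can begin well inside the range, so deriving the address from
prange->start hands the allocator the wrong page.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	uint64_t prange_start = 0x100;                    /* page number */
	uint64_t migrate_start = 0x180ULL << PAGE_SHIFT;  /* window start, bytes */

	uint64_t old_addr = prange_start << PAGE_SHIFT;   /* whole-range start */
	uint64_t new_addr = migrate_start;                /* migrated window start */

	printf("old 0x%llx, new 0x%llx, off by 0x%llx\n",
	       (unsigned long long)old_addr, (unsigned long long)new_addr,
	       (unsigned long long)(new_addr - old_addr));
	return 0;
}

For a fault at the start of the range the two values coincide, which is
why the bug only became visible with partial migration.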


Re: [PATCH v3] amd/amdkfd: Set correct svm range actual loc after spliting

2024-01-11 Thread Philip Yang

On 2024-01-11 12:37, Chen, Xiaogang wrote:
> On 1/11/2024 10:54 AM, Felix Kuehling wrote:
>> On 2024-01-10 17:01, Philip Yang wrote:
>>> While partially migrating an svm range to system memory, clear the
>>> dma_addr vram domain flag, otherwise a future split will get incorrect
>>> vram_pages and actual loc.
>>>
>>> After range splitting, set new range and old range actual_loc:
>>> new range actual_loc is 0 if new->vram_pages is 0.
>>> old range actual_loc is 0 if old->vram_pages - new->vram_pages == 0.
>>>
>>> new range takes svm_bo ref only if vram_pages is not 0.
>>>
>>> Signed-off-by: Philip Yang
>>> ---
>>>  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 20 +++-
>>>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 24 ++--
>>>  2 files changed, 33 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
>>> index f856901055d3..dae05f70257b 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
>>> @@ -563,18 +563,30 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
>>>  		struct migrate_vma *migrate, struct dma_fence **mfence,
>>>  		dma_addr_t *scratch, uint64_t npages)
>>>  {
>>> +	struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);
>>>  	struct device *dev = adev->dev;
>>> +	dma_addr_t *dma_addr;
>>>  	uint64_t *src;
>>>  	dma_addr_t *dst;
>>>  	struct page *dpage;
>>>  	uint64_t i = 0, j;
>>>  	uint64_t addr;
>>> +	s32 gpuidx;
>>> +	u64 offset;
>>>  	int r = 0;
>>>
>>>  	pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, prange->start,
>>>  		 prange->last);
>>>
>>> -	addr = prange->start << PAGE_SHIFT;
>>
>> Is this another bug fix for partial migration? If so, it may be worth
>> making that a separate patch.
>
> Seems it is also a bug when prange is across multiple vma. With partial
> migration it becomes obvious.

yes

>>> +	gpuidx = kfd_process_gpuidx_from_gpuid(p, prange->actual_loc);
>>> +	if (gpuidx < 0) {
>>> +		pr_debug("no GPU id 0x%x found\n", prange->actual_loc);
>>> +		return -EINVAL;
>>> +	}
>>> +
>>> +	addr = migrate->start;
>>> +	offset = (addr >> PAGE_SHIFT) - prange->start;
>>> +	dma_addr = prange->dma_addr[gpuidx];
>>>
>>>  	src = (uint64_t *)(scratch + npages);
>>>  	dst = scratch;
>>>
>>> @@ -623,6 +635,12 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
>>>  			goto out_oom;
>>>  		}
>>>
>>> +		/* Clear VRAM flag when page is migrated to ram, to count vram
>>> +		 * pages correctly when splitting the range.
>>> +		 */
>>> +		if (dma_addr && (dma_addr[offset + i] & SVM_RANGE_VRAM_DOMAIN))
>>> +			dma_addr[offset + i] = 0;
>>> +
>
> When we come here we already know the page has been moved to system ram;
> do we still need to check dma_addr[offset + i] & SVM_RANGE_VRAM_DOMAIN?
> You want to set dma_addr[offset + i] = 0 anyway.

I agree, the dma_addr NULL and flag check is for safety, in case we
change how dma_addr is updated after migration or prefetch later.

>> I'm not a big fan of messing with the DMA arrays here, but I don't have
>> a good alternative. I think what bothers me is, how the DMA address
>> array and handling of vram page count is now spread out across so many
>> places. It feels fragile.



Re: [PATCH v3] amd/amdkfd: Set correct svm range actual loc after spliting

2024-01-11 Thread Philip Yang

On 2024-01-11 11:54, Felix Kuehling wrote:
> On 2024-01-10 17:01, Philip Yang wrote:
>> While partially migrating an svm range to system memory, clear the
>> dma_addr vram domain flag, otherwise a future split will get incorrect
>> vram_pages and actual loc.
>>
>> After range splitting, set new range and old range actual_loc:
>> new range actual_loc is 0 if new->vram_pages is 0.
>> old range actual_loc is 0 if old->vram_pages - new->vram_pages == 0.
>>
>> new range takes svm_bo ref only if vram_pages is not 0.
>>
>> Signed-off-by: Philip Yang
>> ---
>>  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 20 +++-
>>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 24 ++--
>>  2 files changed, 33 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
>> index f856901055d3..dae05f70257b 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
>> @@ -563,18 +563,30 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
>>  		struct migrate_vma *migrate, struct dma_fence **mfence,
>>  		dma_addr_t *scratch, uint64_t npages)
>>  {
>> +	struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);
>>  	struct device *dev = adev->dev;
>> +	dma_addr_t *dma_addr;
>>  	uint64_t *src;
>>  	dma_addr_t *dst;
>>  	struct page *dpage;
>>  	uint64_t i = 0, j;
>>  	uint64_t addr;
>> +	s32 gpuidx;
>> +	u64 offset;
>>  	int r = 0;
>>
>>  	pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, prange->start,
>>  		 prange->last);
>>
>> -	addr = prange->start << PAGE_SHIFT;
>
> Is this another bug fix for partial migration? If so, it may be worth
> making that a separate patch.

yes, it is another bug I just noticed. The addr is passed to alloc the
system page along with migrate.vma, but addr is ignored for the normal
path and only used for the shmem path, so maybe it doesn't matter. I
will put this into a separate patch anyway.

>> +	gpuidx = kfd_process_gpuidx_from_gpuid(p, prange->actual_loc);
>> +	if (gpuidx < 0) {
>> +		pr_debug("no GPU id 0x%x found\n", prange->actual_loc);
>> +		return -EINVAL;
>> +	}
>> +
>> +	addr = migrate->start;
>> +	offset = (addr >> PAGE_SHIFT) - prange->start;
>> +	dma_addr = prange->dma_addr[gpuidx];
>>
>>  	src = (uint64_t *)(scratch + npages);
>>  	dst = scratch;
>>
>> @@ -623,6 +635,12 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
>>  			goto out_oom;
>>  		}
>>
>> +		/* Clear VRAM flag when page is migrated to ram, to count vram
>> +		 * pages correctly when splitting the range.
>> +		 */
>> +		if (dma_addr && (dma_addr[offset + i] & SVM_RANGE_VRAM_DOMAIN))
>> +			dma_addr[offset + i] = 0;
>> +
>
> I'm not a big fan of messing with the DMA arrays here, but I don't have
> a good alternative. I think what bothers me is, how the DMA address
> array and handling of vram page count is now spread out across so many
> places. It feels fragile.
>
> Maybe it would be good to add a helper in kfd_svm.c:
> svm_get_dma_addr_for_page_count(prange, offset). That way you can keep
> the choice of gpuid and offset calculation in one place in kfd_svm.c,
> close to svm_range_copy_array.

vram page counting is only used when splitting the range; it is a good
idea to add the helper and put it close to the svm range split and copy
array code, not in the migration path.

Regards,
Philip

> Other than that, the patch looks good to me.
>
> Regards,
>   Felix

>>  		pr_debug_ratelimited("dma mapping dst to

[PATCH v3] amd/amdkfd: Set correct svm range actual loc after spliting

2024-01-10 Thread Philip Yang
While partially migrating an svm range to system memory, clear the
dma_addr vram domain flag, otherwise a future split will get incorrect
vram_pages and actual loc.

After range splitting, set new range and old range actual_loc:
new range actual_loc is 0 if new->vram_pages is 0.
old range actual_loc is 0 if old->vram_pages - new->vram_pages == 0.

new range takes svm_bo ref only if vram_pages is not 0.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 20 +++-
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 24 ++--
 2 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index f856901055d3..dae05f70257b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -563,18 +563,30 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
struct migrate_vma *migrate, struct dma_fence **mfence,
dma_addr_t *scratch, uint64_t npages)
 {
+   struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);
struct device *dev = adev->dev;
+   dma_addr_t *dma_addr;
uint64_t *src;
dma_addr_t *dst;
struct page *dpage;
uint64_t i = 0, j;
uint64_t addr;
+   s32 gpuidx;
+   u64 offset;
int r = 0;
 
pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, prange->start,
 prange->last);
 
-   addr = prange->start << PAGE_SHIFT;
+   gpuidx = kfd_process_gpuidx_from_gpuid(p, prange->actual_loc);
+   if (gpuidx < 0) {
+   pr_debug("no GPU id 0x%x found\n", prange->actual_loc);
+   return -EINVAL;
+   }
+
+   addr = migrate->start;
+   offset = (addr >> PAGE_SHIFT) - prange->start;
+   dma_addr = prange->dma_addr[gpuidx];
 
src = (uint64_t *)(scratch + npages);
dst = scratch;
@@ -623,6 +635,12 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
goto out_oom;
}
 
+   /* Clear VRAM flag when page is migrated to ram, to count vram
+* pages correctly when splitting the range.
+*/
+   if (dma_addr && (dma_addr[offset + i] & SVM_RANGE_VRAM_DOMAIN))
+   dma_addr[offset + i] = 0;
+
pr_debug_ratelimited("dma mapping dst to 0x%llx, pfn 0x%lx\n",
 dst[i] >> PAGE_SHIFT, page_to_pfn(dpage));
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index cc24f30f88fb..35ee9e648cca 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -362,7 +362,6 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
INIT_LIST_HEAD(&prange->child_list);
atomic_set(&prange->invalid, 0);
prange->validate_timestamp = 0;
-   prange->vram_pages = 0;
mutex_init(&prange->migrate_mutex);
mutex_init(&prange->lock);
 
@@ -980,9 +979,14 @@ svm_range_split_pages(struct svm_range *new, struct svm_range *old,
if (r)
return r;
}
-   if (old->actual_loc)
+   if (old->actual_loc && new->vram_pages) {
old->vram_pages -= new->vram_pages;
-
+   new->actual_loc = old->actual_loc;
+   if (!old->vram_pages)
+   old->actual_loc = 0;
+   }
+   pr_debug("new->vram_pages 0x%llx loc 0x%x old->vram_pages 0x%llx loc 
0x%x\n",
+new->vram_pages, new->actual_loc, old->vram_pages, 
old->actual_loc);
return 0;
 }
 
@@ -1002,13 +1006,14 @@ svm_range_split_nodes(struct svm_range *new, struct svm_range *old,
new->offset = old->offset + npages;
}
 
-   new->svm_bo = svm_range_bo_ref(old->svm_bo);
-   new->ttm_res = old->ttm_res;
-
-   spin_lock(&new->svm_bo->list_lock);
-   list_add(&new->svm_bo_list, &new->svm_bo->range_list);
-   spin_unlock(&new->svm_bo->list_lock);
+   if (new->vram_pages) {
+   new->svm_bo = svm_range_bo_ref(old->svm_bo);
+   new->ttm_res = old->ttm_res;
 
+   spin_lock(&new->svm_bo->list_lock);
+   list_add(&new->svm_bo_list, &new->svm_bo->range_list);
+   spin_unlock(&new->svm_bo->list_lock);
+   }
return 0;
 }
 
@@ -1058,7 +1068,6 @@ svm_range_split_adjust(struct svm_range *new, struct svm_range *old,
new->flags = old->flags;
new->preferred_loc = old->preferred_loc;
new->prefetch_loc = old->prefetch_loc;
-   new->actual_loc = old->actual_loc;
new->granularity = old->granularity;
new->mapped_to_gpu = old->mapped_to_gpu;
bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
-- 
2.35.1

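The accounting rules in the commit message fit in a few lines; here is a
user-space model (toy_range is a stand-in for struct svm_range, not the
real type) that mirrors the svm_range_split_pages() logic above:

#include <stdint.h>
#include <stdio.h>

struct toy_range {
	uint64_t vram_pages;
	uint32_t actual_loc;    /* 0 means "in system memory" */
};

static void toy_split_accounting(struct toy_range *old, struct toy_range *new)
{
	if (old->actual_loc && new->vram_pages) {
		old->vram_pages -= new->vram_pages;
		new->actual_loc = old->actual_loc;
		if (!old->vram_pages)
			old->actual_loc = 0;    /* nothing left in vram */
	}
}

int main(void)
{
	struct toy_range old = { .vram_pages = 4, .actual_loc = 1 };
	struct toy_range new = { .vram_pages = 4 };  /* split takes all vram pages */

	toy_split_accounting(&old, &new);
	printf("old: %llu pages loc %u; new: %llu pages loc %u\n",
	       (unsigned long long)old.vram_pages, old.actual_loc,
	       (unsigned long long)new.vram_pages, new.actual_loc);
	/* prints: old: 0 pages loc 0; new: 4 pages loc 1 */
	return 0;
}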


Re: [PATCH v2] amd/amdkfd: Set correct svm range actual loc after spliting

2024-01-10 Thread Philip Yang

On 2024-01-10 11:30, Felix Kuehling wrote:
> On 2024-01-09 15:05, Philip Yang wrote:
>> After partially migrating an svm range to system memory, unmap to clean
>> up the corresponding dma_addr vram domain flag, otherwise a future split
>> will get incorrect vram_pages and actual loc.
>>
>> After range splitting, set new range and old range actual_loc:
>> new range actual_loc is 0 if new->vram_pages is 0.
>> old range actual_loc is 0 if old->vram_pages - new->vram_pages == 0.
>>
>> new range takes svm_bo ref only if vram_pages is not 0.
>>
>> Signed-off-by: Philip Yang
>> ---
>>  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |  3 ++
>>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 35 +++-
>>  drivers/gpu/drm/amd/amdkfd/kfd_svm.h |  3 +-
>>  3 files changed, 27 insertions(+), 14 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
>> index f856901055d3..e85bcda29db6 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
>> @@ -839,6 +839,9 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
>>  		prange->actual_loc = 0;
>>  		svm_range_vram_node_free(prange);
>>  	}
>> +
>> +	svm_range_dma_unmap(prange, start_mgr - prange->start,
>> +			last_mgr - start_mgr + 1);
>
> If this is just for clearing the VRAM flags, then we should probably
> create another helper function for that. DMA unmapping system memory
> pages that didn't even move is not necessary here.
>
> Also, as Xiaogang pointed out, the migration may have missed some pages
> due to page locking race conditions. If you want this to give you
> accurate VRAM page counts, you should only clear the VRAM flags for
> pages that were actually migrated.

ok, understand the concern now. If we fail to migrate a page to system
memory to recover a CPU page fault, the app will crash, but prefetch may
fail to migrate pages to system memory. Will send a new patch to clear
the prange->dma_addr[gpuidx] VRAM flags while migrating the range to ram.

Regards,
Philip

> Regards,
>   Felix

>>  	}
>>
>>  	return r < 0 ? r : 0;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> index cc24f30f88fb..2202bdcde057 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> @@ -254,6 +254,10 @@ void svm_range_dma_unmap_dev(struct device *dev, dma_addr_t *dma_addr,
>>  		return;
>>
>>  	for (i = offset; i < offset + npages; i++) {
>> +		if (dma_addr[i] & SVM_RANGE_VRAM_DOMAIN) {
>> +			dma_addr[i] = 0;
>> +			continue;
>> +		}
>>  		if (!svm_is_valid_dma_mapping_addr(dev, dma_addr[i]))
>>  			continue;
>>  		pr_debug_ratelimited("unmap 0x%llx\n", dma_addr[i] >> PAGE_SHIFT);
>> @@ -262,7 +266,8 @@ void svm_range_dma_unmap_dev(struct device *dev, dma_addr_t *dma_addr,
>>  	}
>>  }
>>
>> -void svm_range_dma_unmap(struct svm_range *prange)
>> +void svm_range_dma_unmap(struct svm_range *prange, unsigned long offset,
>> +			 unsigned long npages)
>>  {
>>  	struct kfd_process_device *pdd;
>>  	dma_addr_t *dma_addr;
>> @@ -284,7 +289,7 @@ void svm_range_dma_unmap(struct svm_range *prange)
>>  	}
>>  	dev = &pdd->dev->adev->pdev->dev;
>>
>> -	svm_range_dma_unmap_dev(dev, dma_addr, 0, prange->npages);
>> +	svm_range_dma_unmap_dev(dev, dma_addr, offset, npages);
>>  	}
>>  }
>>
>> @@ -299,7 +304,7 @@ static void svm_range_free(struct svm_range *prange, bool do_unmap)
>>  	svm_range_vram_node_free(prange);
>>  	if (do_unmap)
>> -		svm_range_dma_unmap(prang

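Felix's point above can be modeled in user space (the migrated[] mask is
a made-up stand-in for the per-page status the driver gets back from
migrate_vma; none of this is driver code): clearing the flag only for
pages that actually moved is what keeps the vram count accurate.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_VRAM_FLAG (1ULL << 63)

int main(void)
{
	uint64_t dma_addr[4] = { 0x1000 | TOY_VRAM_FLAG, 0x2000 | TOY_VRAM_FLAG,
				 0x3000 | TOY_VRAM_FLAG, 0x4000 | TOY_VRAM_FLAG };
	bool migrated[4] = { true, false, true, true };  /* page 1 was missed */
	unsigned long vram_pages = 4;

	for (int i = 0; i < 4; i++) {
		if (!migrated[i])
			continue;        /* page stayed in VRAM: keep the flag */
		if (dma_addr[i] & TOY_VRAM_FLAG) {
			dma_addr[i] = 0;
			vram_pages--;    /* one fewer page left in VRAM */
		}
	}
	printf("vram_pages now %lu\n", vram_pages);  /* prints 3 */
	return 0;
}

Clearing all four flags unconditionally would report vram_pages == 0
even though one page never left VRAM.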
Re: [PATCH v2] amd/amdkfd: Set correct svm range actual loc after spliting

2024-01-10 Thread Philip Yang

On 2024-01-09 17:29, Chen, Xiaogang wrote:
> On 1/9/2024 2:05 PM, Philip Yang wrote:
>> After partially migrating an svm range to system memory, unmap to clean
>> up the corresponding dma_addr vram domain flag, otherwise a future split
>> will get incorrect vram_pages and actual loc.
>>
>> After range splitting, set new range and old range actual_loc:
>> new range actual_loc is 0 if new->vram_pages is 0.
>> old range actual_loc is 0 if old->vram_pages - new->vram_pages == 0.
>>
>> new range takes svm_bo ref only if vram_pages is not 0.
>>
>> Signed-off-by: Philip Yang
>> ---
>>  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |  3 ++
>>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 35 +++-
>>  drivers/gpu/drm/amd/amdkfd/kfd_svm.h |  3 +-
>>  3 files changed, 27 insertions(+), 14 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
>> index f856901055d3..e85bcda29db6 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
>> @@ -839,6 +839,9 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
>>  		prange->actual_loc = 0;
>>  		svm_range_vram_node_free(prange);
>>  	}
>> +
>> +	svm_range_dma_unmap(prange, start_mgr - prange->start,
>> +			last_mgr - start_mgr + 1);
>
> when come here we know some pages got migrated to sys ram, in theory we
> do not know if all pages got migrated. svm_range_dma_unmap does
> dma_unmap for all pages from start_mgr - prange->start to
> last_mgr - start_mgr + 1.
>
> If there are pages not migrated due to some reason (though it is rare)
> we still need keep its dma_addr, I think only hmm can tell that.

For system pages, calling dma_unmap_page and setting dma_addr=0 after
migration is fine, because before updating the GPU mapping,
svm_range_validate_and_map calls svm_range_dma_map to update dma_addr
for system pages.

>>  	}
>>
>>  	return r < 0 ? r : 0;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> index cc24f30f88fb..2202bdcde057 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> @@ -254,6 +254,10 @@ void svm_range_dma_unmap_dev(struct device *dev, dma_addr_t *dma_addr,
>>  		return;
>>
>>  	for (i = offset; i < offset + npages; i++) {
>> +		if (dma_addr[i] & SVM_RANGE_VRAM_DOMAIN) {
>> +			dma_addr[i] = 0;
>> +			continue;
>> +		}
>
> same as above here set dma_addr[i]=0 unconditionally without knowing if
> the page is indeed in sys ram.

dma_addr[i] & SVM_RANGE_VRAM_DOMAIN is for device pages; system pages
will still call dma_unmap_page.

>>  		if (!svm_is_valid_dma_mapping_addr(dev, dma_addr[i]))
>>  			continue;
>>  		pr_debug_ratelimited("unmap 0x%llx\n", dma_addr[i] >> PAGE_SHIFT);
>> @@ -262,7 +266,8 @@ void svm_range_dma_unmap_dev(struct device *dev, dma_addr_t *dma_addr,
>>  	}
>>  }
>>
>> -void svm_range_dma_unmap(struct svm_range *prange)
>> +void svm_range_dma_unmap(struct svm_range *prange, unsigned long offset,
>> +			 unsigned long npages)
>>  {
>>  	struct kfd_process_device *pdd;
>>  	dma_addr_t *dma_addr;
>> @@ -284,7 +289,7 @@ void svm_range_dma_unmap(struct svm_range *prange)
>>  	}
>>  	dev = &pdd->dev->adev->pdev->dev;
>>
>> -	svm_range_dma_unmap_dev(dev, dma_addr, 0, prange->npages);
>> +	svm_range_dma_unmap_dev(dev, dma_addr, offset, npages);
>>  	}
>>  }
>>
>> @@ -299,7 +304,7 @@ static void svm_range_free(struct svm_range *prange, bool do_unmap)
>>  	svm_range_vram_node_free(prange);
>>  	if 

[PATCH v2] amd/amdkfd: Set correct svm range actual loc after spliting

2024-01-09 Thread Philip Yang
After partially migrating an svm range to system memory, unmap to clean
up the corresponding dma_addr vram domain flag, otherwise a future split
will get incorrect vram_pages and actual loc.

After range splitting, set new range and old range actual_loc:
new range actual_loc is 0 if new->vram_pages is 0.
old range actual_loc is 0 if old->vram_pages - new->vram_pages == 0.

new range takes svm_bo ref only if vram_pages is not 0.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 35 +++-
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h |  3 +-
 3 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index f856901055d3..e85bcda29db6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -839,6 +839,9 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
prange->actual_loc = 0;
svm_range_vram_node_free(prange);
}
+
+   svm_range_dma_unmap(prange, start_mgr - prange->start,
+   last_mgr - start_mgr + 1);
}
 
return r < 0 ? r : 0;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index cc24f30f88fb..2202bdcde057 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -254,6 +254,10 @@ void svm_range_dma_unmap_dev(struct device *dev, dma_addr_t *dma_addr,
return;
 
for (i = offset; i < offset + npages; i++) {
+   if (dma_addr[i] & SVM_RANGE_VRAM_DOMAIN) {
+   dma_addr[i] = 0;
+   continue;
+   }
if (!svm_is_valid_dma_mapping_addr(dev, dma_addr[i]))
continue;
pr_debug_ratelimited("unmap 0x%llx\n", dma_addr[i] >> 
PAGE_SHIFT);
@@ -262,7 +266,8 @@ void svm_range_dma_unmap_dev(struct device *dev, dma_addr_t *dma_addr,
}
 }
 
-void svm_range_dma_unmap(struct svm_range *prange)
+void svm_range_dma_unmap(struct svm_range *prange, unsigned long offset,
+unsigned long npages)
 {
struct kfd_process_device *pdd;
dma_addr_t *dma_addr;
@@ -284,7 +289,7 @@ void svm_range_dma_unmap(struct svm_range *prange)
}
dev = &pdd->dev->adev->pdev->dev;
 
-   svm_range_dma_unmap_dev(dev, dma_addr, 0, prange->npages);
+   svm_range_dma_unmap_dev(dev, dma_addr, offset, npages);
}
 }
 
@@ -299,7 +304,7 @@ static void svm_range_free(struct svm_range *prange, bool do_unmap)
 
svm_range_vram_node_free(prange);
if (do_unmap)
-   svm_range_dma_unmap(prange);
+   svm_range_dma_unmap(prange, 0, prange->npages);
 
if (do_unmap && !p->xnack_enabled) {
pr_debug("unreserve prange 0x%p size: 0x%llx\n", prange, size);
@@ -362,7 +367,6 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
INIT_LIST_HEAD(&prange->child_list);
atomic_set(&prange->invalid, 0);
prange->validate_timestamp = 0;
-   prange->vram_pages = 0;
mutex_init(&prange->migrate_mutex);
mutex_init(&prange->lock);
 
@@ -980,9 +984,14 @@ svm_range_split_pages(struct svm_range *new, struct svm_range *old,
if (r)
return r;
}
-   if (old->actual_loc)
+   if (old->actual_loc && new->vram_pages) {
old->vram_pages -= new->vram_pages;
-
+   new->actual_loc = old->actual_loc;
+   if (!old->vram_pages)
+   old->actual_loc = 0;
+   }
+   pr_debug("new->vram_pages 0x%llx loc 0x%x old->vram_pages 0x%llx loc 
0x%x\n",
+new->vram_pages, new->actual_loc, old->vram_pages, 
old->actual_loc);
return 0;
 }
 
@@ -1002,13 +1011,14 @@ svm_range_split_nodes(struct svm_range *new, struct svm_range *old,
new->offset = old->offset + npages;
}
 
-   new->svm_bo = svm_range_bo_ref(old->svm_bo);
-   new->ttm_res = old->ttm_res;
-
-   spin_lock(&new->svm_bo->list_lock);
-   list_add(&new->svm_bo_list, &new->svm_bo->range_list);
-   spin_unlock(&new->svm_bo->list_lock);
+   if (new->vram_pages) {
+   new->svm_bo = svm_range_bo_ref(old->svm_bo);
+   new->ttm_res = old->ttm_res;
 
+   spin_lock(&new->svm_bo->list_lock);
+   list_add(&new->svm_bo_list, &new->svm_bo->range_list);
+   spin_unlock(&new->svm_bo->list_lock);
+   }
return 0;
 }
 
@@ -1058,7 +1068,6 @@ svm_

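A user-space sketch of the unmap window the v2 patch introduces (types
and the flag value are simplified stand-ins): only [offset, offset +
npages) is walked, and entries carrying the VRAM domain flag are simply
zeroed, since they were never dma-mapped system pages.

#include <stdint.h>
#include <stdio.h>

#define TOY_VRAM_DOMAIN (1ULL << 63)

static void toy_dma_unmap(uint64_t *dma_addr, unsigned long offset,
			  unsigned long npages)
{
	for (unsigned long i = offset; i < offset + npages; i++) {
		if (dma_addr[i] & TOY_VRAM_DOMAIN) {
			dma_addr[i] = 0;    /* vram entry: just clear the flag */
			continue;
		}
		if (!dma_addr[i])
			continue;           /* never mapped */
		/* the real code would dma_unmap_page() here */
		dma_addr[i] = 0;
	}
}

int main(void)
{
	uint64_t dma_addr[6] = { 0x1000, 0x2000 | TOY_VRAM_DOMAIN, 0x3000,
				 0x4000 | TOY_VRAM_DOMAIN, 0x5000, 0 };

	toy_dma_unmap(dma_addr, 1, 3);      /* touch pages 1..3 only */
	for (int i = 0; i < 6; i++)
		printf("dma_addr[%d] = 0x%llx\n", i,
		       (unsigned long long)dma_addr[i]);
	return 0;
}

Entries 0, 4 and 5 are left alone, which is the whole point of passing
offset and npages instead of always walking prange->npages.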
Re: [PATCH] amd/amdkfd: Set correct svm range actual loc after spliting

2024-01-09 Thread Philip Yang

On 2024-01-08 18:17, Chen, Xiaogang wrote:
> With a nitpick below, this patch is
>
> Reviewed-by: Xiaogang Chen
>
> On 1/8/2024 4:36 PM, Philip Yang wrote:
>> After range splitting, set new range and old range actual_loc:
>> new range actual_loc is 0 if new->vram_pages is 0.
>> old range actual_loc is 0 if old->vram_pages - new->vram_pages == 0.
>>
>> Signed-off-by: Philip Yang
>> ---
>>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 8 +---
>>  1 file changed, 5 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> index cc24f30f88fb..cb09e1d3a643 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> @@ -362,7 +362,6 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
>>  	INIT_LIST_HEAD(&prange->child_list);
>>  	atomic_set(&prange->invalid, 0);
>>  	prange->validate_timestamp = 0;
>> -	prange->vram_pages = 0;
>
> I think it is better to keep it, and also add:
>
> +	new->actual_loc = 0;
>
> though not necessary as prange is allocated by kzalloc, just to keep
> consistent with the previous statements; or remove
>
> 	atomic_set(&prange->invalid, 0);
> 	prange->validate_timestamp = 0;
>
> too.

kzalloc memsets prange to 0, so we should remove the unnecessary 0
assignments. prange->validate_timestamp will be removed completely in a
following patch.

Will send out a v2 patch to fix other related issues.

Regards,
Philip

> Regards
> Xiaogang

>>  	mutex_init(&prange->migrate_mutex);
>>  	mutex_init(&prange->lock);
>>
>> @@ -980,8 +979,12 @@ svm_range_split_pages(struct svm_range *new, struct svm_range *old,
>>  		if (r)
>>  			return r;
>>  	}
>> -	if (old->actual_loc)
>> +	if (old->actual_loc && new->vram_pages) {
>>  		old->vram_pages -= new->vram_pages;
>> +		new->actual_loc = old->actual_loc;
>> +		if (!old->vram_pages)
>> +			old->actual_loc = 0;
>> +	}
>>
>>  	return 0;
>>  }
>>
>> @@ -1058,7 +1061,6 @@ svm_range_split_adjust(struct svm_range *new, struct svm_range *old,
>>  	new->flags = old->flags;
>>  	new->preferred_loc = old->preferred_loc;
>>  	new->prefetch_loc = old->prefetch_loc;
>> -	new->actual_loc = old->actual_loc;
>>  	new->granularity = old->granularity;
>>  	new->mapped_to_gpu = old->mapped_to_gpu;
>>  	bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);


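A small illustration of the kzalloc point discussed above (calloc plays
the role of kzalloc here and toy_range is a made-up struct): a zeroing
allocator already leaves every field at 0, so explicit "= 0" assignments
right after allocation are redundant.

#include <stdio.h>
#include <stdlib.h>

struct toy_range {
	unsigned long vram_pages;
	unsigned int actual_loc;
	long validate_timestamp;
};

int main(void)
{
	struct toy_range *prange = calloc(1, sizeof(*prange));

	if (!prange)
		return 1;
	/* no assignments needed: all three fields are already zero */
	printf("%lu %u %ld\n", prange->vram_pages, prange->actual_loc,
	       prange->validate_timestamp);
	free(prange);
	return 0;
}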

  1   2   3   4   5   6   7   >