[PATCH 30/34] drm/amdkfd: refine migration policy with xnack on

2021-04-14 Thread Felix Kuehling
With xnack on, GPU vm fault handler decide the best restore location,
then migrate range to the best restore location and update GPU mapping
to recover the GPU vm fault.

Signed-off-by: Philip Yang 
Signed-off-by: Alex Sierra 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |   7 +-
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.h |   3 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|   3 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c |  16 +++
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 136 +--
 5 files changed, 150 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 5e0fec5dbcd6..9c1199d1b61b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -480,18 +480,19 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, 
struct svm_range *prange,
  * svm_migrate_ram_to_vram - migrate svm range from system to device
  * @prange: range structure
  * @best_loc: the device to migrate to
+ * @mm: the process mm structure
  *
  * Context: Process context, caller hold mmap read lock, svms lock, prange lock
  *
  * Return:
  * 0 - OK, otherwise error code
  */
-int svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc)
+int svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc,
+   struct mm_struct *mm)
 {
unsigned long addr, start, end;
struct vm_area_struct *vma;
struct amdgpu_device *adev;
-   struct mm_struct *mm;
int r = 0;
 
if (prange->actual_loc == best_loc) {
@@ -515,8 +516,6 @@ int svm_migrate_ram_to_vram(struct svm_range *prange, 
uint32_t best_loc)
start = prange->start << PAGE_SHIFT;
end = (prange->last + 1) << PAGE_SHIFT;
 
-   mm = current->mm;
-
for (addr = start; addr < end;) {
unsigned long next;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
index 082b9bb22270..53c899b80b85 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
@@ -38,7 +38,8 @@ enum MIGRATION_COPY_DIR {
FROM_VRAM_TO_RAM
 };
 
-int svm_migrate_ram_to_vram(struct svm_range *prange,  uint32_t best_loc);
+int svm_migrate_ram_to_vram(struct svm_range *prange,  uint32_t best_loc,
+   struct mm_struct *mm);
 int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm);
 unsigned long
 svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index ca1b0c518d46..bce44164f1e3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -864,6 +864,9 @@ struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid);
 struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm);
 
 int kfd_process_gpuidx_from_gpuid(struct kfd_process *p, uint32_t gpu_id);
+int kfd_process_gpuid_from_kgd(struct kfd_process *p,
+  struct amdgpu_device *adev, uint32_t *gpuid,
+  uint32_t *gpuidx);
 static inline int kfd_process_gpuid_from_gpuidx(struct kfd_process *p,
uint32_t gpuidx, uint32_t *gpuid) {
return gpuidx < p->n_pdds ? p->pdds[gpuidx]->dev->id : -EINVAL;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index b6ba2b91bdc3..dfc9e4bf7c72 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1681,6 +1681,22 @@ int kfd_process_gpuidx_from_gpuid(struct kfd_process *p, 
uint32_t gpu_id)
return -EINVAL;
 }
 
+int
+kfd_process_gpuid_from_kgd(struct kfd_process *p, struct amdgpu_device *adev,
+  uint32_t *gpuid, uint32_t *gpuidx)
+{
+   struct kgd_dev *kgd = (struct kgd_dev *)adev;
+   int i;
+
+   for (i = 0; i < p->n_pdds; i++)
+   if (p->pdds[i] && p->pdds[i]->dev->kgd == kgd) {
+   *gpuid = p->pdds[i]->dev->id;
+   *gpuidx = i;
+   return 0;
+   }
+   return -EINVAL;
+}
+
 static void evict_process_worker(struct work_struct *work)
 {
int ret;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 6d3add17fcac..40e1dc42a981 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1317,6 +1317,24 @@ static int svm_range_validate_and_map(struct mm_struct 
*mm,
if (gpuidx < MAX_GPU_INSTANCE) {
bitmap_zero(ctx.bitmap, MAX_GPU_INSTANCE);
bitmap_set(ctx.bitmap, gpuidx, 1);
+   } else if (ctx.process->xnack_enabled) {
+   bitmap_copy(ctx.bitmap, prange->bitmap_aip, 

[PATCH 30/34] drm/amdkfd: refine migration policy with xnack on

2021-04-05 Thread Felix Kuehling
With xnack on, GPU vm fault handler decide the best restore location,
then migrate range to the best restore location and update GPU mapping
to recover the GPU vm fault.

Signed-off-by: Philip Yang 
Signed-off-by: Alex Sierra 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |   7 +-
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.h |   3 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|   3 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c |  16 +++
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 136 +--
 5 files changed, 150 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 6d22ca0819ee..a4069e76bf8d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -479,18 +479,19 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, 
struct svm_range *prange,
  * svm_migrate_ram_to_vram - migrate svm range from system to device
  * @prange: range structure
  * @best_loc: the device to migrate to
+ * @mm: the process mm structure
  *
  * Context: Process context, caller hold mmap read lock, svms lock, prange lock
  *
  * Return:
  * 0 - OK, otherwise error code
  */
-int svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc)
+int svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc,
+   struct mm_struct *mm)
 {
unsigned long addr, start, end;
struct vm_area_struct *vma;
struct amdgpu_device *adev;
-   struct mm_struct *mm;
int r = 0;
 
if (prange->actual_loc == best_loc) {
@@ -514,8 +515,6 @@ int svm_migrate_ram_to_vram(struct svm_range *prange, 
uint32_t best_loc)
start = prange->start << PAGE_SHIFT;
end = (prange->last + 1) << PAGE_SHIFT;
 
-   mm = current->mm;
-
for (addr = start; addr < end;) {
unsigned long next;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
index 95fd7b21791f..9949b55d3b6a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
@@ -37,7 +37,8 @@ enum MIGRATION_COPY_DIR {
FROM_VRAM_TO_RAM
 };
 
-int svm_migrate_ram_to_vram(struct svm_range *prange,  uint32_t best_loc);
+int svm_migrate_ram_to_vram(struct svm_range *prange,  uint32_t best_loc,
+   struct mm_struct *mm);
 int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm);
 unsigned long
 svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index ca1b0c518d46..bce44164f1e3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -864,6 +864,9 @@ struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid);
 struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm);
 
 int kfd_process_gpuidx_from_gpuid(struct kfd_process *p, uint32_t gpu_id);
+int kfd_process_gpuid_from_kgd(struct kfd_process *p,
+  struct amdgpu_device *adev, uint32_t *gpuid,
+  uint32_t *gpuidx);
 static inline int kfd_process_gpuid_from_gpuidx(struct kfd_process *p,
uint32_t gpuidx, uint32_t *gpuid) {
return gpuidx < p->n_pdds ? p->pdds[gpuidx]->dev->id : -EINVAL;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index f897c1d0ea66..1d6310f63ae9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1673,6 +1673,22 @@ int kfd_process_gpuidx_from_gpuid(struct kfd_process *p, 
uint32_t gpu_id)
return -EINVAL;
 }
 
+int
+kfd_process_gpuid_from_kgd(struct kfd_process *p, struct amdgpu_device *adev,
+  uint32_t *gpuid, uint32_t *gpuidx)
+{
+   struct kgd_dev *kgd = (struct kgd_dev *)adev;
+   int i;
+
+   for (i = 0; i < p->n_pdds; i++)
+   if (p->pdds[i] && p->pdds[i]->dev->kgd == kgd) {
+   *gpuid = p->pdds[i]->dev->id;
+   *gpuidx = i;
+   return 0;
+   }
+   return -EINVAL;
+}
+
 static void evict_process_worker(struct work_struct *work)
 {
int ret;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index adb8e69fbd28..da2ca264753c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1321,6 +1321,24 @@ static int svm_range_validate_and_map(struct mm_struct 
*mm,
if (gpuidx < MAX_GPU_INSTANCE) {
bitmap_zero(ctx.bitmap, MAX_GPU_INSTANCE);
bitmap_set(ctx.bitmap, gpuidx, 1);
+   } else if (ctx.process->xnack_enabled) {
+   bitmap_copy(ctx.bitmap, prange->bitmap_aip, 

[PATCH 30/34] drm/amdkfd: refine migration policy with xnack on

2021-03-31 Thread Felix Kuehling
With xnack on, GPU vm fault handler decide the best restore location,
then migrate range to the best restore location and update GPU mapping
to recover the GPU vm fault.

Signed-off-by: Philip Yang 
Signed-off-by: Alex Sierra 
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |   7 +-
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.h |   3 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|   3 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c |  16 +++
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 136 +--
 5 files changed, 150 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 552c4f656e2d..73701406acb3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -479,18 +479,19 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, 
struct svm_range *prange,
  * svm_migrate_ram_to_vram - migrate svm range from system to device
  * @prange: range structure
  * @best_loc: the device to migrate to
+ * @mm: the process mm structure
  *
  * Context: Process context, caller hold mmap read lock, svms lock, prange lock
  *
  * Return:
  * 0 - OK, otherwise error code
  */
-int svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc)
+int svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc,
+   struct mm_struct *mm)
 {
unsigned long addr, start, end;
struct vm_area_struct *vma;
struct amdgpu_device *adev;
-   struct mm_struct *mm;
int r = 0;
 
if (prange->actual_loc == best_loc) {
@@ -514,8 +515,6 @@ int svm_migrate_ram_to_vram(struct svm_range *prange, 
uint32_t best_loc)
start = prange->start << PAGE_SHIFT;
end = (prange->last + 1) << PAGE_SHIFT;
 
-   mm = current->mm;
-
for (addr = start; addr < end;) {
unsigned long next;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
index 95fd7b21791f..9949b55d3b6a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
@@ -37,7 +37,8 @@ enum MIGRATION_COPY_DIR {
FROM_VRAM_TO_RAM
 };
 
-int svm_migrate_ram_to_vram(struct svm_range *prange,  uint32_t best_loc);
+int svm_migrate_ram_to_vram(struct svm_range *prange,  uint32_t best_loc,
+   struct mm_struct *mm);
 int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm);
 unsigned long
 svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index ca1b0c518d46..bce44164f1e3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -864,6 +864,9 @@ struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid);
 struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm);
 
 int kfd_process_gpuidx_from_gpuid(struct kfd_process *p, uint32_t gpu_id);
+int kfd_process_gpuid_from_kgd(struct kfd_process *p,
+  struct amdgpu_device *adev, uint32_t *gpuid,
+  uint32_t *gpuidx);
 static inline int kfd_process_gpuid_from_gpuidx(struct kfd_process *p,
uint32_t gpuidx, uint32_t *gpuid) {
return gpuidx < p->n_pdds ? p->pdds[gpuidx]->dev->id : -EINVAL;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index f897c1d0ea66..1d6310f63ae9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1673,6 +1673,22 @@ int kfd_process_gpuidx_from_gpuid(struct kfd_process *p, 
uint32_t gpu_id)
return -EINVAL;
 }
 
+int
+kfd_process_gpuid_from_kgd(struct kfd_process *p, struct amdgpu_device *adev,
+  uint32_t *gpuid, uint32_t *gpuidx)
+{
+   struct kgd_dev *kgd = (struct kgd_dev *)adev;
+   int i;
+
+   for (i = 0; i < p->n_pdds; i++)
+   if (p->pdds[i] && p->pdds[i]->dev->kgd == kgd) {
+   *gpuid = p->pdds[i]->dev->id;
+   *gpuidx = i;
+   return 0;
+   }
+   return -EINVAL;
+}
+
 static void evict_process_worker(struct work_struct *work)
 {
int ret;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 77da6c68fab2..61bf2df38e72 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1283,6 +1283,24 @@ static int svm_range_validate_and_map(struct mm_struct 
*mm,
if (gpuidx < MAX_GPU_INSTANCE) {
bitmap_zero(ctx.bitmap, MAX_GPU_INSTANCE);
bitmap_set(ctx.bitmap, gpuidx, 1);
+   } else if (ctx.process->xnack_enabled) {
+   bitmap_copy(ctx.bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);
+
+