[PATCH] drm/amdgpu: Force order between a read and write to the same address

2023-11-20 Thread Alex Sierra
Set a register to force ordering, preventing read/write and write/read
hazards in un-cached modes.

Signed-off-by: Alex Sierra 
---
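Note: soc15_program_register_sequence() applies each golden setting as a
read-modify-write. A minimal sketch of that logic, assuming the usual
and_mask/or_mask semantics of SOC15_REG_GOLDEN_VALUE (sketch only, not a
quote of the helper):

	u32 tmp = RREG32(reg);                      /* read current value */
	tmp &= ~entry->and_mask;                    /* clear the masked field */
	tmp |= (entry->or_mask & entry->and_mask);  /* force the new bits on */
	WREG32(reg, tmp);                           /* write back */

This is why the TCP_CNTL ordering bit can be forced on without disturbing
the rest of the register.
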
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c| 8 
 .../gpu/drm/amd/include/asic_reg/gc/gc_11_0_0_offset.h| 2 ++
 2 files changed, 10 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 0c6133cc5e57..40ce12323164 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -89,6 +89,10 @@ MODULE_FIRMWARE("amdgpu/gc_11_5_0_me.bin");
 MODULE_FIRMWARE("amdgpu/gc_11_5_0_mec.bin");
 MODULE_FIRMWARE("amdgpu/gc_11_5_0_rlc.bin");
 
+static const struct soc15_reg_golden golden_settings_gc_11_0[] = {
+   SOC15_REG_GOLDEN_VALUE(GC, 0, regTCP_CNTL, 0x2000, 0x2000)
+};
+
 static const struct soc15_reg_golden golden_settings_gc_11_0_1[] =
 {
SOC15_REG_GOLDEN_VALUE(GC, 0, regCGTT_GS_NGG_CLK_CTRL, 0x9fff8fff, 
0x0010),
@@ -304,6 +308,10 @@ static void gfx_v11_0_init_golden_registers(struct 
amdgpu_device *adev)
default:
break;
}
+   soc15_program_register_sequence(adev,
+   golden_settings_gc_11_0,
+					(const u32)ARRAY_SIZE(golden_settings_gc_11_0));
+
 }
 
 static void gfx_v11_0_write_data_to_reg(struct amdgpu_ring *ring, int eng_sel,
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_0_offset.h
index c92c4b83253f..4bff1ef8a9a6 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_0_offset.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_0_offset.h
@@ -6369,6 +6369,8 @@
 #define regTCP_INVALIDATE_BASE_IDX                                        1
 #define regTCP_STATUS                                                     0x19a1
 #define regTCP_STATUS_BASE_IDX                                            1
+#define regTCP_CNTL                                                       0x19a2
+#define regTCP_CNTL_BASE_IDX                                              1
 #define regTCP_CNTL2                                                      0x19a3
 #define regTCP_CNTL2_BASE_IDX                                             1
 #define regTCP_DEBUG_INDEX                                                0x19a5
-- 
2.32.0



[PATCH] drm/amdgpu: Force order between a read and write to the same address

2023-11-18 Thread Alex Sierra
Set a register to force ordering, preventing read/write and write/read
hazards in un-cached modes.

Signed-off-by: Alex Sierra 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c| 22 +--
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c|  8 +++
 .../include/asic_reg/gc/gc_11_0_0_offset.h|  2 ++
 3 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 68747a52e5c2..592ac993f013 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -3193,7 +3193,8 @@ static const struct soc15_reg_golden 
golden_settings_gc_10_3[] = {
SOC15_REG_GOLDEN_VALUE(GC, 0, mmSQ_PERFCOUNTER9_SELECT, 0xf0f001ff, 
0x),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmSX_DEBUG_1, 0x0001, 0x00010020),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmTA_CNTL_AUX, 0xfff7, 0x0103),
-   SOC15_REG_GOLDEN_VALUE(GC, 0, mmUTCL1_CTRL, 0xffbf, 0x00a0)
+   SOC15_REG_GOLDEN_VALUE(GC, 0, mmUTCL1_CTRL, 0xffbf, 0x00a0),
+   SOC15_REG_GOLDEN_VALUE(GC, 0, mmTCP_CNTL, 0x2000, 0x2000)
 };
 
 static const struct soc15_reg_golden golden_settings_gc_10_3_sienna_cichlid[] 
= {
@@ -3245,6 +3246,7 @@ static const struct soc15_reg_golden 
golden_settings_gc_10_3_2[] = {
 
/* This is not in GDB yet. Don't remove it. It fixes a GPU hang on Navy 
Flounder. */
SOC15_REG_GOLDEN_VALUE(GC, 0, mmLDS_CONFIG,  0x0020, 0x0020),
+   SOC15_REG_GOLDEN_VALUE(GC, 0, mmTCP_CNTL, 0x2000, 0x2000)
 };
 
 static const struct soc15_reg_golden golden_settings_gc_10_3_vangogh[] = {
@@ -3275,6 +3277,7 @@ static const struct soc15_reg_golden 
golden_settings_gc_10_3_vangogh[] = {
 
/* This is not in GDB yet. Don't remove it. It fixes a GPU hang on 
VanGogh. */
SOC15_REG_GOLDEN_VALUE(GC, 0, mmLDS_CONFIG,  0x0020, 0x0020),
+   SOC15_REG_GOLDEN_VALUE(GC, 0, mmTCP_CNTL, 0x2000, 0x2000)
 };
 
 static const struct soc15_reg_golden golden_settings_gc_10_3_3[] = {
@@ -3297,7 +3300,8 @@ static const struct soc15_reg_golden 
golden_settings_gc_10_3_3[] = {
SOC15_REG_GOLDEN_VALUE(GC, 0, mmPA_SC_BINNER_TIMEOUT_COUNTER, 
0x, 0x0800),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmPA_SC_ENHANCE_2, 0xffbf, 
0x0820),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmTA_CNTL_AUX, 0xfff7, 0x0103),
-   SOC15_REG_GOLDEN_VALUE(GC, 0, mmUTCL1_CTRL, 0x, 0x0010)
+   SOC15_REG_GOLDEN_VALUE(GC, 0, mmUTCL1_CTRL, 0x, 0x0010),
+   SOC15_REG_GOLDEN_VALUE(GC, 0, mmTCP_CNTL, 0x2000, 0x2000)
 };
 
 static const struct soc15_reg_golden golden_settings_gc_10_3_4[] = {
@@ -3336,7 +3340,8 @@ static const struct soc15_reg_golden 
golden_settings_gc_10_3_4[] = {
SOC15_REG_GOLDEN_VALUE(GC, 0, mmSX_DEBUG_1, 0x0001, 0x00010020),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmTA_CNTL_AUX, 0x0103, 0x0103),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmUTCL1_CTRL, 0x03a0, 0x00a0),
-   SOC15_REG_GOLDEN_VALUE(GC, 0, mmLDS_CONFIG,  0x0020, 0x0020)
+   SOC15_REG_GOLDEN_VALUE(GC, 0, mmLDS_CONFIG,  0x0020, 0x0020),
+   SOC15_REG_GOLDEN_VALUE(GC, 0, mmTCP_CNTL, 0x2000, 0x2000)
 };
 
 static const struct soc15_reg_golden golden_settings_gc_10_3_5[] = {
@@ -3371,7 +3376,8 @@ static const struct soc15_reg_golden 
golden_settings_gc_10_3_5[] = {
SOC15_REG_GOLDEN_VALUE(GC, 0, mmSQ_PERFCOUNTER8_SELECT, 0xf0f001ff, 
0x),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmSQ_PERFCOUNTER9_SELECT, 0xf0f001ff, 
0x),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmTA_CNTL_AUX, 0xfff7, 0x0103),
-   SOC15_REG_GOLDEN_VALUE(GC, 0, mmUTCL1_CTRL, 0xffbf, 0x00a0)
+   SOC15_REG_GOLDEN_VALUE(GC, 0, mmUTCL1_CTRL, 0xffbf, 0x00a0),
+   SOC15_REG_GOLDEN_VALUE(GC, 0, mmTCP_CNTL, 0x2000, 0x2000)
 };
 
 static const struct soc15_reg_golden golden_settings_gc_10_0_cyan_skillfish[] 
= {
@@ -3408,7 +3414,7 @@ static const struct soc15_reg_golden 
golden_settings_gc_10_0_cyan_skillfish[] =
SOC15_REG_GOLDEN_VALUE(GC, 0, mmSQ_ARB_CONFIG, 0x0100, 0x0130),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmSQ_LDS_CLK_CTRL, 0x, 
0x),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmTA_CNTL_AUX, 0x00030008, 0x0103),
-   SOC15_REG_GOLDEN_VALUE(GC, 0, mmUTCL1_CTRL, 0x0080, 0x0080)
+   SOC15_REG_GOLDEN_VALUE(GC, 0, mmUTCL1_CTRL, 0x0080, 0x0080),
 };
 
 static const struct soc15_reg_golden golden_settings_gc_10_3_6[] = {
@@ -3433,7 +3439,8 @@ static const struct soc15_reg_golden 
golden_settings_gc_10_3_6[] = {
SOC15_REG_GOLDEN_VALUE(GC, 0, mmSQG_CONFIG, 0x17ff, 0x1000),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmSX_DEBUG_1, 0xff7f, 0x00010020),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmTA_CNTL_AUX, 0xfff7, 0x0103),
-   SOC15_REG_GOLDEN_VALUE(GC, 0, mmUTCL1_CTRL, 0x

[PATCH] drm/amdkfd: remap unaligned svm ranges that have split

2023-10-19 Thread Alex Sierra
SVM ranges that have been mapped with 2MB page table entries require a
remap if a split happens at a non-aligned VA.
[WHY]:
This condition causes the 2MB page table entries to be split into 4KB
PTEs.

Signed-off-by: Alex Sierra 
---
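Note: the remap decision keys off the range's migration granularity
(2MB blocks by default). A hedged sketch of the check, reusing names from
the hunks below (align is in pages, assuming a 4K base page size):

	unsigned long align = 1UL << prange->granularity; /* pages per block */

	/* a head split at new_start that is not block-aligned leaves the
	 * boundary inside a former 2MB PTE, so queue the range for remap
	 */
	if (!IS_ALIGNED(new_start, align))
		list_add(&head->update_list, remap_list);
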
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 43 +---
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 7b81233bc9ae..aa2996d6f818 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1104,26 +1104,32 @@ svm_range_split(struct svm_range *prange, uint64_t 
start, uint64_t last,
 }
 
 static int
-svm_range_split_tail(struct svm_range *prange,
-		     uint64_t new_last, struct list_head *insert_list)
+svm_range_split_tail(struct svm_range *prange, uint64_t new_last,
+		     struct list_head *insert_list, struct list_head *remap_list)
 {
 	struct svm_range *tail;
 	int r = svm_range_split(prange, prange->start, new_last, &tail);
 
-	if (!r)
+	if (!r) {
 		list_add(&tail->list, insert_list);
+		if (!IS_ALIGNED(new_last + 1, 1UL << prange->granularity))
+			list_add(&tail->update_list, remap_list);
+	}
 	return r;
 }
 
 static int
-svm_range_split_head(struct svm_range *prange,
-		     uint64_t new_start, struct list_head *insert_list)
+svm_range_split_head(struct svm_range *prange, uint64_t new_start,
+		     struct list_head *insert_list, struct list_head *remap_list)
 {
 	struct svm_range *head;
 	int r = svm_range_split(prange, new_start, prange->last, &head);
 
-	if (!r)
+	if (!r) {
 		list_add(&head->list, insert_list);
+		if (!IS_ALIGNED(new_start, 1UL << prange->granularity))
+			list_add(&head->update_list, remap_list);
+	}
 	return r;
 }
 
@@ -2113,7 +2119,7 @@ static int
 svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
  uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs,
  struct list_head *update_list, struct list_head *insert_list,
- struct list_head *remove_list)
+ struct list_head *remove_list, struct list_head *remap_list)
 {
unsigned long last = start + size - 1UL;
 	struct svm_range_list *svms = &p->svms;
@@ -2129,6 +2135,7 @@ svm_range_add(struct kfd_process *p, uint64_t start, 
uint64_t size,
INIT_LIST_HEAD(insert_list);
INIT_LIST_HEAD(remove_list);
 	INIT_LIST_HEAD(&new_list);
+   INIT_LIST_HEAD(remap_list);
 
 	node = interval_tree_iter_first(&svms->objects, start, last);
while (node) {
@@ -2153,6 +2160,7 @@ svm_range_add(struct kfd_process *p, uint64_t start, 
uint64_t size,
struct svm_range *old = prange;
 
prange = svm_range_clone(old);
+
if (!prange) {
r = -ENOMEM;
goto out;
@@ -2161,18 +2169,17 @@ svm_range_add(struct kfd_process *p, uint64_t start, 
uint64_t size,
 			list_add(&old->update_list, remove_list);
 			list_add(&prange->list, insert_list);
 			list_add(&prange->update_list, update_list);
-
 			if (node->start < start) {
 				pr_debug("change old range start\n");
 				r = svm_range_split_head(prange, start,
-							 insert_list);
+							 insert_list, remap_list);
 				if (r)
 					goto out;
 			}
 			if (node->last > last) {
 				pr_debug("change old range last\n");
 				r = svm_range_split_tail(prange, last,
-							 insert_list);
+							 insert_list, remap_list);
 				if (r)
 					goto out;
 			}
@@ -3565,6 +3572,7 @@ svm_range_set_attr(struct kfd_process *p, struct 
mm_struct *mm,
struct list_head update_list;
struct list_head insert_list;
struct list_head remove_list;
+   struct list_head remap_list;
struct svm_range_list *svms;
struct svm_range *prange;
struct svm_range *next;
@@ -3596,7 +3604,7 @@ svm_range_set_attr(struct kfd_process *p, struct 
mm_struct *mm,
 
/* Add new range and split existing ranges as needed */
 	r = svm_range_add(p, start, size, nattr, attrs, &update_list,
-			  &insert_list, &remove_list);
+   

[PATCH] drm/amdkfd: remap unaligned svm ranges that have split

2023-10-18 Thread Alex Sierra
SVM ranges that have been mapped with 2MB page table entries require a
remap if a split happens at a non-aligned VA.
[WHY]:
This condition causes the 2MB page table entries to be split into 4KB
PTEs.

Signed-off-by: Alex Sierra 
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 45 +---
 1 file changed, 34 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 7b81233bc9ae..1dd9a1cf2358 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1104,26 +1104,34 @@ svm_range_split(struct svm_range *prange, uint64_t 
start, uint64_t last,
 }
 
 static int
-svm_range_split_tail(struct svm_range *prange,
-		     uint64_t new_last, struct list_head *insert_list)
+svm_range_split_tail(struct svm_range *prange, uint64_t new_last,
+		     struct list_head *insert_list, struct list_head *remap_list)
 {
 	struct svm_range *tail;
 	int r = svm_range_split(prange, prange->start, new_last, &tail);
 
-	if (!r)
+	if (!r) {
 		list_add(&tail->list, insert_list);
+		if (!IS_ALIGNED(tail->last + 1 - tail->start,
+				1UL << tail->granularity))
+			list_add(&tail->update_list, remap_list);
+	}
 	return r;
 }
 
 static int
-svm_range_split_head(struct svm_range *prange,
-		     uint64_t new_start, struct list_head *insert_list)
+svm_range_split_head(struct svm_range *prange, uint64_t new_start,
+		     struct list_head *insert_list, struct list_head *remap_list)
 {
 	struct svm_range *head;
 	int r = svm_range_split(prange, new_start, prange->last, &head);
 
-	if (!r)
+	if (!r) {
 		list_add(&head->list, insert_list);
+		if (!IS_ALIGNED(head->last + 1 - head->start,
+				1UL << head->granularity))
+			list_add(&head->update_list, remap_list);
+	}
 	return r;
 }
 
@@ -2113,7 +2121,7 @@ static int
 svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
  uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs,
  struct list_head *update_list, struct list_head *insert_list,
- struct list_head *remove_list)
+ struct list_head *remove_list, struct list_head *remap_list)
 {
unsigned long last = start + size - 1UL;
 	struct svm_range_list *svms = &p->svms;
@@ -2129,6 +2137,7 @@ svm_range_add(struct kfd_process *p, uint64_t start, 
uint64_t size,
INIT_LIST_HEAD(insert_list);
INIT_LIST_HEAD(remove_list);
 	INIT_LIST_HEAD(&new_list);
+   INIT_LIST_HEAD(remap_list);
 
 	node = interval_tree_iter_first(&svms->objects, start, last);
while (node) {
@@ -2153,6 +2162,7 @@ svm_range_add(struct kfd_process *p, uint64_t start, 
uint64_t size,
struct svm_range *old = prange;
 
prange = svm_range_clone(old);
+
if (!prange) {
r = -ENOMEM;
goto out;
@@ -2161,18 +2171,17 @@ svm_range_add(struct kfd_process *p, uint64_t start, 
uint64_t size,
 			list_add(&old->update_list, remove_list);
 			list_add(&prange->list, insert_list);
 			list_add(&prange->update_list, update_list);
-
 			if (node->start < start) {
 				pr_debug("change old range start\n");
 				r = svm_range_split_head(prange, start,
-							 insert_list);
+							 insert_list, remap_list);
 				if (r)
 					goto out;
 			}
 			if (node->last > last) {
 				pr_debug("change old range last\n");
 				r = svm_range_split_tail(prange, last,
-							 insert_list);
+							 insert_list, remap_list);
 				if (r)
 					goto out;
 			}
@@ -3565,6 +3574,7 @@ svm_range_set_attr(struct kfd_process *p, struct 
mm_struct *mm,
struct list_head update_list;
struct list_head insert_list;
struct list_head remove_list;
+   struct list_head remap_list;
struct svm_range_list *svms;
struct svm_range *prange;
struct svm_range *next;
@@ -3596,7 +3606,7 @@ svm_range_set_attr(struct kfd_process *p, struct 
mm_struct *mm,
 
/* Add new range and split existing ranges as needed */
r = svm_range_add(p, start, size

[PATCH] drm/amdkfd: use mask to get v9 interrupt sq data bits correctly

2023-08-28 Thread Alex Sierra
Interrupt SQ data bits were not extracted correctly from contextid0 and
contextid1. Use the KFD_CONTEXT_ID_GET_SQ_INT_DATA macro instead.

Signed-off-by: Alex Sierra 
---
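Note: the SQ interrupt payload spans both context-id words; the
authoritative bit layout is the KFD_CONTEXT_ID_GET_SQ_INT_DATA macro in
kfd_int_process_v9.c. An illustrative sketch of the idea only (bit
positions assumed, not taken from the header):

	/* illustrative: gather the low data bits from contextid0 and the
	 * remaining bits from contextid1, instead of masking one word
	 */
	#define SQ_INT_DATA_SKETCH(ctx0, ctx1) \
		(((ctx0) & 0xfff) | (((ctx1) << 12) & 0xfff000))
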
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index f0731a6a5306..830396b1c3b1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -384,7 +384,7 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
default:
break;
}
-			kfd_signal_event_interrupt(pasid, context_id0 & 0xffffff, 24);
+   kfd_signal_event_interrupt(pasid, sq_int_data, 24);
} else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) {
kfd_set_dbg_ev_from_interrupt(dev, pasid,
KFD_DEBUG_DOORBELL_ID(context_id0),
-- 
2.32.0



[PATCH] drm/amdkfd: retry after EBUSY is returned from hmm_range_get_pages

2023-08-16 Thread Alex Sierra
If hmm_range_get_pages returns an EBUSY error during
svm_range_validate_and_map, within the context of a page-fault
interrupt, the operation should be retried through the
svm_range_restore_pages callback. Therefore, treat this as an EAGAIN
error instead and defer it to the restore-pages fallback.

Signed-off-by: Alex Sierra 
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 93609ea42163..3ebd5d99f39e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1685,6 +1685,8 @@ static int svm_range_validate_and_map(struct mm_struct 
*mm,
WRITE_ONCE(p->svms.faulting_task, NULL);
if (r) {
pr_debug("failed %d to get svm range pages\n", r);
+   if (r == -EBUSY)
+   r = -EAGAIN;
goto unreserve_out;
}
 
-- 
2.32.0



[PATCH] drm/amdkfd: wrap dynamic debug call with CONFIG_DYNAMIC_DEBUG_CORE

2023-08-04 Thread Alex Sierra
Otherwise, compilation fails when CONFIG_DYNAMIC_DEBUG_CORE is not
defined.

Signed-off-by: Alex Sierra 
---
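Note: _dynamic_func_call_no_desc() only exists when the dynamic-debug
core is built, hence the build break. A common alternative in the kernel,
shown here as a sketch (this is not what the patch below does), is a
no-op fallback so the call sites can stay unguarded:

	#if defined(CONFIG_DYNAMIC_DEBUG_CORE)
	#define dynamic_svm_range_dump(svms) \
		_dynamic_func_call_no_desc("svm_range_dump", \
					   svm_range_debug_dump, svms)
	#else
	/* if (0) keeps the function referenced (no unused-function
	 * warning) while compiling to nothing
	 */
	#define dynamic_svm_range_dump(svms) \
		do { if (0) svm_range_debug_dump(svms); } while (0)
	#endif
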
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index a69994ff1c2f..cde4cc6afa83 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -824,6 +824,7 @@ svm_range_is_same_attrs(struct kfd_process *p, struct 
svm_range *prange,
  *
  * Context: The caller must hold svms->lock
  */
+#if defined(CONFIG_DYNAMIC_DEBUG_CORE)
 static void svm_range_debug_dump(struct svm_range_list *svms)
 {
struct interval_tree_node *node;
@@ -851,6 +852,7 @@ static void svm_range_debug_dump(struct svm_range_list 
*svms)
node = interval_tree_iter_next(node, 0, ~0ULL);
}
 }
+#endif
 
 static void *
 svm_range_copy_array(void *psrc, size_t size, uint64_t num_elements,
@@ -3594,7 +3596,9 @@ svm_range_set_attr(struct kfd_process *p, struct 
mm_struct *mm,
break;
}
 
+#if defined(CONFIG_DYNAMIC_DEBUG_CORE)
dynamic_svm_range_dump(svms);
+#endif
 
 	mutex_unlock(&svms->lock);
mmap_read_unlock(mm);
-- 
2.32.0



[PATCH] drm/amdkfd: avoid unmap dma address when svm_ranges are split

2023-07-28 Thread Alex Sierra
DMA address references within svm_ranges should be unmapped only after
the memory has been released from the system. In case of range
splitting, the DMA address information should be copied to the
corresponding ranges after the split, leaving the DMA mapping intact.

Signed-off-by: Alex Sierra 
---
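Note: the ownership rule being implemented, as a hedged sketch
(simplified from the hunks below; variable names illustrative):

	/* on a split, clone the per-GPU dma_addr arrays; the underlying
	 * pages are still mapped, so no dma-unmap happens here
	 */
	r = svm_range_copy_dma_addrs(new_range, old_range);

	/* only a real release of the memory unmaps, via unmap_dma = true */
	svm_range_free_dma_mappings(prange, true);
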
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |  7 +--
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 61 +---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h |  2 +-
 3 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 709ac885ca6d..7d82c7da223a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -461,7 +461,6 @@ svm_migrate_vma_to_vram(struct kfd_node *node, struct 
svm_range *prange,
0, node->id, trigger);
 
svm_range_dma_unmap(adev->dev, scratch, 0, npages);
-   svm_range_free_dma_mappings(prange);
 
 out_free:
kvfree(buf);
@@ -543,10 +542,12 @@ svm_migrate_ram_to_vram(struct svm_range *prange, 
uint32_t best_loc,
addr = next;
}
 
-   if (cpages)
+   if (cpages) {
prange->actual_loc = best_loc;
-   else
+   svm_range_free_dma_mappings(prange, true);
+   } else {
svm_range_vram_node_free(prange);
+   }
 
return r < 0 ? r : 0;
 }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 1b50eae051a4..a69994ff1c2f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -241,7 +241,7 @@ void svm_range_dma_unmap(struct device *dev, dma_addr_t 
*dma_addr,
}
 }
 
-void svm_range_free_dma_mappings(struct svm_range *prange)
+void svm_range_free_dma_mappings(struct svm_range *prange, bool unmap_dma)
 {
struct kfd_process_device *pdd;
dma_addr_t *dma_addr;
@@ -262,13 +262,14 @@ void svm_range_free_dma_mappings(struct svm_range *prange)
continue;
}
 		dev = &pdd->dev->adev->pdev->dev;
-   svm_range_dma_unmap(dev, dma_addr, 0, prange->npages);
+   if (unmap_dma)
+   svm_range_dma_unmap(dev, dma_addr, 0, prange->npages);
kvfree(dma_addr);
prange->dma_addr[gpuidx] = NULL;
}
 }
 
-static void svm_range_free(struct svm_range *prange, bool update_mem_usage)
+static void svm_range_free(struct svm_range *prange, bool do_unmap)
 {
uint64_t size = (prange->last - prange->start + 1) << PAGE_SHIFT;
struct kfd_process *p = container_of(prange->svms, struct kfd_process, 
svms);
@@ -277,9 +278,9 @@ static void svm_range_free(struct svm_range *prange, bool 
update_mem_usage)
 prange->start, prange->last);
 
svm_range_vram_node_free(prange);
-   svm_range_free_dma_mappings(prange);
+   svm_range_free_dma_mappings(prange, do_unmap);
 
-   if (update_mem_usage && !p->xnack_enabled) {
+   if (do_unmap && !p->xnack_enabled) {
pr_debug("unreserve prange 0x%p size: 0x%llx\n", prange, size);
amdgpu_amdkfd_unreserve_mem_limit(NULL, size,
KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
@@ -851,6 +852,37 @@ static void svm_range_debug_dump(struct svm_range_list 
*svms)
}
 }
 
+static void *
+svm_range_copy_array(void *psrc, size_t size, uint64_t num_elements,
+uint64_t offset)
+{
+   unsigned char *dst;
+
+   dst = kvmalloc_array(num_elements, size, GFP_KERNEL);
+   if (!dst)
+   return NULL;
+   memcpy(dst, (unsigned char *)psrc + offset, num_elements * size);
+
+   return (void *)dst;
+}
+
+static int
+svm_range_copy_dma_addrs(struct svm_range *dst, struct svm_range *src)
+{
+   int i;
+
+   for (i = 0; i < MAX_GPU_INSTANCE; i++) {
+   if (!src->dma_addr[i])
+   continue;
+   dst->dma_addr[i] = svm_range_copy_array(src->dma_addr[i],
+					sizeof(*src->dma_addr[i]), src->npages, 0);
+   if (!dst->dma_addr[i])
+   return -ENOMEM;
+   }
+
+   return 0;
+}
+
 static int
 svm_range_split_array(void *ppnew, void *ppold, size_t size,
  uint64_t old_start, uint64_t old_n,
@@ -865,22 +897,16 @@ svm_range_split_array(void *ppnew, void *ppold, size_t 
size,
if (!pold)
return 0;
 
-   new = kvmalloc_array(new_n, size, GFP_KERNEL);
+   d = (new_start - old_start) * size;
+   new = svm_range_copy_array(pold, size, new_n, d);
if (!new)
return -ENOMEM;
-
-   d = (new_start - old_start) * size;
-   memcpy(new, pold + d, new_n * s

[PATCH] drm/amdkfd: avoid unmap dma address when svm_ranges are split

2023-07-27 Thread Alex Sierra
DMA address references within svm_ranges should be unmapped only after
the memory has been released from the system. In case of range
splitting, the DMA address information should be copied to the
corresponding ranges after the split, leaving the DMA mapping intact.

Signed-off-by: Alex Sierra 
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 67 ++--
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h |  2 +-
 3 files changed, 52 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 709ac885ca6d..2586ac070190 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -461,7 +461,7 @@ svm_migrate_vma_to_vram(struct kfd_node *node, struct 
svm_range *prange,
0, node->id, trigger);
 
svm_range_dma_unmap(adev->dev, scratch, 0, npages);
-   svm_range_free_dma_mappings(prange);
+   svm_range_free_dma_mappings(prange, true);
 
 out_free:
kvfree(buf);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 1b50eae051a4..d1ff1c7e96d0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -241,7 +241,7 @@ void svm_range_dma_unmap(struct device *dev, dma_addr_t 
*dma_addr,
}
 }
 
-void svm_range_free_dma_mappings(struct svm_range *prange)
+void svm_range_free_dma_mappings(struct svm_range *prange, bool unmap_dma)
 {
struct kfd_process_device *pdd;
dma_addr_t *dma_addr;
@@ -262,7 +262,8 @@ void svm_range_free_dma_mappings(struct svm_range *prange)
continue;
}
 		dev = &pdd->dev->adev->pdev->dev;
-   svm_range_dma_unmap(dev, dma_addr, 0, prange->npages);
+   if (unmap_dma)
+   svm_range_dma_unmap(dev, dma_addr, 0, prange->npages);
kvfree(dma_addr);
prange->dma_addr[gpuidx] = NULL;
}
@@ -277,7 +278,7 @@ static void svm_range_free(struct svm_range *prange, bool 
update_mem_usage)
 prange->start, prange->last);
 
svm_range_vram_node_free(prange);
-   svm_range_free_dma_mappings(prange);
+   svm_range_free_dma_mappings(prange, update_mem_usage);
 
if (update_mem_usage && !p->xnack_enabled) {
pr_debug("unreserve prange 0x%p size: 0x%llx\n", prange, size);
@@ -851,12 +852,46 @@ static void svm_range_debug_dump(struct svm_range_list 
*svms)
}
 }
 
+static int
+svm_range_copy_array(void *ppdst, void *ppsrc, size_t size,
+uint64_t num_elements, uint64_t offset)
+{
+   unsigned char *dst, *psrc;
+
+   psrc = *(unsigned char **)ppsrc;
+   dst = kvmalloc_array(num_elements, size, GFP_KERNEL);
+   if (!dst)
+   return -ENOMEM;
+   memcpy(dst, psrc + offset, num_elements * size);
+   *(void **)ppdst = dst;
+
+   return 0;
+}
+
+static int
+svm_range_copy_dma_addrs(struct svm_range *dst, struct svm_range *src)
+{
+   int i, r;
+
+   for (i = 0; i < MAX_GPU_INSTANCE; i++) {
+   if (!src->dma_addr[i])
+   continue;
+		r = svm_range_copy_array(&dst->dma_addr[i], &src->dma_addr[i],
+					 sizeof(*src->dma_addr[i]),
+					 src->npages, 0);
+   if (r)
+   return r;
+   }
+
+   return 0;
+}
+
 static int
 svm_range_split_array(void *ppnew, void *ppold, size_t size,
  uint64_t old_start, uint64_t old_n,
  uint64_t new_start, uint64_t new_n)
 {
unsigned char *new, *old, *pold;
+   int r;
uint64_t d;
 
if (!ppold)
@@ -865,22 +900,16 @@ svm_range_split_array(void *ppnew, void *ppold, size_t 
size,
if (!pold)
return 0;
 
-   new = kvmalloc_array(new_n, size, GFP_KERNEL);
-   if (!new)
-   return -ENOMEM;
-
d = (new_start - old_start) * size;
-   memcpy(new, pold + d, new_n * size);
-
-   old = kvmalloc_array(old_n, size, GFP_KERNEL);
-   if (!old) {
+	r = svm_range_copy_array(&new, ppold, size, new_n, d);
+   if (r)
+   return r;
+   d = (new_start == old_start) ? new_n * size : 0;
+	r = svm_range_copy_array(&old, ppold, size, old_n, d);
+   if (r) {
kvfree(new);
-   return -ENOMEM;
+   return r;
}
-
-   d = (new_start == old_start) ? new_n * size : 0;
-   memcpy(old, pold + d, old_n * size);
-
kvfree(pold);
*(void **)ppold = old;
*(void **)ppnew = new;
@@ -2075,7 +2104,11 @@ svm_range_add(struct kfd_process *p, uint64_t start, 
uint64_t size,
r = -ENOMEM;
   

[PATCH] drm/amdkfd: avoid svm dump when dynamic debug disabled

2023-07-19 Thread Alex Sierra
Add a dynamic_svm_range_dump macro to avoid iterating over SVM lists in
svm_range_debug_dump when dynamic debug is disabled. Otherwise it could
hurt performance, especially with a large number of SVM ranges. Make
sure both the svm_range_set_attr and svm_range_debug_dump functions are
dynamically enabled before svm_range_debug_dump traces are printed.

Signed-off-by: Alex Sierra 
Tested-by: Alex Sierra 
Signed-off-by: Philip Yang 
Signed-off-by: Felix Kuehling 
---
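Note: conceptually, _dynamic_func_call_no_desc() registers a dynamic
debug descriptor and guards the wrapped call with it. A simplified sketch
of the behaviour (site_enabled() is illustrative, not a real kernel API):

	/* run the expensive list walk only if the "svm_range_dump" site
	 * has been switched on at runtime through dynamic debug
	 */
	if (site_enabled("svm_range_dump"))
		svm_range_debug_dump(svms);
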
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 479c4f66afa7..1b50eae051a4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -46,6 +46,8 @@
  * page table is updated.
  */
 #define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING   (2UL * NSEC_PER_MSEC)
+#define dynamic_svm_range_dump(svms) \
+   _dynamic_func_call_no_desc("svm_range_dump", svm_range_debug_dump, svms)
 
 /* Giant svm range split into smaller ranges based on this, it is decided using
  * minimum of all dGPU/APU 1/32 VRAM size, between 2MB to 1GB and alignment to
@@ -3563,7 +3565,7 @@ svm_range_set_attr(struct kfd_process *p, struct 
mm_struct *mm,
break;
}
 
-   svm_range_debug_dump(svms);
+   dynamic_svm_range_dump(svms);
 
 	mutex_unlock(&svms->lock);
mmap_read_unlock(mm);
-- 
2.32.0



[PATCH] drm/amdkfd: avoid svm dump when dynamic debug disabled

2023-07-19 Thread Alex Sierra
Add a dynamic_svm_range_dump macro to avoid iterating over SVM lists in
svm_range_debug_dump when dynamic debug is disabled. Otherwise it could
hurt performance, especially with a large number of SVM ranges. Make
sure both the svm_range_set_attr and svm_range_debug_dump functions are
dynamically enabled before svm_range_debug_dump traces are printed.

Signed-off-by: Alex Sierra 
Tested-by: Alex Sierra 
Signed-off-by: Philip Yang 
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 479c4f66afa7..0687f27f506c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -3563,7 +3563,7 @@ svm_range_set_attr(struct kfd_process *p, struct 
mm_struct *mm,
break;
}
 
-   svm_range_debug_dump(svms);
+   dynamic_svm_range_dump(svms);
 
 	mutex_unlock(&svms->lock);
mmap_read_unlock(mm);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index 21b14510882b..ed4cd501fafe 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -39,6 +39,9 @@
 #define SVM_ADEV_PGMAP_OWNER(adev)\
((adev)->hive ? (void *)(adev)->hive : (void *)(adev))
 
+#define dynamic_svm_range_dump(svms) \
+   _dynamic_func_call_no_desc("svm_range_dump", svm_range_debug_dump, svms)
+
 struct svm_range_bo {
struct amdgpu_bo*bo;
struct kref kref;
-- 
2.32.0



[PATCH] drm/amdkfd: avoid svm dump when dynamic debug disabled

2023-07-08 Thread Alex Sierra
svm_range_debug_dump should not be called at all when dynamic debug is
disabled, to avoid iterating over SVM lists. That iteration could hurt
performance, especially with a large number of SVM ranges.

Signed-off-by: Alex Sierra 
Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 479c4f66afa7..4fb427fc5942 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -821,7 +821,7 @@ svm_range_is_same_attrs(struct kfd_process *p, struct 
svm_range *prange,
  *
  * Context: The caller must hold svms->lock
  */
-static void svm_range_debug_dump(struct svm_range_list *svms)
+static int svm_range_debug_dump(struct svm_range_list *svms)
 {
struct interval_tree_node *node;
struct svm_range *prange;
@@ -847,6 +847,8 @@ static void svm_range_debug_dump(struct svm_range_list 
*svms)
 prange->actual_loc);
node = interval_tree_iter_next(node, 0, ~0ULL);
}
+
+   return 0;
 }
 
 static int
@@ -3563,7 +3565,7 @@ svm_range_set_attr(struct kfd_process *p, struct 
mm_struct *mm,
break;
}
 
-   svm_range_debug_dump(svms);
+   pr_debug("%d", svm_range_debug_dump(svms));
 
 	mutex_unlock(&svms->lock);
mmap_read_unlock(mm);
-- 
2.32.0



[PATCH] drm/amdkfd: set coherent host access capability flag

2023-06-15 Thread Alex Sierra
This flag indicates whether the host has coherent access to the
device's memory.

Signed-off-by: Alex Sierra 
---
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 90b86a6ac7bd..7ede3de4f7fb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -2107,6 +2107,10 @@ int kfd_topology_add_device(struct kfd_node *gpu)
if (KFD_IS_SVM_API_SUPPORTED(dev->gpu->adev))
dev->node_props.capability |= HSA_CAP_SVMAPI_SUPPORTED;
 
+	if (dev->gpu->adev->gmc.is_app_apu ||
+	    dev->gpu->adev->gmc.xgmi.connected_to_cpu)
+   dev->node_props.capability |= HSA_CAP_FLAGS_COHERENTHOSTACCESS;
+
kfd_debug_print_topology();
 
kfd_notify_gpu_change(gpu_id, 1);
-- 
2.32.0



[PATCH] drm/amdkfd: set coherent host access capability flag

2023-06-15 Thread Alex Sierra
This flag indicates whether the host has coherent access to the
device's memory.

Signed-off-by: Alex Sierra 
---
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 90b86a6ac7bd..296219be350d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -2107,6 +2107,9 @@ int kfd_topology_add_device(struct kfd_node *gpu)
if (KFD_IS_SVM_API_SUPPORTED(dev->gpu->adev))
dev->node_props.capability |= HSA_CAP_SVMAPI_SUPPORTED;
 
+   if (dev->gpu->adev->gmc.xgmi.connected_to_cpu)
+   dev->node_props.capability |= HSA_CAP_FLAGS_COHERENTHOSTACCESS;
+
kfd_debug_print_topology();
 
kfd_notify_gpu_change(gpu_id, 1);
-- 
2.32.0



[PATCH] drm/amdkfd: flag added to handle errors from svm validate and map

2023-05-29 Thread Alex Sierra
If an error is returned during validation and mapping of a prange, this
flag is set. It is a rare occurrence, but it can happen when
`amdgpu_hmm_range_get_pages_done` returns true. In such cases, the
caller should retry. However, it is important to ensure that the prange
is updated correctly during the retry.

Signed-off-by: Alex Sierra 
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 3 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index fcfde9140bce..910c0269598a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -823,7 +823,7 @@ svm_range_is_same_attrs(struct kfd_process *p, struct 
svm_range *prange,
}
}
 
-   return true;
+   return !prange->is_error_flag;
 }
 
 /**
@@ -1657,6 +1657,7 @@ static int svm_range_validate_and_map(struct mm_struct 
*mm,
 unreserve_out:
 	svm_range_unreserve_bos(&ctx);
 
+   prange->is_error_flag = !!r;
if (!r)
prange->validate_timestamp = ktime_get_boottime();
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index 7a33b93f9df6..b716d4bf7ee0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -133,6 +133,7 @@ struct svm_range {
DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE);
boolvalidated_once;
boolmapped_to_gpu;
+   boolis_error_flag;
 };
 
 static inline void svm_range_lock(struct svm_range *prange)
-- 
2.32.0



[PATCH] drm/amd: SVM flag error added at svm_range flags

2023-05-29 Thread Alex Sierra
If an error is returned during validation and mapping of a prange, this
flag is set. It is a rare occurrence, but it can happen when
`amdgpu_hmm_range_get_pages_done` returns true. In such cases, the
caller should retry. However, it is important to ensure that the prange
is updated correctly during the retry.

Signed-off-by: Alex Sierra 
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 12 +++-
 include/uapi/linux/kfd_ioctl.h   |  2 ++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index fcfde9140bce..96abae515bcf 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -756,10 +756,12 @@ svm_range_apply_attrs(struct kfd_process *p, struct 
svm_range *prange,
break;
case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
*update_mapping = true;
+   attrs[i].value &= ~KFD_IOCTL_SVM_FLAG_ERROR;
prange->flags |= attrs[i].value;
break;
case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
*update_mapping = true;
+   attrs[i].value &= ~KFD_IOCTL_SVM_FLAG_ERROR;
prange->flags &= ~attrs[i].value;
break;
case KFD_IOCTL_SVM_ATTR_GRANULARITY:
@@ -778,6 +780,9 @@ svm_range_is_same_attrs(struct kfd_process *p, struct 
svm_range *prange,
uint32_t i;
int gpuidx;
 
+   if (prange->flags & KFD_IOCTL_SVM_FLAG_ERROR)
+   return false;
+
for (i = 0; i < nattr; i++) {
switch (attrs[i].type) {
case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
@@ -1657,8 +1662,11 @@ static int svm_range_validate_and_map(struct mm_struct 
*mm,
 unreserve_out:
svm_range_unreserve_bos();
 
-   if (!r)
+   prange->flags |= KFD_IOCTL_SVM_FLAG_ERROR;
+   if (!r) {
+   prange->flags &= ~KFD_IOCTL_SVM_FLAG_ERROR;
prange->validate_timestamp = ktime_get_boottime();
+   }
 
return r;
 }
@@ -3674,9 +3682,11 @@ svm_range_get_attr(struct kfd_process *p, struct 
mm_struct *mm,
break;
case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
attrs[i].value = flags_and;
+   attrs[i].value &= ~KFD_IOCTL_SVM_FLAG_ERROR;
break;
case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
attrs[i].value = ~flags_or;
+   attrs[i].value &= ~KFD_IOCTL_SVM_FLAG_ERROR;
break;
case KFD_IOCTL_SVM_ATTR_GRANULARITY:
attrs[i].value = (uint32_t)granularity;
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 7e19a2d1e907..2b6f68bd06da 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -1125,6 +1125,8 @@ struct kfd_ioctl_cross_memory_copy_deprecated_args {
 #define KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY 0x0020
 /* Keep GPU memory mapping always valid as if XNACK is disable */
 #define KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED   0x0040
+/* Set during svm validation and map error */
+#define KFD_IOCTL_SVM_FLAG_ERROR   0x8000
 
 /**
  * kfd_ioctl_svm_op - SVM ioctl operations
-- 
2.32.0



[PATCH] drm/amdgpu: improve wait logic at fence polling

2023-04-26 Thread Alex Sierra
Accomplish this by reading the seq number right away instead of
sleeping for 5us first. There are certain cases where the fence is
ready almost immediately. The sleep granularity was also reduced, as
the majority of KIQ TLB flushes take between 2us and 6us.

Signed-off-by: Alex Sierra 
---
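Note: the (int32_t) cast matters here. Fence sequence numbers wrap
around, so a signed difference, not a direct comparison, decides whether
the fence is still pending. A small sketch (values illustrative):

	/* wait_seq = 0x00000002, last seq read = 0xfffffffe (pre-wrap):
	 * (int32_t)(0x00000002 - 0xfffffffe) == 4 > 0  ->  still pending
	 */
	static bool fence_pending(uint32_t wait_seq, uint32_t seq)
	{
		return (int32_t)(wait_seq - seq) > 0;
	}
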
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 11 ---
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index a7627cc0118d..9192896239e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -377,14 +377,11 @@ signed long amdgpu_fence_wait_polling(struct amdgpu_ring 
*ring,
  uint32_t wait_seq,
  signed long timeout)
 {
-   uint32_t seq;
-
-   do {
-   seq = amdgpu_fence_read(ring);
-   udelay(5);
-   timeout -= 5;
-   } while ((int32_t)(wait_seq - seq) > 0 && timeout > 0);
 
+	while ((int32_t)(wait_seq - amdgpu_fence_read(ring)) > 0 && timeout > 0) {
+   udelay(2);
+   timeout -= 2;
+   }
return timeout > 0 ? timeout : 0;
 }
 /**
-- 
2.32.0



[PATCH] drm/amdgpu: ensure no PCIe peer access for CPU XGMI iolinks

2022-08-29 Thread Alex Sierra
[Why] Devices with CPU XGMI iolink do not support PCIe peer access.

Signed-off-by: Alex Sierra 
Acked-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ce7d117efdb5..afaa1056e039 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5532,7 +5532,8 @@ bool amdgpu_device_is_peer_accessible(struct 
amdgpu_device *adev,
~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
resource_size_t aper_limit =
adev->gmc.aper_base + adev->gmc.aper_size - 1;
-   bool p2p_access = !(pci_p2pdma_distance_many(adev->pdev,
+   bool p2p_access = !adev->gmc.xgmi.connected_to_cpu &&
+ !(pci_p2pdma_distance_many(adev->pdev,
 					&peer_adev->dev, 1, true) < 0);
 
return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
-- 
2.32.0



[PATCH] drm/amdgpu: ensure no PCIe peer access for CPU XGMI iolinks

2022-08-26 Thread Alex Sierra
[Why] Devices with CPU XGMI iolink do not support PCIe peer access.

Signed-off-by: Alex Sierra 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ce7d117efdb5..1ff66718639d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5538,7 +5538,8 @@ bool amdgpu_device_is_peer_accessible(struct 
amdgpu_device *adev,
return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
!(adev->gmc.aper_base & address_mask ||
- aper_limit & address_mask));
+ aper_limit & address_mask) &&
+   !adev->gmc.xgmi.connected_to_cpu);
 #else
return false;
 #endif
-- 
2.32.0



[PATCH v9 00/14] Add MEMORY_DEVICE_COHERENT for coherent device memory mapping

2022-07-16 Thread Alex Sierra
This is our MEMORY_DEVICE_COHERENT patch series rebased and updated
for current 5.19.0-rc6

Changes since the last version:
- Fixed problems with migration during long-term pinning in
get_user_pages
- Open coded vm_normal_lru_pages as suggested in previous code review
- Update hmm_gup_test with more get_user_pages calls, include
hmm_cow_in_device in hmm-test.

This patch series introduces MEMORY_DEVICE_COHERENT, a type of memory
owned by a device that can be mapped into CPU page tables like
MEMORY_DEVICE_GENERIC and can also be migrated like
MEMORY_DEVICE_PRIVATE.

This patch series is mostly self-contained except for a few places where
it needs to update other subsystems to handle the new memory type.

System stability and performance are not affected according to our
ongoing testing, including xfstests.

How it works: The system BIOS advertises the GPU device memory
(aka VRAM) as SPM (special purpose memory) in the UEFI system address
map.

The amdgpu driver registers the memory with devmap as
MEMORY_DEVICE_COHERENT using devm_memremap_pages. The initial user for
this hardware page migration capability is the Frontier supercomputer
project. This functionality is not AMD-specific. We expect other GPU
vendors to find this functionality useful, and possibly other hardware
types in the future.
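
A minimal sketch of how a driver registers such memory, mirroring the
amdgpu patch later in this series (error handling omitted; aper_base and
aper_size stand in for the CPU-addressable VRAM window):

	struct dev_pagemap *pgmap = &kfddev->pgmap;

	pgmap->type        = MEMORY_DEVICE_COHERENT;
	pgmap->range.start = adev->gmc.aper_base;
	pgmap->range.end   = adev->gmc.aper_base + adev->gmc.aper_size - 1;
	pgmap->nr_range    = 1;
	pgmap->ops         = &svm_migrate_pgmap_ops;
	pgmap->owner       = SVM_ADEV_PGMAP_OWNER(adev);

	/* creates struct pages for the VRAM range */
	r = devm_memremap_pages(adev->dev, pgmap);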

Our test nodes in the lab are similar to the Frontier configuration,
with 0.5 TB of system memory plus 256 GB of device memory split across
4 GPUs, all in a single coherent address space. Page migration is
expected to improve application efficiency significantly. We will
report empirical results as they become available.

Coherent device type pages at gup are now migrated back to system
memory if they are being pinned long-term (FOLL_LONGTERM). The reason
is that long-term pinning would interfere with the device memory
manager owning the device-coherent pages (e.g. evictions in TTM).
This series incorporates Alistair Popple's patches to do this
migration from pin_user_pages() calls. hmm_gup_test has been added to
hmm-test to test different get_user_pages calls.

This series includes handling of device-managed anonymous pages
returned by vm_normal_pages. Although they behave like normal pages
for purposes of mapping in CPU page tables and for COW, they do not
support LRU lists, NUMA migration or THP.

We also introduced a FOLL_LRU flag that adds the same behaviour to
follow_page and related APIs, to allow callers to specify that they
expect to put pages on an LRU list.

v2:
- Rebase to latest 5.18-rc7.
- Drop patch "mm: add device coherent checker to remove migration pte"
and modify try_to_migrate_one, to let DEVICE_COHERENT pages fall
through to normal page path. Based on Alistair Popple's comment.
- Fix comment formatting.
- Reword comment in vm_normal_page about pte_devmap().
- Merge "drm/amdkfd: coherent type as sys mem on migration to ram" to
"drm/amdkfd: add SPM support for SVM".

v3:
- Rebase to latest 5.18.0.
- Patch "mm: handling Non-LRU pages returned by vm_normal_pages"
reordered.
- Add WARN_ON_ONCE for thp device coherent case.

v4:
- Rebase to latest 5.18.0
- Fix consistency between pages with the FOLL_LRU flag set and
pte_devmap at follow_page_pte.

v5:
- Remove unused zone_device_type from lib/test_hmm and
selftest/vm/hmm-test.c.

v6:
- Rebase to 5.19.0-rc4
- Rename is_pinnable_page to is_longterm_pinnable_page and add a
coherent device checker.
- Add a new gup test to hmm-test to cover fast pinnable case with
FOLL_LONGTERM.

v7:
- Reorder patch series.
- Remove FOLL_LRU and check on each caller for LRU pages handling
instead.

v8:
- Add "mm: move page zone helpers into new header-specific file"
patch. The intention is to centralize all page zone helpers and keep
them independent from mm.h and memremap.h.

v9:
- Rebase to 5.19.0-rc6
- Include latest Alistair's patch
"mm/gup: migrate device coherent pages when pinning instead of failing"
with changes based on David Hildenbrand comments.
- Replace moving page zone helpers into new header-specific file.
Instead, those were moved to mmzone.h.
Patch "mm: move page zone helpers from mm.h to mmzone.h"

Alex Sierra (13):
  mm: rename is_pinnable_pages to is_longterm_pinnable_pages
  mm: move page zone helpers from mm.h to mmzone.h
  mm: add zone device coherent type memory support
  mm: handling Non-LRU pages returned by vm_normal_pages
  mm: add device coherent vma selection for memory migration
  drm/amdkfd: add SPM support for SVM
  lib: test_hmm add ioctl to get zone device type
  lib: test_hmm add module param for zone device type
  lib: add support for device coherent type in test_hmm
  tools: update hmm-test to support device coherent type
  tools: update test_hmm script to support SP config
  tools: add hmm gup tests for device coherent type
  tools: add selftests to hmm for COW in device memory

Alistair Popple (1):
  mm/gup: migrate device coherent pages when pinning instead of failing

 

[PATCH v9 08/14] lib: test_hmm add ioctl to get zone device type

2022-07-16 Thread Alex Sierra
A new ioctl command is added to query the zone device type. This will
be used once test_hmm adds the zone device coherent type.

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
---
 lib/test_hmm.c  | 11 +--
 lib/test_hmm_uapi.h | 14 ++
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index cfe632047839..915ef6b5b0d4 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -87,6 +87,7 @@ struct dmirror_chunk {
 struct dmirror_device {
struct cdev cdevice;
struct hmm_devmem   *devmem;
+   unsigned intzone_device_type;
 
unsigned intdevmem_capacity;
unsigned intdevmem_count;
@@ -1260,14 +1261,20 @@ static void dmirror_device_remove(struct dmirror_device 
*mdevice)
 static int __init hmm_dmirror_init(void)
 {
int ret;
-   int id;
+   int id = 0;
+   int ndevices = 0;
 
 	ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES,
  "HMM_DMIRROR");
if (ret)
goto err_unreg;
 
-   for (id = 0; id < DMIRROR_NDEVICES; id++) {
+   memset(dmirror_devices, 0, DMIRROR_NDEVICES * 
sizeof(dmirror_devices[0]));
+   dmirror_devices[ndevices++].zone_device_type =
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
+   dmirror_devices[ndevices++].zone_device_type =
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
+   for (id = 0; id < ndevices; id++) {
ret = dmirror_device_init(dmirror_devices + id, id);
if (ret)
goto err_chrdev;
diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h
index f14dea5dcd06..0511af7464ee 100644
--- a/lib/test_hmm_uapi.h
+++ b/lib/test_hmm_uapi.h
@@ -31,10 +31,11 @@ struct hmm_dmirror_cmd {
 /* Expose the address space of the calling process through hmm device file */
 #define HMM_DMIRROR_READ   _IOWR('H', 0x00, struct hmm_dmirror_cmd)
 #define HMM_DMIRROR_WRITE  _IOWR('H', 0x01, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_MIGRATE_IOWR('H', 0x02, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_SNAPSHOT   _IOWR('H', 0x03, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_EXCLUSIVE  _IOWR('H', 0x04, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_CHECK_EXCLUSIVE_IOWR('H', 0x05, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_MIGRATE_TO_DEV _IOWR('H', 0x02, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_MIGRATE_TO_SYS _IOWR('H', 0x03, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_SNAPSHOT   _IOWR('H', 0x04, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_EXCLUSIVE  _IOWR('H', 0x05, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_CHECK_EXCLUSIVE_IOWR('H', 0x06, struct hmm_dmirror_cmd)
 
 /*
  * Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT.
@@ -62,4 +63,9 @@ enum {
HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE = 0x30,
 };
 
+enum {
+   /* 0 is reserved to catch uninitialized type fields */
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE = 1,
+};
+
 #endif /* _LIB_TEST_HMM_UAPI_H */
-- 
2.32.0



[PATCH] drm/amdkfd: track unified memory reservation with xnack off

2022-07-16 Thread Alex Sierra
[WHY]
Unified memory with XNACK off should be tracked, as userptr mappings
and legacy allocations are, to avoid oversubscribing system memory when
XNACK is off.
[HOW]
Expose the functions reserve_mem_limit and unreserve_mem_limit to the
SVM API and call them on every prange creation and free.

Signed-off-by: Alex Sierra 
---
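Note: a hedged sketch of the pairing this introduces on the SVM side
(simplified; size is in bytes):

	/* on prange creation with XNACK off: account like a userptr BO */
	r = amdgpu_amdkfd_reserve_mem_limit(NULL, size,
					    KFD_IOC_ALLOC_MEM_FLAGS_USERPTR);

	/* on prange free: release the same accounting */
	amdgpu_amdkfd_unreserve_mem_limit(NULL, size,
					  KFD_IOC_ALLOC_MEM_FLAGS_USERPTR);
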
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  4 ++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 23 ---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c  | 60 +--
 3 files changed, 60 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 73bf8b5f2aa9..83d955f0c52f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -305,6 +305,10 @@ bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device 
*adev, struct kgd_mem *
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
 int amdgpu_amdkfd_criu_resume(void *p);
 bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev);
+int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
+   uint64_t size, u32 alloc_flag);
+void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
+   uint64_t size, u32 alloc_flag);
 
 #if IS_ENABLED(CONFIG_HSA_AMD)
 void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 2bc36ff0aa0f..39d589394160 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -129,7 +129,7 @@ void amdgpu_amdkfd_reserve_system_mem(uint64_t size)
  *
  * Return: returns -ENOMEM in case of error, ZERO otherwise
  */
-static int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
+int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag)
 {
uint64_t reserved_for_pt =
@@ -169,7 +169,7 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
 kfd_mem_limit.max_system_mem_limit && !no_system_mem_limit) ||
(kfd_mem_limit.ttm_mem_used + ttm_mem_needed >
 kfd_mem_limit.max_ttm_mem_limit) ||
-   (adev->kfd.vram_used + vram_needed >
+   (adev && adev->kfd.vram_used + vram_needed >
 adev->gmc.real_vram_size -
 	     atomic64_read(&adev->vram_pin_size) -
 reserved_for_pt)) {
@@ -180,7 +180,10 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
/* Update memory accounting by decreasing available system
 * memory, TTM memory and GPU memory as computed above
 */
-   adev->kfd.vram_used += vram_needed;
+   WARN_ONCE(vram_needed && !adev,
+ "adev reference can't be null when vram is used");
+   if (adev)
+   adev->kfd.vram_used += vram_needed;
kfd_mem_limit.system_mem_used += system_mem_needed;
kfd_mem_limit.ttm_mem_used += ttm_mem_needed;
 
@@ -189,7 +192,7 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
return ret;
 }
 
-static void unreserve_mem_limit(struct amdgpu_device *adev,
+void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag)
 {
 	spin_lock(&kfd_mem_limit.mem_limit_lock);
@@ -198,7 +201,10 @@ static void unreserve_mem_limit(struct amdgpu_device *adev,
kfd_mem_limit.system_mem_used -= size;
kfd_mem_limit.ttm_mem_used -= size;
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
-   adev->kfd.vram_used -= ALIGN(size, VRAM_ALLOCATION_ALIGN);
+   WARN_ONCE(!adev,
+ "adev reference can't be null when alloc mem flags 
vram is set");
+   if (adev)
+			adev->kfd.vram_used -= ALIGN(size, VRAM_ALLOCATION_ALIGN);
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
kfd_mem_limit.system_mem_used -= size;
} else if (!(alloc_flag &
@@ -207,8 +213,7 @@ static void unreserve_mem_limit(struct amdgpu_device *adev,
pr_err("%s: Invalid BO type %#x\n", __func__, alloc_flag);
goto release;
}
-
-   WARN_ONCE(adev->kfd.vram_used < 0,
+   WARN_ONCE(adev && adev->kfd.vram_used < 0,
  "KFD VRAM memory accounting unbalanced");
WARN_ONCE(kfd_mem_limit.ttm_mem_used < 0,
  "KFD TTM memory accounting unbalanced");
@@ -225,7 +230,7 @@ void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo)
u32 alloc_flags = bo->kfd_bo->alloc_flags;
u64 size = amdgpu_bo_size(bo);
 
-   unreserve_mem_limit(adev, size, alloc_flags);
+   amdgpu_am

[PATCH v9 12/14] tools: update test_hmm script to support SP config

2022-07-16 Thread Alex Sierra
Add two more parameters to set the spm_addr_dev0 and spm_addr_dev1
addresses. These two parameters configure the start SP addresses for
each device in the test_hmm driver. Consequently, this configures the
zone device type as coherent.

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
---
 tools/testing/selftests/vm/test_hmm.sh | 24 +---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/vm/test_hmm.sh b/tools/testing/selftests/vm/test_hmm.sh
index 0647b525a625..539c9371e592 100755
--- a/tools/testing/selftests/vm/test_hmm.sh
+++ b/tools/testing/selftests/vm/test_hmm.sh
@@ -40,11 +40,26 @@ check_test_requirements()
 
 load_driver()
 {
-   modprobe $DRIVER > /dev/null 2>&1
+   if [ $# -eq 0 ]; then
+   modprobe $DRIVER > /dev/null 2>&1
+   else
+   if [ $# -eq 2 ]; then
+		modprobe $DRIVER spm_addr_dev0=$1 spm_addr_dev1=$2 > /dev/null 2>&1
+   else
+   echo "Missing module parameters. Make sure pass"\
+   "spm_addr_dev0 and spm_addr_dev1"
+   usage
+   fi
+   fi
if [ $? == 0 ]; then
major=$(awk "\$2==\"HMM_DMIRROR\" {print \$1}" /proc/devices)
mknod /dev/hmm_dmirror0 c $major 0
mknod /dev/hmm_dmirror1 c $major 1
+   if [ $# -eq 2 ]; then
+   mknod /dev/hmm_dmirror2 c $major 2
+   mknod /dev/hmm_dmirror3 c $major 3
+   fi
fi
 }
 
@@ -58,7 +73,7 @@ run_smoke()
 {
echo "Running smoke test. Note, this test provides basic coverage."
 
-   load_driver
+   load_driver $1 $2
$(dirname "${BASH_SOURCE[0]}")/hmm-tests
unload_driver
 }
@@ -75,6 +90,9 @@ usage()
echo "# Smoke testing"
echo "./${TEST_NAME}.sh smoke"
echo
+   echo "# Smoke testing with SPM enabled"
+   echo "./${TEST_NAME}.sh smoke  "
+   echo
exit 0
 }
 
@@ -84,7 +102,7 @@ function run_test()
usage
else
if [ "$1" = "smoke" ]; then
-   run_smoke
+   run_smoke $2 $3
else
usage
fi
-- 
2.32.0



[PATCH v9 07/14] drm/amdkfd: add SPM support for SVM

2022-07-16 Thread Alex Sierra
When the CPU is connected through XGMI, it has coherent access to the
VRAM resource. In this case that resource is taken from a table in the
device gmc aperture base. This resource is used along with the device
type, which can be DEVICE_PRIVATE or DEVICE_COHERENT, to create the
device page map region.
Also, the MIGRATE_VMA_SELECT_DEVICE_COHERENT flag is selected for the
coherent type case during migration to device.

Signed-off-by: Alex Sierra 
Reviewed-by: Felix Kuehling 
Signed-off-by: Christoph Hellwig 
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 34 +++-
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index e44376c2ecdc..f73e3e340413 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -671,13 +671,15 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct 
svm_range *prange,
migrate.vma = vma;
migrate.start = start;
migrate.end = end;
-   migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
migrate.pgmap_owner = SVM_ADEV_PGMAP_OWNER(adev);
+   if (adev->gmc.xgmi.connected_to_cpu)
+   migrate.flags = MIGRATE_VMA_SELECT_DEVICE_COHERENT;
+   else
+   migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
 
buf = kvcalloc(npages,
   2 * sizeof(*migrate.src) + sizeof(uint64_t) + 
sizeof(dma_addr_t),
   GFP_KERNEL);
-
if (!buf)
goto out;
 
@@ -947,7 +949,7 @@ int svm_migrate_init(struct amdgpu_device *adev)
 {
struct kfd_dev *kfddev = adev->kfd.dev;
struct dev_pagemap *pgmap;
-   struct resource *res;
+   struct resource *res = NULL;
unsigned long size;
void *r;
 
@@ -962,28 +964,34 @@ int svm_migrate_init(struct amdgpu_device *adev)
 * should remove reserved size
 */
size = ALIGN(adev->gmc.real_vram_size, 2ULL << 20);
-	res = devm_request_free_mem_region(adev->dev, &iomem_resource, size);
-   if (IS_ERR(res))
-   return -ENOMEM;
+   if (adev->gmc.xgmi.connected_to_cpu) {
+   pgmap->range.start = adev->gmc.aper_base;
+		pgmap->range.end = adev->gmc.aper_base + adev->gmc.aper_size - 1;
+   pgmap->type = MEMORY_DEVICE_COHERENT;
+   } else {
+		res = devm_request_free_mem_region(adev->dev, &iomem_resource, size);
+   if (IS_ERR(res))
+   return -ENOMEM;
+   pgmap->range.start = res->start;
+   pgmap->range.end = res->end;
+   pgmap->type = MEMORY_DEVICE_PRIVATE;
+   }
 
-   pgmap->type = MEMORY_DEVICE_PRIVATE;
pgmap->nr_range = 1;
-   pgmap->range.start = res->start;
-   pgmap->range.end = res->end;
 	pgmap->ops = &svm_migrate_pgmap_ops;
pgmap->owner = SVM_ADEV_PGMAP_OWNER(adev);
-   pgmap->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
-
+   pgmap->flags = 0;
/* Device manager releases device-specific resources, memory region and
 * pgmap when driver disconnects from device.
 */
r = devm_memremap_pages(adev->dev, pgmap);
if (IS_ERR(r)) {
pr_err("failed to register HMM device memory\n");
-
+		if (pgmap->type == MEMORY_DEVICE_PRIVATE)
+			devm_release_mem_region(adev->dev, res->start,
+						res->end - res->start + 1);
 		/* Disable SVM support capability */
 		pgmap->type = 0;
-		devm_release_mem_region(adev->dev, res->start, resource_size(res));
return PTR_ERR(r);
}
 
-- 
2.32.0



[PATCH v9 04/14] mm: handling Non-LRU pages returned by vm_normal_pages

2022-07-16 Thread Alex Sierra
With DEVICE_COHERENT, we'll soon have vm_normal_pages() return
device-managed anonymous pages that are not LRU pages. Although they
behave like normal pages for purposes of mapping in CPU page tables
and for COW, they do not support LRU lists, NUMA migration or THP.

Callers to follow_page() currently don't expect ZONE_DEVICE pages,
however, with DEVICE_COHERENT we might now return ZONE_DEVICE. Check
for ZONE_DEVICE pages in applicable users of follow_page() as well.
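
The caller-side pattern is the same everywhere: treat a ZONE_DEVICE
page like "no page" in paths that cannot handle it. As a sketch
(normal_page_or_null is a hypothetical wrapper, not part of the
patch):

static struct page *normal_page_or_null(struct vm_area_struct *vma,
					unsigned long addr, pte_t pte)
{
	struct page *page = vm_normal_page(vma, addr, pte);

	if (!page || is_zone_device_page(page))
		return NULL;
	return page;
}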

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling  (v2)
Reviewed-by: Alistair Popple  (v6)
---
 fs/proc/task_mmu.c |  2 +-
 mm/huge_memory.c   |  2 +-
 mm/khugepaged.c|  9 ++---
 mm/ksm.c   |  6 +++---
 mm/madvise.c   |  4 ++--
 mm/memory.c| 10 +-
 mm/mempolicy.c |  2 +-
 mm/migrate.c   |  4 ++--
 mm/mlock.c |  2 +-
 mm/mprotect.c  |  2 +-
 10 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 2d04e3470d4c..2dd8c8a66924 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1792,7 +1792,7 @@ static struct page *can_gather_numa_stats(pte_t pte, 
struct vm_area_struct *vma,
return NULL;
 
page = vm_normal_page(vma, addr, pte);
-   if (!page)
+   if (!page || is_zone_device_page(page))
return NULL;
 
if (PageReserved(page))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 834f288b3769..c47e95b02244 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2910,7 +2910,7 @@ static int split_huge_pages_pid(int pid, unsigned long 
vaddr_start,
 
if (IS_ERR(page))
continue;
-   if (!page)
+   if (!page || is_zone_device_page(page))
continue;
 
if (!is_transparent_hugepage(page))
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 16be62d493cd..671ac7800e53 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -618,7 +618,7 @@ static int __collapse_huge_page_isolate(struct 
vm_area_struct *vma,
goto out;
}
page = vm_normal_page(vma, address, pteval);
-   if (unlikely(!page)) {
+   if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out;
}
@@ -1267,7 +1267,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
writable = true;
 
page = vm_normal_page(vma, _address, pteval);
-   if (unlikely(!page)) {
+   if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out_unmap;
}
@@ -1479,7 +1479,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, 
unsigned long addr)
goto abort;
 
page = vm_normal_page(vma, addr, *pte);
-
+   if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+   page = NULL;
/*
 * Note that uprobe, debugger, or MAP_PRIVATE may change the
 * page table, but the new page will not be a subpage of hpage.
@@ -1497,6 +1498,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, 
unsigned long addr)
if (pte_none(*pte))
continue;
page = vm_normal_page(vma, addr, *pte);
+   if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+   goto abort;
page_remove_rmap(page, vma, false);
}
 
diff --git a/mm/ksm.c b/mm/ksm.c
index 54f78c9eecae..831b18a7a50b 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -475,7 +475,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned 
long addr)
cond_resched();
page = follow_page(vma, addr,
FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
-   if (IS_ERR_OR_NULL(page))
+   if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
break;
if (PageKsm(page))
ret = handle_mm_fault(vma, addr,
@@ -560,7 +560,7 @@ static struct page *get_mergeable_page(struct rmap_item 
*rmap_item)
goto out;
 
page = follow_page(vma, addr, FOLL_GET);
-   if (IS_ERR_OR_NULL(page))
+   if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
goto out;
if (PageAnon(page)) {
flush_anon_page(vma, page, addr);
@@ -2308,7 +2308,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct 
page **page)
if (ksm_test_exit(mm))
break;
*page = follow_page(vma, ksm_scan.address, FOLL_GET);
-   if (IS_ERR_OR_NULL(*page)) {
+   if (IS_ERR_OR_NULL(*page) || is_zone_device_page(*page)) {

[PATCH v9 14/14] tools: add selftests to hmm for COW in device memory

2022-07-16 Thread Alex Sierra
The objective is to test the device migration mechanism on pages
marked for COW, for both private and coherent device types. In case
of writing to COW private page(s), a page fault will migrate the
pages back to system memory first. Then, these pages will be
duplicated. In case of the COW device coherent type, pages are
duplicated directly from device memory.

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
---
 tools/testing/selftests/vm/hmm-tests.c | 80 ++
 1 file changed, 80 insertions(+)

diff --git a/tools/testing/selftests/vm/hmm-tests.c 
b/tools/testing/selftests/vm/hmm-tests.c
index bb38b9777610..716b62c05e3d 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -1874,4 +1874,84 @@ TEST_F(hmm, hmm_gup_test)
close(gup_fd);
hmm_buffer_free(buffer);
 }
+
+/*
+ * Test copy-on-write in device pages.
+ * In case of writing to COW private page(s), a page fault will migrate pages
+ * back to system memory first. Then, these pages will be duplicated. In case
+ * of COW device coherent type, pages are duplicated directly from device
+ * memory.
+ */
+TEST_F(hmm, hmm_cow_in_device)
+{
+   struct hmm_buffer *buffer;
+   unsigned long npages;
+   unsigned long size;
+   unsigned long i;
+   int *ptr;
+   int ret;
+   unsigned char *m;
+   pid_t pid;
+   int status;
+
+   npages = 4;
+   size = npages << self->page_shift;
+
+   buffer = malloc(sizeof(*buffer));
+   ASSERT_NE(buffer, NULL);
+
+   buffer->fd = -1;
+   buffer->size = size;
+   buffer->mirror = malloc(size);
+   ASSERT_NE(buffer->mirror, NULL);
+
+   buffer->ptr = mmap(NULL, size,
+  PROT_READ | PROT_WRITE,
+  MAP_PRIVATE | MAP_ANONYMOUS,
+  buffer->fd, 0);
+   ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+   /* Initialize buffer in system memory. */
+   for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+   ptr[i] = i;
+
+   /* Migrate memory to device. */
+
+   ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+   ASSERT_EQ(ret, 0);
+   ASSERT_EQ(buffer->cpages, npages);
+
+   pid = fork();
+   if (pid == -1)
+   ASSERT_EQ(pid, 0);
+   if (!pid) {
+   /* Child process waits for SIGTERM from the parent. */
+   while (1) {
+   }
+   perror("Should not reach this\n");
+   exit(0);
+   }
+   /* Parent process writes to COW pages(s) and gets a
+* new copy in system. In case of device private pages,
+* this write causes a migration to system mem first.
+*/
+   for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+   ptr[i] = i;
+
+   /* Terminate child and wait */
+   EXPECT_EQ(0, kill(pid, SIGTERM));
+   EXPECT_EQ(pid, waitpid(pid, &status, 0));
+   EXPECT_NE(0, WIFSIGNALED(status));
+   EXPECT_EQ(SIGTERM, WTERMSIG(status));
+
+   /* Take snapshot to CPU pagetables */
+   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+   ASSERT_EQ(ret, 0);
+   ASSERT_EQ(buffer->cpages, npages);
+   m = buffer->mirror;
+   for (i = 0; i < npages; i++)
+   ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[i]);
+
+   hmm_buffer_free(buffer);
+}
 TEST_HARNESS_MAIN
-- 
2.32.0



[PATCH v9 06/14] mm/gup: migrate device coherent pages when pinning instead of failing

2022-07-16 Thread Alex Sierra
From: Alistair Popple 

Currently any attempts to pin a device coherent page will fail. This is
because device coherent pages need to be managed by a device driver, and
pinning them would prevent a driver from migrating them off the device.

However this is no reason to fail pinning of these pages. They are
coherent and accessible from the CPU, so they can be migrated just
like ZONE_MOVABLE pages. So instead of failing all attempts to pin
them, first try migrating them out of ZONE_DEVICE.

[hch: rebased to the split device memory checks,
  moved migrate_device_page to migrate_device.c]
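
Distilled, the policy is as in this sketch; the helper name is
hypothetical and the real logic lives in
check_and_migrate_movable_pages() below:

static bool must_migrate_before_longterm_pin(struct folio *folio)
{
	/*
	 * Coherent pages are CPU-accessible but driver-managed: pinning
	 * them indefinitely would block the driver from evicting them,
	 * so they are first migrated to ordinary system memory.
	 */
	return folio_is_device_coherent(folio);
}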

Signed-off-by: Alistair Popple 
Acked-by: Felix Kuehling 
Signed-off-by: Christoph Hellwig 
---
 mm/gup.c| 50 +--
 mm/internal.h   |  1 +
 mm/migrate_device.c | 52 +
 3 files changed, 96 insertions(+), 7 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index b65fe8bf5af4..22b97ab61cd9 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1881,7 +1881,7 @@ static long check_and_migrate_movable_pages(unsigned long 
nr_pages,
unsigned long isolation_error_count = 0, i;
struct folio *prev_folio = NULL;
LIST_HEAD(movable_page_list);
-   bool drain_allow = true;
+   bool drain_allow = true, coherent_pages = false;
int ret = 0;
 
for (i = 0; i < nr_pages; i++) {
@@ -1891,9 +1891,38 @@ static long check_and_migrate_movable_pages(unsigned 
long nr_pages,
continue;
prev_folio = folio;
 
-   if (folio_is_longterm_pinnable(folio))
+   /*
+* Device coherent pages are managed by a driver and should not
+* be pinned indefinitely as it prevents the driver moving the
+* page. So when trying to pin with FOLL_LONGTERM instead try
+* to migrate the page out of device memory.
+*/
+   if (folio_is_device_coherent(folio)) {
+   /*
+* We always want a new GUP lookup with device coherent
+* pages.
+*/
+   pages[i] = 0;
+   coherent_pages = true;
+
+   /*
+* Migration will fail if the page is pinned, so convert
+* the pin on the source page to a normal reference.
+*/
+   if (gup_flags & FOLL_PIN) {
+   get_page(&folio->page);
+   unpin_user_page(&folio->page);
+   }
+
+   ret = migrate_device_coherent_page(&folio->page);
+   if (ret)
+   goto unpin_pages;
+
continue;
+   }
 
+   if (folio_is_longterm_pinnable(folio))
+   continue;
/*
 * Try to move out any movable page before pinning the range.
 */
@@ -1919,7 +1948,8 @@ static long check_and_migrate_movable_pages(unsigned long 
nr_pages,
folio_nr_pages(folio));
}
 
-   if (!list_empty(&movable_page_list) || isolation_error_count)
+   if (!list_empty(&movable_page_list) || isolation_error_count
+   || coherent_pages)
goto unpin_pages;
 
/*
@@ -1929,10 +1959,16 @@ static long check_and_migrate_movable_pages(unsigned 
long nr_pages,
return nr_pages;
 
 unpin_pages:
-   if (gup_flags & FOLL_PIN) {
-   unpin_user_pages(pages, nr_pages);
-   } else {
-   for (i = 0; i < nr_pages; i++)
+   /*
+* pages[i] might be NULL if any device coherent pages were found.
+*/
+   for (i = 0; i < nr_pages; i++) {
+   if (!pages[i])
+   continue;
+
+   if (gup_flags & FOLL_PIN)
+   unpin_user_page(pages[i]);
+   else
put_page(pages[i]);
}
 
diff --git a/mm/internal.h b/mm/internal.h
index c0f8fbe0445b..899dab512c5a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -853,6 +853,7 @@ int numa_migrate_prep(struct page *page, struct 
vm_area_struct *vma,
  unsigned long addr, int page_nid, int *flags);
 
 void free_zone_device_page(struct page *page);
+int migrate_device_coherent_page(struct page *page);
 
 /*
  * mm/gup.c
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 18bc6483f63a..7feeb447e3b9 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -686,6 +686,12 @@ void migrate_vma_pages(struct migrate_vma *migrate)
}
 
if (!page) {
+   /*
+* The only time there is no vma is when called from
+* migrate_device_coherent_page(). However this isn't
+* called if 

[PATCH v9 13/14] tools: add hmm gup tests for device coherent type

2022-07-16 Thread Alex Sierra
The intention is to test the hmm device coherent type under different
get_user_pages paths. Also, test gup with the FOLL_LONGTERM flag set
on device coherent pages. These pages should get migrated back to
system memory.

Signed-off-by: Alex Sierra 
Reviewed-by: Alistair Popple 
---
 tools/testing/selftests/vm/hmm-tests.c | 110 +
 1 file changed, 110 insertions(+)

diff --git a/tools/testing/selftests/vm/hmm-tests.c 
b/tools/testing/selftests/vm/hmm-tests.c
index 4b547188ec40..bb38b9777610 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -36,6 +36,7 @@
  * in the usual include/uapi/... directory.
  */
 #include "../../../../lib/test_hmm_uapi.h"
+#include "../../../../mm/gup_test.h"
 
 struct hmm_buffer {
void *ptr;
@@ -59,6 +60,9 @@ enum {
 #define NTIMES 10
 
 #define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1)))
+/* Just the flags we need, copied from mm.h: */
+#define FOLL_WRITE 0x01 /* check pte is writable */
+#define FOLL_LONGTERM  0x10000 /* mapping lifetime is indefinite */
 
 FIXTURE(hmm)
 {
@@ -1764,4 +1768,110 @@ TEST_F(hmm, exclusive_cow)
hmm_buffer_free(buffer);
 }
 
+static int gup_test_exec(int gup_fd, unsigned long addr, int cmd,
+int npages, int size, int flags)
+{
+   struct gup_test gup = {
+   .nr_pages_per_call  = npages,
+   .addr   = addr,
+   .gup_flags  = FOLL_WRITE | flags,
+   .size   = size,
+   };
+
+   if (ioctl(gup_fd, cmd, &gup)) {
+   perror("ioctl on error\n");
+   return errno;
+   }
+
+   return 0;
+}
+
+/*
+ * Test get user device pages through gup_test. Setting PIN_LONGTERM flag.
+ * This should trigger a migration back to system memory for both, private
+ * and coherent type pages.
+ * This test makes use of gup_test module. Make sure GUP_TEST_CONFIG is added
+ * to your configuration before you run it.
+ */
+TEST_F(hmm, hmm_gup_test)
+{
+   struct hmm_buffer *buffer;
+   int gup_fd;
+   unsigned long npages;
+   unsigned long size;
+   unsigned long i;
+   int *ptr;
+   int ret;
+   unsigned char *m;
+
+   gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
+   if (gup_fd == -1)
+   SKIP(return, "Skipping test, could not find gup_test driver");
+
+   npages = 4;
+   size = npages << self->page_shift;
+
+   buffer = malloc(sizeof(*buffer));
+   ASSERT_NE(buffer, NULL);
+
+   buffer->fd = -1;
+   buffer->size = size;
+   buffer->mirror = malloc(size);
+   ASSERT_NE(buffer->mirror, NULL);
+
+   buffer->ptr = mmap(NULL, size,
+  PROT_READ | PROT_WRITE,
+  MAP_PRIVATE | MAP_ANONYMOUS,
+  buffer->fd, 0);
+   ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+   /* Initialize buffer in system memory. */
+   for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+   ptr[i] = i;
+
+   /* Migrate memory to device. */
+   ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+   ASSERT_EQ(ret, 0);
+   ASSERT_EQ(buffer->cpages, npages);
+   /* Check what the device read. */
+   for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+   ASSERT_EQ(ptr[i], i);
+
+   ASSERT_EQ(gup_test_exec(gup_fd,
+   (unsigned long)buffer->ptr,
+   GUP_BASIC_TEST, 1, self->page_size, 0), 0);
+   ASSERT_EQ(gup_test_exec(gup_fd,
+   (unsigned long)buffer->ptr + 1 * 
self->page_size,
+   GUP_FAST_BENCHMARK, 1, self->page_size, 0), 0);
+   ASSERT_EQ(gup_test_exec(gup_fd,
+   (unsigned long)buffer->ptr + 2 * 
self->page_size,
+   PIN_FAST_BENCHMARK, 1, self->page_size, 
FOLL_LONGTERM), 0);
+   ASSERT_EQ(gup_test_exec(gup_fd,
+   (unsigned long)buffer->ptr + 3 * 
self->page_size,
+   PIN_LONGTERM_BENCHMARK, 1, self->page_size, 0), 
0);
+
+   /* Take snapshot to CPU pagetables */
+   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+   ASSERT_EQ(ret, 0);
+   ASSERT_EQ(buffer->cpages, npages);
+   m = buffer->mirror;
+   if (hmm_is_coherent_type(variant->device_number)) {
+   ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | 
HMM_DMIRROR_PROT_WRITE, m[0]);
+   ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | 
HMM_DMIRROR_PROT_WRITE, m[1]);
+   } else {
+   ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[0]);
+   ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[1]);
+ 

[PATCH v9 09/14] lib: test_hmm add module param for zone device type

2022-07-16 Thread Alex Sierra
In order to configure device coherent memory in test_hmm, two module
parameters must be passed, corresponding to the SPM start address of
each of the two devices: spm_addr_dev0 & spm_addr_dev1. If no
parameters are passed, the private device type is configured.
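
For example, assuming two SPM carveouts of at least DEVMEM_CHUNK_SIZE
at 0x100000000 and 0x140000000 (the addresses are illustrative only):

	modprobe test_hmm spm_addr_dev0=0x100000000 spm_addr_dev1=0x140000000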

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
---
 lib/test_hmm.c  | 73 -
 lib/test_hmm_uapi.h |  1 +
 2 files changed, 53 insertions(+), 21 deletions(-)

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 915ef6b5b0d4..afb30af9f3ff 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -37,6 +37,16 @@
 #define DEVMEM_CHUNK_SIZE  (256 * 1024 * 1024U)
 #define DEVMEM_CHUNKS_RESERVE  16
 
+static unsigned long spm_addr_dev0;
+module_param(spm_addr_dev0, long, 0644);
+MODULE_PARM_DESC(spm_addr_dev0,
+   "Specify start address for SPM (special purpose memory) used 
for device 0. By setting this Coherent device type will be used. Make sure 
spm_addr_dev1 is set too. Minimum SPM size should be DEVMEM_CHUNK_SIZE.");
+
+static unsigned long spm_addr_dev1;
+module_param(spm_addr_dev1, long, 0644);
+MODULE_PARM_DESC(spm_addr_dev1,
+   "Specify start address for SPM (special purpose memory) used 
for device 1. By setting this Coherent device type will be used. Make sure 
spm_addr_dev0 is set too. Minimum SPM size should be DEVMEM_CHUNK_SIZE.");
+
 static const struct dev_pagemap_ops dmirror_devmem_ops;
 static const struct mmu_interval_notifier_ops dmirror_min_ops;
 static dev_t dmirror_dev;
@@ -455,28 +465,44 @@ static int dmirror_write(struct dmirror *dmirror, struct 
hmm_dmirror_cmd *cmd)
return ret;
 }
 
-static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
+static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
   struct page **ppage)
 {
struct dmirror_chunk *devmem;
-   struct resource *res;
+   struct resource *res = NULL;
unsigned long pfn;
unsigned long pfn_first;
unsigned long pfn_last;
void *ptr;
+   int ret = -ENOMEM;
 
devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
if (!devmem)
-   return false;
+   return ret;
 
-   res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
- "hmm_dmirror");
-   if (IS_ERR(res))
+   switch (mdevice->zone_device_type) {
+   case HMM_DMIRROR_MEMORY_DEVICE_PRIVATE:
+   res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
+ "hmm_dmirror");
+   if (IS_ERR_OR_NULL(res))
+   goto err_devmem;
+   devmem->pagemap.range.start = res->start;
+   devmem->pagemap.range.end = res->end;
+   devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
+   break;
+   case HMM_DMIRROR_MEMORY_DEVICE_COHERENT:
+   devmem->pagemap.range.start = (MINOR(mdevice->cdevice.dev) - 2) 
?
+   spm_addr_dev0 :
+   spm_addr_dev1;
+   devmem->pagemap.range.end = devmem->pagemap.range.start +
+   DEVMEM_CHUNK_SIZE - 1;
+   devmem->pagemap.type = MEMORY_DEVICE_COHERENT;
+   break;
+   default:
+   ret = -EINVAL;
goto err_devmem;
+   }
 
-   devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
-   devmem->pagemap.range.start = res->start;
-   devmem->pagemap.range.end = res->end;
devmem->pagemap.nr_range = 1;
devmem->pagemap.ops = &dmirror_devmem_ops;
devmem->pagemap.owner = mdevice;
@@ -497,10 +523,14 @@ static bool dmirror_allocate_chunk(struct dmirror_device 
*mdevice,
mdevice->devmem_capacity = new_capacity;
mdevice->devmem_chunks = new_chunks;
}
-
ptr = memremap_pages(&devmem->pagemap, numa_node_id());
-   if (IS_ERR(ptr))
+   if (IS_ERR_OR_NULL(ptr)) {
+   if (ptr)
+   ret = PTR_ERR(ptr);
+   else
+   ret = -EFAULT;
goto err_release;
+   }
 
devmem->mdevice = mdevice;
pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT;
@@ -529,15 +559,17 @@ static bool dmirror_allocate_chunk(struct dmirror_device 
*mdevice,
}
spin_unlock(&mdevice->lock);
 
-   return true;
+   return 0;
 
 err_release:
mutex_unlock(>devmem_lock);
-   release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range));
+   if (res && devmem->pagemap.type == MEMORY_DEVICE_PRIVATE)
+   release_mem_region(devmem->pagemap.range.start,
+  range_len(&devmem->pagemap.range));

[PATCH v9 10/14] lib: add support for device coherent type in test_hmm

2022-07-16 Thread Alex Sierra
Device coherent type uses device memory that is coherently accessible
by the CPU. This could be shown as an SP (special purpose) memory
range in the BIOS-e820 memory enumeration. If no SP memory is
supported by the system, it can be faked by setting
CONFIG_EFI_FAKE_MEMMAP.

Currently, test_hmm only supports two different SP ranges of at least
256MB size. These can be specified in the kernel parameter variable
efi_fake_mem. Ex. Two SP ranges of 1GB starting at physical addresses
0x100000000 & 0x140000000:
efi_fake_mem=1G@0x100000000:0x40000,1G@0x140000000:0x40000

Private and coherent device mirror instances can be created in the
same probe. This is done by passing the module parameters
spm_addr_dev0 & spm_addr_dev1. In this case, it will create four
instances of device_mirror. The first two correspond to private
device type, the last two to coherent type. Then, they can be easily
accessed from user space through /dev/hmm_dmirror<num_device>.
Usually num_device 0 and 1 are for private, and 2 and 3 for coherent
types. If no module parameters are passed, only two instances of
private type device_mirror will be created.
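
A user space program selects a mirror instance by device number as
described above; a minimal sketch (open_dmirror is a hypothetical
helper modeled on hmm_open() in the selftests):

#include <fcntl.h>
#include <stdio.h>

static int open_dmirror(int num_device)
{
	char path[64];

	/* 0/1: device private mirrors, 2/3: device coherent mirrors */
	snprintf(path, sizeof(path), "/dev/hmm_dmirror%d", num_device);
	return open(path, O_RDWR, 0);
}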

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
---
 lib/test_hmm.c  | 253 +---
 lib/test_hmm_uapi.h |   4 +
 2 files changed, 196 insertions(+), 61 deletions(-)

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index afb30af9f3ff..7930853e7fc5 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -32,11 +32,22 @@
 
 #include "test_hmm_uapi.h"
 
-#define DMIRROR_NDEVICES   2
+#define DMIRROR_NDEVICES   4
#define DMIRROR_RANGE_FAULT_TIMEOUT 1000
 #define DEVMEM_CHUNK_SIZE  (256 * 1024 * 1024U)
 #define DEVMEM_CHUNKS_RESERVE  16
 
+/*
+ * For device_private pages, dpage is just a dummy struct page
+ * representing a piece of device memory. dmirror_devmem_alloc_page
+ * allocates a real system memory page as backing storage to fake a
+ * real device. zone_device_data points to that backing page. But
+ * for device_coherent memory, the struct page represents real
+ * physical CPU-accessible memory that we can use directly.
+ */
+#define BACKING_PAGE(page) (is_device_private_page((page)) ? \
+  (page)->zone_device_data : (page))
+
 static unsigned long spm_addr_dev0;
 module_param(spm_addr_dev0, long, 0644);
 MODULE_PARM_DESC(spm_addr_dev0,
@@ -125,6 +136,21 @@ static int dmirror_bounce_init(struct dmirror_bounce 
*bounce,
return 0;
 }
 
+static bool dmirror_is_private_zone(struct dmirror_device *mdevice)
+{
+   return (mdevice->zone_device_type ==
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? true : false;
+}
+
+static enum migrate_vma_direction
+dmirror_select_device(struct dmirror *dmirror)
+{
+   return (dmirror->mdevice->zone_device_type ==
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ?
+   MIGRATE_VMA_SELECT_DEVICE_PRIVATE :
+   MIGRATE_VMA_SELECT_DEVICE_COHERENT;
+}
+
 static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
 {
vfree(bounce->ptr);
@@ -575,16 +601,19 @@ static int dmirror_allocate_chunk(struct dmirror_device 
*mdevice,
 static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
 {
struct page *dpage = NULL;
-   struct page *rpage;
+   struct page *rpage = NULL;
 
/*
-* This is a fake device so we alloc real system memory to store
-* our device memory.
+* For ZONE_DEVICE private type, this is a fake device so we allocate
+* real system memory to store our device memory.
+* For ZONE_DEVICE coherent type we use the actual dpage to store the
+* data and ignore rpage.
 */
-   rpage = alloc_page(GFP_HIGHUSER);
-   if (!rpage)
-   return NULL;
-
+   if (dmirror_is_private_zone(mdevice)) {
+   rpage = alloc_page(GFP_HIGHUSER);
+   if (!rpage)
+   return NULL;
+   }
spin_lock(&mdevice->lock);
 
if (mdevice->free_pages) {
@@ -603,7 +632,8 @@ static struct page *dmirror_devmem_alloc_page(struct 
dmirror_device *mdevice)
return dpage;
 
 error:
-   __free_page(rpage);
+   if (rpage)
+   __free_page(rpage);
return NULL;
 }
 
@@ -629,12 +659,16 @@ static void dmirror_migrate_alloc_and_copy(struct 
migrate_vma *args,
 * unallocated pte_none() or read-only zero page.
 */
spage = migrate_pfn_to_page(*src);
+   if (WARN(spage && is_zone_device_page(spage),
+"page already in device spage pfn: 0x%lx\n",
+page_to_pfn(spage)))
+   continue;
 
dpage = dmirror_devmem_alloc_page(mdevice);
if (!dpage)
continue;
 
-   rpage = dpage->zone_device_data;

[PATCH v9 03/14] mm: add zone device coherent type memory support

2022-07-16 Thread Alex Sierra
Device memory that is cache coherent from device and CPU point of view.
This is used on platforms that have an advanced system bus (like CAPI
or CXL). Any page of a process can be migrated to such memory. However,
no one should be allowed to pin such memory so that it can always be
evicted.
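
A driver publishes such memory by registering a coherent-type pagemap.
A minimal sketch, assuming the driver already owns the physical range
and supplies its own dev_pagemap_ops (cf. the test_hmm and amdkfd
patches later in this series):

static void *register_coherent_pagemap(struct device *dev,
				       struct dev_pagemap *pgmap,
				       u64 start, u64 end,
				       const struct dev_pagemap_ops *ops)
{
	pgmap->type = MEMORY_DEVICE_COHERENT;
	pgmap->range.start = start;
	pgmap->range.end = end;
	pgmap->nr_range = 1;
	pgmap->ops = ops;
	pgmap->owner = dev;	/* any driver-specific owner token */
	return devm_memremap_pages(dev, pgmap);
}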

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
[hch: rebased ontop of the refcount changes,
  removed is_dev_private_or_coherent_page]
Signed-off-by: Christoph Hellwig 
Acked-by: David Hildenbrand 
---
 include/linux/memremap.h | 19 +++
 include/linux/mm.h   |  5 -
 mm/memcontrol.c  |  7 ---
 mm/memory-failure.c  |  8 ++--
 mm/memremap.c| 10 ++
 mm/migrate_device.c  | 16 +++-
 mm/rmap.c|  5 +++--
 7 files changed, 53 insertions(+), 17 deletions(-)

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 77229165c914..f27b142fd3d0 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -41,6 +41,13 @@ struct vmem_altmap {
  * A more complete discussion of unaddressable memory may be found in
  * include/linux/hmm.h and Documentation/vm/hmm.rst.
  *
+ * MEMORY_DEVICE_COHERENT:
+ * Device memory that is cache coherent from device and CPU point of view. This
+ * is used on platforms that have an advanced system bus (like CAPI or CXL). A
+ * driver can hotplug the device memory using ZONE_DEVICE and with that memory
+ * type. Any page of a process can be migrated to such memory. However no one
+ * should be allowed to pin such memory so that it can always be evicted.
+ *
  * MEMORY_DEVICE_FS_DAX:
  * Host memory that has similar access semantics as System RAM i.e. DMA
  * coherent and supports page pinning. In support of coordinating page
@@ -61,6 +68,7 @@ struct vmem_altmap {
 enum memory_type {
/* 0 is reserved to catch uninitialized type fields */
MEMORY_DEVICE_PRIVATE = 1,
+   MEMORY_DEVICE_COHERENT,
MEMORY_DEVICE_FS_DAX,
MEMORY_DEVICE_GENERIC,
MEMORY_DEVICE_PCI_P2PDMA,
@@ -150,6 +158,17 @@ static inline bool is_pci_p2pdma_page(const struct page 
*page)
page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
 }
 
+static inline bool is_device_coherent_page(const struct page *page)
+{
+   return is_zone_device_page(page) &&
+   page->pgmap->type == MEMORY_DEVICE_COHERENT;
+}
+
+static inline bool folio_is_device_coherent(const struct folio *folio)
+{
+   return is_device_coherent_page(&folio->page);
+}
+
 #ifdef CONFIG_ZONE_DEVICE
 void *memremap_pages(struct dev_pagemap *pgmap, int nid);
 void memunmap_pages(struct dev_pagemap *pgmap);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2df8c2b98d36..3ed101dfbfab 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -28,6 +28,7 @@
 #include <linux/sched.h>
 #include <linux/pgtable.h>
 #include <linux/kasan.h>
+#include <linux/memremap.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -1522,7 +1523,9 @@ static inline bool is_longterm_pinnable_page(struct page 
*page)
if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE)
return false;
 #endif
-   return !is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page));
+   return !(is_device_coherent_page(page) ||
+is_zone_movable_page(page) ||
+is_zero_pfn(page_to_pfn(page)));
 }
 #else
 static inline bool is_longterm_pinnable_page(struct page *page)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 618c366a2f07..5d37a85c67da 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5665,8 +5665,8 @@ static int mem_cgroup_move_account(struct page *page,
  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
  * target for charge migration. if @target is not NULL, the entry is stored
  * in target->ent.
- *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is 
MEMORY_DEVICE_PRIVATE
- * (so ZONE_DEVICE page and thus not on the lru).
+ *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is device memory and
+ *   thus not on the lru.
  * For now we such page is charge like a regular page would be as for all
  * intent and purposes it is just special memory taking the place of a
  * regular page.
@@ -5704,7 +5704,8 @@ static enum mc_target_type get_mctgt_type(struct 
vm_area_struct *vma,
 */
if (page_memcg(page) == mc.from) {
ret = MC_TARGET_PAGE;
-   if (is_device_private_page(page))
+   if (is_device_private_page(page) ||
+   is_device_coherent_page(page))
ret = MC_TARGET_DEVICE;
if (target)
target->page = page;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index da39ec8afca8..79f175eeb190 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1685,12 +1685,16 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,

[PATCH v9 11/14] tools: update hmm-test to support device coherent type

2022-07-16 Thread Alex Sierra
Test cases such as migrate_fault and migrate_multiple were modified to
migrate explicitly from device to system memory, without the need for
page faults, when using the device coherent type.

The snapshot test case was updated to read the memory device type
first and, based on that, check for the proper returned results. A
migrate_ping_pong test case was added to test explicit migration from
device to system memory for both private and coherent zone types.

Helpers to migrate from device to system memory and vice versa
were also added.
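
The intended round trip, using the helpers added below (sketch only,
error checks omitted):

	hmm_migrate_sys_to_dev(fd, buffer, npages);	/* system -> device */
	hmm_migrate_dev_to_sys(fd, buffer, npages);	/* device -> system */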

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
---
 tools/testing/selftests/vm/hmm-tests.c | 121 -
 1 file changed, 100 insertions(+), 21 deletions(-)

diff --git a/tools/testing/selftests/vm/hmm-tests.c 
b/tools/testing/selftests/vm/hmm-tests.c
index 203323967b50..4b547188ec40 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -46,6 +46,13 @@ struct hmm_buffer {
uint64_t faults;
 };
 
+enum {
+   HMM_PRIVATE_DEVICE_ONE,
+   HMM_PRIVATE_DEVICE_TWO,
+   HMM_COHERENCE_DEVICE_ONE,
+   HMM_COHERENCE_DEVICE_TWO,
+};
+
 #define TWOMEG (1 << 21)
 #define HMM_BUFFER_SIZE (1024 << 12)
#define HMM_PATH_MAX 64
@@ -60,6 +67,21 @@ FIXTURE(hmm)
unsigned int page_shift;
 };
 
+FIXTURE_VARIANT(hmm)
+{
+   int device_number;
+};
+
+FIXTURE_VARIANT_ADD(hmm, hmm_device_private)
+{
+   .device_number = HMM_PRIVATE_DEVICE_ONE,
+};
+
+FIXTURE_VARIANT_ADD(hmm, hmm_device_coherent)
+{
+   .device_number = HMM_COHERENCE_DEVICE_ONE,
+};
+
 FIXTURE(hmm2)
 {
int fd0;
@@ -68,6 +90,24 @@ FIXTURE(hmm2)
unsigned int page_shift;
 };
 
+FIXTURE_VARIANT(hmm2)
+{
+   int device_number0;
+   int device_number1;
+};
+
+FIXTURE_VARIANT_ADD(hmm2, hmm2_device_private)
+{
+   .device_number0 = HMM_PRIVATE_DEVICE_ONE,
+   .device_number1 = HMM_PRIVATE_DEVICE_TWO,
+};
+
+FIXTURE_VARIANT_ADD(hmm2, hmm2_device_coherent)
+{
+   .device_number0 = HMM_COHERENCE_DEVICE_ONE,
+   .device_number1 = HMM_COHERENCE_DEVICE_TWO,
+};
+
 static int hmm_open(int unit)
 {
char pathname[HMM_PATH_MAX];
@@ -81,12 +121,19 @@ static int hmm_open(int unit)
return fd;
 }
 
+static bool hmm_is_coherent_type(int dev_num)
+{
+   return (dev_num >= HMM_COHERENCE_DEVICE_ONE);
+}
+
 FIXTURE_SETUP(hmm)
 {
self->page_size = sysconf(_SC_PAGE_SIZE);
self->page_shift = ffs(self->page_size) - 1;
 
-   self->fd = hmm_open(0);
+   self->fd = hmm_open(variant->device_number);
+   if (self->fd < 0 && hmm_is_coherent_type(variant->device_number))
+   SKIP(exit(0), "DEVICE_COHERENT not available");
ASSERT_GE(self->fd, 0);
 }
 
@@ -95,9 +142,11 @@ FIXTURE_SETUP(hmm2)
self->page_size = sysconf(_SC_PAGE_SIZE);
self->page_shift = ffs(self->page_size) - 1;
 
-   self->fd0 = hmm_open(0);
+   self->fd0 = hmm_open(variant->device_number0);
+   if (self->fd0 < 0 && hmm_is_coherent_type(variant->device_number0))
+   SKIP(exit(0), "DEVICE_COHERENT not available");
ASSERT_GE(self->fd0, 0);
-   self->fd1 = hmm_open(1);
+   self->fd1 = hmm_open(variant->device_number1);
ASSERT_GE(self->fd1, 0);
 }
 
@@ -211,6 +260,20 @@ static void hmm_nanosleep(unsigned int n)
nanosleep(&t, NULL);
 }
 
+static int hmm_migrate_sys_to_dev(int fd,
+  struct hmm_buffer *buffer,
+  unsigned long npages)
+{
+   return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_DEV, buffer, npages);
+}
+
+static int hmm_migrate_dev_to_sys(int fd,
+  struct hmm_buffer *buffer,
+  unsigned long npages)
+{
+   return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_SYS, buffer, npages);
+}
+
 /*
  * Simple NULL test of device open/close.
  */
@@ -875,7 +938,7 @@ TEST_F(hmm, migrate)
ptr[i] = i;
 
/* Migrate memory to device. */
-   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);
+   ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
ASSERT_EQ(ret, 0);
ASSERT_EQ(buffer->cpages, npages);
 
@@ -923,7 +986,7 @@ TEST_F(hmm, migrate_fault)
ptr[i] = i;
 
/* Migrate memory to device. */
-   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);
+   ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
ASSERT_EQ(ret, 0);
ASSERT_EQ(buffer->cpages, npages);
 
@@ -936,7 +999,7 @@ TEST_F(hmm, migrate_fault)
ASSERT_EQ(ptr[i], i);
 
/* Migrate memory to the device again. */
-   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);

[PATCH v9 01/14] mm: rename is_pinnable_pages to is_longterm_pinnable_pages

2022-07-16 Thread Alex Sierra
is_pinnable_page() and folio_is_pinnable() were renamed to
is_longterm_pinnable_page() and folio_is_longterm_pinnable()
respectively. These functions are used in the FOLL_LONGTERM flag
context.

Signed-off-by: Alex Sierra 
Reviewed-by: David Hildenbrand 
---
 include/linux/mm.h | 8 
 mm/gup.c   | 4 ++--
 mm/gup_test.c  | 2 +-
 mm/hugetlb.c   | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index cf3d0d673f6b..3b31b33bd5be 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1592,7 +1592,7 @@ static inline bool page_needs_cow_for_dma(struct 
vm_area_struct *vma,
 
 /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */
 #ifdef CONFIG_MIGRATION
-static inline bool is_pinnable_page(struct page *page)
+static inline bool is_longterm_pinnable_page(struct page *page)
 {
 #ifdef CONFIG_CMA
int mt = get_pageblock_migratetype(page);
@@ -1603,15 +1603,15 @@ static inline bool is_pinnable_page(struct page *page)
return !is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page));
 }
 #else
-static inline bool is_pinnable_page(struct page *page)
+static inline bool is_longterm_pinnable_page(struct page *page)
 {
return true;
 }
 #endif
 
-static inline bool folio_is_pinnable(struct folio *folio)
+static inline bool folio_is_longterm_pinnable(struct folio *folio)
 {
-   return is_pinnable_page(&folio->page);
+   return is_longterm_pinnable_page(&folio->page);
 }
 
 static inline void set_page_zone(struct page *page, enum zone_type zone)
diff --git a/mm/gup.c b/mm/gup.c
index 551264407624..b65fe8bf5af4 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -133,7 +133,7 @@ struct folio *try_grab_folio(struct page *page, int refs, 
unsigned int flags)
 * path.
 */
if (unlikely((flags & FOLL_LONGTERM) &&
-!is_pinnable_page(page)))
+!is_longterm_pinnable_page(page)))
return NULL;
 
/*
@@ -1891,7 +1891,7 @@ static long check_and_migrate_movable_pages(unsigned long 
nr_pages,
continue;
prev_folio = folio;
 
-   if (folio_is_pinnable(folio))
+   if (folio_is_longterm_pinnable(folio))
continue;
 
/*
diff --git a/mm/gup_test.c b/mm/gup_test.c
index d974dec19e1c..12b0a91767d3 100644
--- a/mm/gup_test.c
+++ b/mm/gup_test.c
@@ -53,7 +53,7 @@ static void verify_dma_pinned(unsigned int cmd, struct page 
**pages,
dump_page(page, "gup_test failure");
break;
} else if (cmd == PIN_LONGTERM_BENCHMARK &&
-   WARN(!is_pinnable_page(page),
+   WARN(!is_longterm_pinnable_page(page),
 "pages[%lu] is NOT pinnable but pinned\n",
 i)) {
dump_page(page, "gup_test failure");
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a57e1be41401..368fd33787b0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1135,7 +1135,7 @@ static struct page *dequeue_huge_page_node_exact(struct 
hstate *h, int nid)
 
lockdep_assert_held(_lock);
list_for_each_entry(page, >hugepage_freelists[nid], lru) {
-   if (pin && !is_pinnable_page(page))
+   if (pin && !is_longterm_pinnable_page(page))
continue;
 
if (PageHWPoison(page))
-- 
2.32.0



[PATCH v9 02/14] mm: move page zone helpers from mm.h to mmzone.h

2022-07-16 Thread Alex Sierra
[WHY]
It makes more sense to have these helpers in a zone-specific header
file rather than in the generic mm.h.

Signed-off-by: Alex Sierra 
---
 include/linux/memremap.h |  2 +-
 include/linux/mm.h   | 78 ---
 include/linux/mmzone.h   | 80 
 3 files changed, 81 insertions(+), 79 deletions(-)

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 8af304f6b504..77229165c914 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -2,7 +2,7 @@
 #ifndef _LINUX_MEMREMAP_H_
 #define _LINUX_MEMREMAP_H_
 
-#include <linux/mm.h>
+#include <linux/mmzone.h>
 #include <linux/range.h>
 #include <linux/ioport.h>
 #include <linux/percpu-refcount.h>
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3b31b33bd5be..2df8c2b98d36 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1049,84 +1049,6 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
  *   back into memory.
  */
 
-/*
- * The zone field is never updated after free_area_init_core()
- * sets it, so none of the operations on it need to be atomic.
- */
-
-/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */
-#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
-#define NODES_PGOFF    (SECTIONS_PGOFF - NODES_WIDTH)
-#define ZONES_PGOFF    (NODES_PGOFF - ZONES_WIDTH)
-#define LAST_CPUPID_PGOFF  (ZONES_PGOFF - LAST_CPUPID_WIDTH)
-#define KASAN_TAG_PGOFF(LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
-
-/*
- * Define the bit shifts to access each section.  For non-existent
- * sections we define the shift as 0; that plus a 0 mask ensures
- * the compiler will optimise away reference to them.
- */
-#define SECTIONS_PGSHIFT   (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
-#define NODES_PGSHIFT  (NODES_PGOFF * (NODES_WIDTH != 0))
-#define ZONES_PGSHIFT  (ZONES_PGOFF * (ZONES_WIDTH != 0))
-#define LAST_CPUPID_PGSHIFT(LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0))
-#define KASAN_TAG_PGSHIFT  (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0))
-
-/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
-#ifdef NODE_NOT_IN_PAGE_FLAGS
-#define ZONEID_SHIFT   (SECTIONS_SHIFT + ZONES_SHIFT)
-#define ZONEID_PGOFF   ((SECTIONS_PGOFF < ZONES_PGOFF)? \
-   SECTIONS_PGOFF : ZONES_PGOFF)
-#else
-#define ZONEID_SHIFT   (NODES_SHIFT + ZONES_SHIFT)
-#define ZONEID_PGOFF   ((NODES_PGOFF < ZONES_PGOFF)? \
-   NODES_PGOFF : ZONES_PGOFF)
-#endif
-
-#define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0))
-
-#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1)
-#define NODES_MASK ((1UL << NODES_WIDTH) - 1)
-#define SECTIONS_MASK  ((1UL << SECTIONS_WIDTH) - 1)
-#define LAST_CPUPID_MASK   ((1UL << LAST_CPUPID_SHIFT) - 1)
-#define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1)
-#define ZONEID_MASK((1UL << ZONEID_SHIFT) - 1)
-
-static inline enum zone_type page_zonenum(const struct page *page)
-{
-   ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT);
-   return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
-}
-
-static inline enum zone_type folio_zonenum(const struct folio *folio)
-{
-   return page_zonenum(&folio->page);
-}
-
-#ifdef CONFIG_ZONE_DEVICE
-static inline bool is_zone_device_page(const struct page *page)
-{
-   return page_zonenum(page) == ZONE_DEVICE;
-}
-extern void memmap_init_zone_device(struct zone *, unsigned long,
-   unsigned long, struct dev_pagemap *);
-#else
-static inline bool is_zone_device_page(const struct page *page)
-{
-   return false;
-}
-#endif
-
-static inline bool folio_is_zone_device(const struct folio *folio)
-{
-   return is_zone_device_page(&folio->page);
-}
-
-static inline bool is_zone_movable_page(const struct page *page)
-{
-   return page_zonenum(page) == ZONE_MOVABLE;
-}
-
 #if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
 DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index aab70355d64f..47fc41f43c48 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -730,6 +730,86 @@ static inline bool zone_is_empty(struct zone *zone)
return zone->spanned_pages == 0;
 }
 
+#ifndef BUILD_VDSO32_64
+/*
+ * The zone field is never updated after free_area_init_core()
+ * sets it, so none of the operations on it need to be atomic.
+ */
+
+/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */
+#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
+#define NODES_PGOFF    (SECTIONS_PGOFF - NODES_WIDTH)
+#define ZONES_PGOFF    (NODES_PGOFF - ZONES_WIDTH)
+#define LAST_CPUPID_PGOFF  (ZONES_PGOFF - LAST_CPUPID_WIDTH)
+#define KASAN_TAG_PGOFF    (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)

[PATCH v9 05/14] mm: add device coherent vma selection for memory migration

2022-07-16 Thread Alex Sierra
This case is used to migrate pages from device memory back to system
memory. Device coherent type memory is cache coherent from both the
device and CPU points of view.
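
On the driver side, this is requested through the migrate_vma flags.
A sketch of a device-to-system setup (the src/dst arrays and all
values other than the flag are placeholders):

	struct migrate_vma args = {
		.vma		= vma,
		.start		= start,
		.end		= end,
		.pgmap_owner	= owner,
		.flags		= MIGRATE_VMA_SELECT_DEVICE_COHERENT,
	};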

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
Reviewed-by: David Hildenbrand 
---
 include/linux/migrate.h |  1 +
 mm/migrate_device.c | 12 +---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 069a89e847f3..b84908debe5c 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -148,6 +148,7 @@ static inline unsigned long migrate_pfn(unsigned long pfn)
 enum migrate_vma_direction {
MIGRATE_VMA_SELECT_SYSTEM = 1 << 0,
MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1,
+   MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2,
 };
 
 struct migrate_vma {
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index a4847ad65da3..18bc6483f63a 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -148,15 +148,21 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
if (is_writable_device_private_entry(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {
-   if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
-   goto next;
pfn = pte_pfn(pte);
-   if (is_zero_pfn(pfn)) {
+   if (is_zero_pfn(pfn) &&
+   (migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
mpfn = MIGRATE_PFN_MIGRATE;
migrate->cpages++;
goto next;
}
page = vm_normal_page(migrate->vma, addr, pte);
+   if (page && !is_zone_device_page(page) &&
+   !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
+   goto next;
+   else if (page && is_device_coherent_page(page) &&
+   (!(migrate->flags & 
MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
+page->pgmap->owner != migrate->pgmap_owner))
+   goto next;
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
-- 
2.32.0



[PATCH] mm: move page zone helpers from mm.h to mmzone.h

2022-07-14 Thread Alex Sierra
[WHY]
It makes more sense to have these helpers in a zone-specific header
file rather than in the generic mm.h.

Signed-off-by: Alex Sierra 
---
 include/linux/memremap.h |  2 +-
 include/linux/mm.h   | 78 ---
 include/linux/mmzone.h   | 80 
 3 files changed, 81 insertions(+), 79 deletions(-)

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 8af304f6b504..77229165c914 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -2,7 +2,7 @@
 #ifndef _LINUX_MEMREMAP_H_
 #define _LINUX_MEMREMAP_H_
 
-#include <linux/mm.h>
+#include <linux/mmzone.h>
 #include <linux/range.h>
 #include <linux/ioport.h>
 #include <linux/percpu-refcount.h>
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3b31b33bd5be..2df8c2b98d36 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1049,84 +1049,6 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
  *   back into memory.
  */
 
-/*
- * The zone field is never updated after free_area_init_core()
- * sets it, so none of the operations on it need to be atomic.
- */
-
-/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */
-#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
-#define NODES_PGOFF    (SECTIONS_PGOFF - NODES_WIDTH)
-#define ZONES_PGOFF    (NODES_PGOFF - ZONES_WIDTH)
-#define LAST_CPUPID_PGOFF  (ZONES_PGOFF - LAST_CPUPID_WIDTH)
-#define KASAN_TAG_PGOFF(LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
-
-/*
- * Define the bit shifts to access each section.  For non-existent
- * sections we define the shift as 0; that plus a 0 mask ensures
- * the compiler will optimise away reference to them.
- */
-#define SECTIONS_PGSHIFT   (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
-#define NODES_PGSHIFT  (NODES_PGOFF * (NODES_WIDTH != 0))
-#define ZONES_PGSHIFT  (ZONES_PGOFF * (ZONES_WIDTH != 0))
-#define LAST_CPUPID_PGSHIFT(LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0))
-#define KASAN_TAG_PGSHIFT  (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0))
-
-/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
-#ifdef NODE_NOT_IN_PAGE_FLAGS
-#define ZONEID_SHIFT   (SECTIONS_SHIFT + ZONES_SHIFT)
-#define ZONEID_PGOFF   ((SECTIONS_PGOFF < ZONES_PGOFF)? \
-   SECTIONS_PGOFF : ZONES_PGOFF)
-#else
-#define ZONEID_SHIFT   (NODES_SHIFT + ZONES_SHIFT)
-#define ZONEID_PGOFF   ((NODES_PGOFF < ZONES_PGOFF)? \
-   NODES_PGOFF : ZONES_PGOFF)
-#endif
-
-#define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0))
-
-#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1)
-#define NODES_MASK ((1UL << NODES_WIDTH) - 1)
-#define SECTIONS_MASK  ((1UL << SECTIONS_WIDTH) - 1)
-#define LAST_CPUPID_MASK   ((1UL << LAST_CPUPID_SHIFT) - 1)
-#define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1)
-#define ZONEID_MASK((1UL << ZONEID_SHIFT) - 1)
-
-static inline enum zone_type page_zonenum(const struct page *page)
-{
-   ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT);
-   return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
-}
-
-static inline enum zone_type folio_zonenum(const struct folio *folio)
-{
-   return page_zonenum(&folio->page);
-}
-
-#ifdef CONFIG_ZONE_DEVICE
-static inline bool is_zone_device_page(const struct page *page)
-{
-   return page_zonenum(page) == ZONE_DEVICE;
-}
-extern void memmap_init_zone_device(struct zone *, unsigned long,
-   unsigned long, struct dev_pagemap *);
-#else
-static inline bool is_zone_device_page(const struct page *page)
-{
-   return false;
-}
-#endif
-
-static inline bool folio_is_zone_device(const struct folio *folio)
-{
-   return is_zone_device_page(&folio->page);
-}
-
-static inline bool is_zone_movable_page(const struct page *page)
-{
-   return page_zonenum(page) == ZONE_MOVABLE;
-}
-
 #if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
 DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index aab70355d64f..47fc41f43c48 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -730,6 +730,86 @@ static inline bool zone_is_empty(struct zone *zone)
return zone->spanned_pages == 0;
 }
 
+#ifndef BUILD_VDSO32_64
+/*
+ * The zone field is never updated after free_area_init_core()
+ * sets it, so none of the operations on it need to be atomic.
+ */
+
+/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */
+#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
+#define NODES_PGOFF    (SECTIONS_PGOFF - NODES_WIDTH)
+#define ZONES_PGOFF    (NODES_PGOFF - ZONES_WIDTH)
+#define LAST_CPUPID_PGOFF  (ZONES_PGOFF - LAST_CPUPID_WIDTH)
+#define KASAN_TAG_PGOFF    (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)

[PATCH 3/3] drm/amdgpu: add debugfs for kfd system and ttm mem used

2022-07-11 Thread Alex Sierra
This keeps track of the kfd system memory used and the kfd TTM
memory used.
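
Once the driver is loaded, the counters can be read from debugfs
(the path assumes the standard kfd debugfs root):

	cat /sys/kernel/debug/kfd/mem_limit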

Signed-off-by: Alex Sierra 
Reviewed-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  3 +++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 19 +++
 drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c  |  2 ++
 3 files changed, 24 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 83d955f0c52f..3c09dcc0986e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -172,6 +172,9 @@ int amdgpu_queue_mask_bit_to_set_resource_bit(struct 
amdgpu_device *adev,
 struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context,
struct mm_struct *mm,
struct svm_range_bo *svm_bo);
+#if defined(CONFIG_DEBUG_FS)
+int kfd_debugfs_kfd_mem_limits(struct seq_file *m, void *data);
+#endif
 #if IS_ENABLED(CONFIG_HSA_AMD)
 bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm);
 struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 7480e7333e5d..8946e80fecfb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2920,3 +2920,22 @@ bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device 
*adev, struct kgd_mem *
}
return false;
 }
+
+#if defined(CONFIG_DEBUG_FS)
+
+int kfd_debugfs_kfd_mem_limits(struct seq_file *m, void *data)
+{
+
+   spin_lock(&kfd_mem_limit.mem_limit_lock);
+   seq_printf(m, "System mem used %lldM out of %lluM\n",
+ (kfd_mem_limit.system_mem_used >> 20),
+ (kfd_mem_limit.max_system_mem_limit >> 20));
+   seq_printf(m, "TTM mem used %lldM out of %lluM\n",
+ (kfd_mem_limit.ttm_mem_used >> 20),
+ (kfd_mem_limit.max_ttm_mem_limit >> 20));
+   spin_unlock(&kfd_mem_limit.mem_limit_lock);
+
+   return 0;
+}
+
+#endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
index 581c3a30fee1..ad5a40a685ac 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
@@ -101,6 +101,8 @@ void kfd_debugfs_init(void)
kfd_debugfs_rls_by_device, &kfd_debugfs_fops);
debugfs_create_file("hang_hws", S_IFREG | 0200, debugfs_root,
kfd_debugfs_hang_hws_read, &kfd_debugfs_hang_hws_fops);
+   debugfs_create_file("mem_limit", S_IFREG | 0200, debugfs_root,
+   kfd_debugfs_kfd_mem_limits, &kfd_debugfs_fops);
 }
 
 void kfd_debugfs_fini(void)
-- 
2.32.0



[PATCH 2/3] drm/amdkfd: track unified memory reservation with xnack off

2022-07-11 Thread Alex Sierra
[WHY]
Unified memory with xnack off should be tracked, as userptr mappings
and legacy allocations are, to avoid oversubscribing system memory
when xnack is off.
[How]
Expose the functions reserve_mem_limit and unreserve_mem_limit to the
SVM API and call them on every prange creation and free.

Signed-off-by: Alex Sierra 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  4 ++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 25 
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c  | 60 +--
 3 files changed, 60 insertions(+), 29 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 73bf8b5f2aa9..83d955f0c52f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -305,6 +305,10 @@ bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device 
*adev, struct kgd_mem *
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
 int amdgpu_amdkfd_criu_resume(void *p);
 bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev);
+int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
+   uint64_t size, u32 alloc_flag);
+void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
+   uint64_t size, u32 alloc_flag);
 
 #if IS_ENABLED(CONFIG_HSA_AMD)
 void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 2bc36ff0aa0f..7480e7333e5d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -129,7 +129,7 @@ void amdgpu_amdkfd_reserve_system_mem(uint64_t size)
  *
  * Return: returns -ENOMEM in case of error, ZERO otherwise
  */
-static int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
+int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag)
 {
uint64_t reserved_for_pt =
@@ -169,7 +169,7 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
 kfd_mem_limit.max_system_mem_limit && !no_system_mem_limit) ||
(kfd_mem_limit.ttm_mem_used + ttm_mem_needed >
 kfd_mem_limit.max_ttm_mem_limit) ||
-   (adev->kfd.vram_used + vram_needed >
+   (adev && adev->kfd.vram_used + vram_needed >
 adev->gmc.real_vram_size -
 atomic64_read(>vram_pin_size) -
 reserved_for_pt)) {
@@ -180,7 +180,10 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
/* Update memory accounting by decreasing available system
 * memory, TTM memory and GPU memory as computed above
 */
-   adev->kfd.vram_used += vram_needed;
+   WARN_ONCE(vram_needed && !adev,
+ "adev reference can't be null when vram is used");
+   if (adev)
+   adev->kfd.vram_used += vram_needed;
kfd_mem_limit.system_mem_used += system_mem_needed;
kfd_mem_limit.ttm_mem_used += ttm_mem_needed;
 
@@ -189,7 +192,7 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
return ret;
 }
 
-static void unreserve_mem_limit(struct amdgpu_device *adev,
+void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag)
 {
spin_lock(&kfd_mem_limit.mem_limit_lock);
@@ -198,7 +201,10 @@ static void unreserve_mem_limit(struct amdgpu_device *adev,
kfd_mem_limit.system_mem_used -= size;
kfd_mem_limit.ttm_mem_used -= size;
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
-   adev->kfd.vram_used -= ALIGN(size, VRAM_ALLOCATION_ALIGN);
+   WARN_ONCE(!adev,
+ "adev reference can't be null when alloc mem flags 
vram is set");
+   if (adev)
+   adev->kfd.vram_used -= ALIGN(size, 
VRAM_ALLOCATION_ALIGN);
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
kfd_mem_limit.system_mem_used -= size;
} else if (!(alloc_flag &
@@ -207,11 +213,8 @@ static void unreserve_mem_limit(struct amdgpu_device *adev,
pr_err("%s: Invalid BO type %#x\n", __func__, alloc_flag);
goto release;
}
-
-   WARN_ONCE(adev->kfd.vram_used < 0,
+   WARN_ONCE(adev && adev->kfd.vram_used < 0,
  "KFD VRAM memory accounting unbalanced");
-   WARN_ONCE(kfd_mem_limit.ttm_mem_used < 0,
- "KFD TTM memory accounting unbalanced");
WARN_ONCE(kfd_mem_limit.system_mem_used < 0,
  "KFD system memory accounting unbalanced");
 
@@ -225,7 +228,7 @@ void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo)
u32 alloc_flags = bo->kfd_bo->alloc_flags;

[PATCH 1/3] drm/amdgpu: remove acc_size from reserve/unreserve mem

2022-07-11 Thread Alex Sierra
TTM used to track the "acc_size" of all BOs internally. We needed to
keep track of it in our memory reservation to avoid TTM running out
of memory in its own accounting. However, that "acc_size" accounting
has since been removed from TTM. Therefore we don't really need to
track it any more.

Signed-off-by: Alex Sierra 
Reviewed-by: Philip Yang 
Reviewed-by: Felix Kuehling 
Acked-by: Christian König 
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 60 ++-
 1 file changed, 17 insertions(+), 43 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 4effee12a4ac..2bc36ff0aa0f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -115,21 +115,12 @@ void amdgpu_amdkfd_reserve_system_mem(uint64_t size)
  * compromise that should work in most cases without reserving too
  * much memory for page tables unnecessarily (factor 16K, >> 14).
  */
-#define ESTIMATE_PT_SIZE(mem_size) max(((mem_size) >> 14), 
AMDGPU_VM_RESERVED_VRAM)
-
-static size_t amdgpu_amdkfd_acc_size(uint64_t size)
-{
-   size >>= PAGE_SHIFT;
-   size *= sizeof(dma_addr_t) + sizeof(void *);
 
-   return __roundup_pow_of_two(sizeof(struct amdgpu_bo)) +
-   __roundup_pow_of_two(sizeof(struct ttm_tt)) +
-   PAGE_ALIGN(size);
-}
+#define ESTIMATE_PT_SIZE(mem_size) max(((mem_size) >> 14), 
AMDGPU_VM_RESERVED_VRAM)
 
 /**
  * amdgpu_amdkfd_reserve_mem_limit() - Decrease available memory by size
- * of buffer including any reserved for control structures
+ * of buffer.
  *
  * @adev: Device to which allocated BO belongs to
  * @size: Size of buffer, in bytes, encapsulated by B0. This should be
@@ -143,19 +134,16 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
 {
uint64_t reserved_for_pt =
ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
-   size_t acc_size, system_mem_needed, ttm_mem_needed, vram_needed;
+   size_t system_mem_needed, ttm_mem_needed, vram_needed;
int ret = 0;
 
-   acc_size = amdgpu_amdkfd_acc_size(size);
-
+   system_mem_needed = 0;
+   ttm_mem_needed = 0;
vram_needed = 0;
if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
-   system_mem_needed = acc_size + size;
-   ttm_mem_needed = acc_size + size;
+   system_mem_needed = size;
+   ttm_mem_needed = size;
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
-   system_mem_needed = acc_size;
-   ttm_mem_needed = acc_size;
-
/*
 * Conservatively round up the allocation requirement to 2 MB
 * to avoid fragmentation caused by 4K allocations in the tail
@@ -163,14 +151,10 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
 */
vram_needed = ALIGN(size, VRAM_ALLOCATION_ALIGN);
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
-   system_mem_needed = acc_size + size;
-   ttm_mem_needed = acc_size;
-   } else if (alloc_flag &
-  (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
-   KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) {
-   system_mem_needed = acc_size;
-   ttm_mem_needed = acc_size;
-   } else {
+   system_mem_needed = size;
+   } else if (!(alloc_flag &
+   (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
+KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))) {
pr_err("%s: Invalid BO type %#x\n", __func__, alloc_flag);
return -ENOMEM;
}
@@ -208,28 +192,18 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
 static void unreserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag)
 {
-   size_t acc_size;
-
-   acc_size = amdgpu_amdkfd_acc_size(size);
-
	spin_lock(&kfd_mem_limit.mem_limit_lock);
 
if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
-   kfd_mem_limit.system_mem_used -= (acc_size + size);
-   kfd_mem_limit.ttm_mem_used -= (acc_size + size);
+   kfd_mem_limit.system_mem_used -= size;
+   kfd_mem_limit.ttm_mem_used -= size;
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
-   kfd_mem_limit.system_mem_used -= acc_size;
-   kfd_mem_limit.ttm_mem_used -= acc_size;
adev->kfd.vram_used -= ALIGN(size, VRAM_ALLOCATION_ALIGN);
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
-   kfd_mem_limit.system_mem_used -= (acc_size + size);
-   kfd_mem_limit.ttm_mem_used -= acc_size;
-   } else if (alloc_flag &
-   

[PATCH v8 15/15] tools: add selftests to hmm for COW in device memory

2022-07-07 Thread Alex Sierra
The objective is to test the device migration mechanism on pages marked
as COW, for both private and coherent device types. In case of writing
to COW private page(s), a page fault will migrate the pages back to
system memory first. Then, these pages will be duplicated. In case of
the COW device coherent type, pages are duplicated directly from device
memory.
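
The essential pattern under test, trimmed down (a sketch of the
selftest below assuming the usual <unistd.h>/<signal.h> headers and a
MAP_PRIVATE buffer already migrated to the device; it is not additional
test code):

  static void cow_write_parent(int *buf)
  {
  	pid_t pid = fork();

  	if (pid == 0)			/* child: hold a reference */
  		for (;;)
  			pause();
  	buf[0] = 1;			/* parent write: COW fault; private
  					 * device pages migrate back to
  					 * system RAM before duplication */
  	kill(pid, SIGTERM);
  }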

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
---
 tools/testing/selftests/vm/hmm-tests.c | 80 ++
 1 file changed, 80 insertions(+)

diff --git a/tools/testing/selftests/vm/hmm-tests.c 
b/tools/testing/selftests/vm/hmm-tests.c
index bb38b9777610..716b62c05e3d 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -1874,4 +1874,84 @@ TEST_F(hmm, hmm_gup_test)
close(gup_fd);
hmm_buffer_free(buffer);
 }
+
+/*
+ * Test copy-on-write in device pages.
+ * In case of writing to COW private page(s), a page fault will migrate pages
+ * back to system memory first. Then, these pages will be duplicated. In case
+ * of COW device coherent type, pages are duplicated directly from device
+ * memory.
+ */
+TEST_F(hmm, hmm_cow_in_device)
+{
+   struct hmm_buffer *buffer;
+   unsigned long npages;
+   unsigned long size;
+   unsigned long i;
+   int *ptr;
+   int ret;
+   unsigned char *m;
+   pid_t pid;
+   int status;
+
+   npages = 4;
+   size = npages << self->page_shift;
+
+   buffer = malloc(sizeof(*buffer));
+   ASSERT_NE(buffer, NULL);
+
+   buffer->fd = -1;
+   buffer->size = size;
+   buffer->mirror = malloc(size);
+   ASSERT_NE(buffer->mirror, NULL);
+
+   buffer->ptr = mmap(NULL, size,
+  PROT_READ | PROT_WRITE,
+  MAP_PRIVATE | MAP_ANONYMOUS,
+  buffer->fd, 0);
+   ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+   /* Initialize buffer in system memory. */
+   for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+   ptr[i] = i;
+
+   /* Migrate memory to device. */
+
+   ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+   ASSERT_EQ(ret, 0);
+   ASSERT_EQ(buffer->cpages, npages);
+
+   pid = fork();
+   if (pid == -1)
+   ASSERT_EQ(pid, 0);
+   if (!pid) {
+   /* Child process waits for SIGTERM from the parent. */
+   while (1) {
+   }
+   perror("Should not reach this\n");
+   exit(0);
+   }
+   /* Parent process writes to COW pages(s) and gets a
+* new copy in system. In case of device private pages,
+* this write causes a migration to system mem first.
+*/
+   for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+   ptr[i] = i;
+
+   /* Terminate child and wait */
+   EXPECT_EQ(0, kill(pid, SIGTERM));
+   EXPECT_EQ(pid, waitpid(pid, &status, 0));
+   EXPECT_NE(0, WIFSIGNALED(status));
+   EXPECT_EQ(SIGTERM, WTERMSIG(status));
+
+   /* Take snapshot to CPU pagetables */
+   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+   ASSERT_EQ(ret, 0);
+   ASSERT_EQ(buffer->cpages, npages);
+   m = buffer->mirror;
+   for (i = 0; i < npages; i++)
+   ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[i]);
+
+   hmm_buffer_free(buffer);
+}
 TEST_HARNESS_MAIN
-- 
2.32.0



[PATCH v8 12/15] tools: update hmm-test to support device coherent type

2022-07-07 Thread Alex Sierra
Test cases such as migrate_fault and migrate_multiple were modified to
explicitly migrate from device to system memory, without the need for
page faults, when using the device coherent type.

The snapshot test case was updated to read the memory device type first
and, based on that, check for the proper returned results. A
migrate_ping_pong test case was added to test explicit migration from
device to system memory for both private and coherent zone types.

Helpers to migrate from device to system memory and vice versa
were also added.

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
---
 tools/testing/selftests/vm/hmm-tests.c | 121 -
 1 file changed, 100 insertions(+), 21 deletions(-)

diff --git a/tools/testing/selftests/vm/hmm-tests.c 
b/tools/testing/selftests/vm/hmm-tests.c
index 203323967b50..4b547188ec40 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -46,6 +46,13 @@ struct hmm_buffer {
uint64_tfaults;
 };
 
+enum {
+   HMM_PRIVATE_DEVICE_ONE,
+   HMM_PRIVATE_DEVICE_TWO,
+   HMM_COHERENCE_DEVICE_ONE,
+   HMM_COHERENCE_DEVICE_TWO,
+};
+
 #define TWOMEG (1 << 21)
 #define HMM_BUFFER_SIZE (1024 << 12)
 #define HMM_PATH_MAX64
@@ -60,6 +67,21 @@ FIXTURE(hmm)
unsigned intpage_shift;
 };
 
+FIXTURE_VARIANT(hmm)
+{
+   int device_number;
+};
+
+FIXTURE_VARIANT_ADD(hmm, hmm_device_private)
+{
+   .device_number = HMM_PRIVATE_DEVICE_ONE,
+};
+
+FIXTURE_VARIANT_ADD(hmm, hmm_device_coherent)
+{
+   .device_number = HMM_COHERENCE_DEVICE_ONE,
+};
+
 FIXTURE(hmm2)
 {
int fd0;
@@ -68,6 +90,24 @@ FIXTURE(hmm2)
unsigned intpage_shift;
 };
 
+FIXTURE_VARIANT(hmm2)
+{
+   int device_number0;
+   int device_number1;
+};
+
+FIXTURE_VARIANT_ADD(hmm2, hmm2_device_private)
+{
+   .device_number0 = HMM_PRIVATE_DEVICE_ONE,
+   .device_number1 = HMM_PRIVATE_DEVICE_TWO,
+};
+
+FIXTURE_VARIANT_ADD(hmm2, hmm2_device_coherent)
+{
+   .device_number0 = HMM_COHERENCE_DEVICE_ONE,
+   .device_number1 = HMM_COHERENCE_DEVICE_TWO,
+};
+
 static int hmm_open(int unit)
 {
char pathname[HMM_PATH_MAX];
@@ -81,12 +121,19 @@ static int hmm_open(int unit)
return fd;
 }
 
+static bool hmm_is_coherent_type(int dev_num)
+{
+   return (dev_num >= HMM_COHERENCE_DEVICE_ONE);
+}
+
 FIXTURE_SETUP(hmm)
 {
self->page_size = sysconf(_SC_PAGE_SIZE);
self->page_shift = ffs(self->page_size) - 1;
 
-   self->fd = hmm_open(0);
+   self->fd = hmm_open(variant->device_number);
+   if (self->fd < 0 && hmm_is_coherent_type(variant->device_number))
+   SKIP(exit(0), "DEVICE_COHERENT not available");
ASSERT_GE(self->fd, 0);
 }
 
@@ -95,9 +142,11 @@ FIXTURE_SETUP(hmm2)
self->page_size = sysconf(_SC_PAGE_SIZE);
self->page_shift = ffs(self->page_size) - 1;
 
-   self->fd0 = hmm_open(0);
+   self->fd0 = hmm_open(variant->device_number0);
+   if (self->fd0 < 0 && hmm_is_coherent_type(variant->device_number0))
+   SKIP(exit(0), "DEVICE_COHERENT not available");
ASSERT_GE(self->fd0, 0);
-   self->fd1 = hmm_open(1);
+   self->fd1 = hmm_open(variant->device_number1);
ASSERT_GE(self->fd1, 0);
 }
 
@@ -211,6 +260,20 @@ static void hmm_nanosleep(unsigned int n)
	nanosleep(&t, NULL);
 }
 
+static int hmm_migrate_sys_to_dev(int fd,
+  struct hmm_buffer *buffer,
+  unsigned long npages)
+{
+   return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_DEV, buffer, npages);
+}
+
+static int hmm_migrate_dev_to_sys(int fd,
+  struct hmm_buffer *buffer,
+  unsigned long npages)
+{
+   return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_SYS, buffer, npages);
+}
+
 /*
  * Simple NULL test of device open/close.
  */
@@ -875,7 +938,7 @@ TEST_F(hmm, migrate)
ptr[i] = i;
 
/* Migrate memory to device. */
-   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);
+   ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
ASSERT_EQ(ret, 0);
ASSERT_EQ(buffer->cpages, npages);
 
@@ -923,7 +986,7 @@ TEST_F(hmm, migrate_fault)
ptr[i] = i;
 
/* Migrate memory to device. */
-   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);
+   ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
ASSERT_EQ(ret, 0);
ASSERT_EQ(buffer->cpages, npages);
 
@@ -936,7 +999,7 @@ TEST_F(hmm, migrate_fault)
ASSERT_EQ(ptr[i], i);
 
/* Migrate memory to the device again. */
-   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);
+   ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);

[PATCH v8 14/15] tools: add hmm gup tests for device coherent type

2022-07-07 Thread Alex Sierra
The intention is to test the hmm device coherent type under different
get_user_pages paths. Also, test gup with the FOLL_LONGTERM flag set on
device coherent pages. Such pages should get migrated back to system
memory.

Signed-off-by: Alex Sierra 
Reviewed-by: Alistair Popple 
---
 tools/testing/selftests/vm/hmm-tests.c | 110 +
 1 file changed, 110 insertions(+)

diff --git a/tools/testing/selftests/vm/hmm-tests.c 
b/tools/testing/selftests/vm/hmm-tests.c
index 4b547188ec40..bb38b9777610 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -36,6 +36,7 @@
  * in the usual include/uapi/... directory.
  */
 #include "../../../../lib/test_hmm_uapi.h"
+#include "../../../../mm/gup_test.h"
 
 struct hmm_buffer {
void*ptr;
@@ -59,6 +60,9 @@ enum {
 #define NTIMES 10
 
 #define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1)))
+/* Just the flags we need, copied from mm.h: */
+#define FOLL_WRITE 0x01/* check pte is writable */
+#define FOLL_LONGTERM   0x1 /* mapping lifetime is indefinite */
 
 FIXTURE(hmm)
 {
@@ -1764,4 +1768,110 @@ TEST_F(hmm, exclusive_cow)
hmm_buffer_free(buffer);
 }
 
+static int gup_test_exec(int gup_fd, unsigned long addr, int cmd,
+int npages, int size, int flags)
+{
+   struct gup_test gup = {
+   .nr_pages_per_call  = npages,
+   .addr   = addr,
+   .gup_flags  = FOLL_WRITE | flags,
+   .size   = size,
+   };
+
+   if (ioctl(gup_fd, cmd, &gup)) {
+   perror("ioctl on error\n");
+   return errno;
+   }
+
+   return 0;
+}
+
+/*
+ * Test get user device pages through gup_test. Setting PIN_LONGTERM flag.
+ * This should trigger a migration back to system memory for both private
+ * and coherent type pages.
+ * This test makes use of the gup_test module. Make sure CONFIG_GUP_TEST is
+ * set in your kernel configuration before you run it.
+ */
+TEST_F(hmm, hmm_gup_test)
+{
+   struct hmm_buffer *buffer;
+   int gup_fd;
+   unsigned long npages;
+   unsigned long size;
+   unsigned long i;
+   int *ptr;
+   int ret;
+   unsigned char *m;
+
+   gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
+   if (gup_fd == -1)
+   SKIP(return, "Skipping test, could not find gup_test driver");
+
+   npages = 4;
+   size = npages << self->page_shift;
+
+   buffer = malloc(sizeof(*buffer));
+   ASSERT_NE(buffer, NULL);
+
+   buffer->fd = -1;
+   buffer->size = size;
+   buffer->mirror = malloc(size);
+   ASSERT_NE(buffer->mirror, NULL);
+
+   buffer->ptr = mmap(NULL, size,
+  PROT_READ | PROT_WRITE,
+  MAP_PRIVATE | MAP_ANONYMOUS,
+  buffer->fd, 0);
+   ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+   /* Initialize buffer in system memory. */
+   for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+   ptr[i] = i;
+
+   /* Migrate memory to device. */
+   ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+   ASSERT_EQ(ret, 0);
+   ASSERT_EQ(buffer->cpages, npages);
+   /* Check what the device read. */
+   for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+   ASSERT_EQ(ptr[i], i);
+
+   ASSERT_EQ(gup_test_exec(gup_fd,
+   (unsigned long)buffer->ptr,
+   GUP_BASIC_TEST, 1, self->page_size, 0), 0);
+   ASSERT_EQ(gup_test_exec(gup_fd,
+   (unsigned long)buffer->ptr + 1 * self->page_size,
+   GUP_FAST_BENCHMARK, 1, self->page_size, 0), 0);
+   ASSERT_EQ(gup_test_exec(gup_fd,
+   (unsigned long)buffer->ptr + 2 * self->page_size,
+   PIN_FAST_BENCHMARK, 1, self->page_size, FOLL_LONGTERM), 0);
+   ASSERT_EQ(gup_test_exec(gup_fd,
+   (unsigned long)buffer->ptr + 3 * self->page_size,
+   PIN_LONGTERM_BENCHMARK, 1, self->page_size, 0), 0);
+
+   /* Take snapshot to CPU pagetables */
+   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+   ASSERT_EQ(ret, 0);
+   ASSERT_EQ(buffer->cpages, npages);
+   m = buffer->mirror;
+   if (hmm_is_coherent_type(variant->device_number)) {
+   ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[0]);
+   ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[1]);
+   } else {
+   ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[0]);
+   ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[1]);
+ 

[PATCH v8 10/15] lib: test_hmm add module param for zone device type

2022-07-07 Thread Alex Sierra
In order to configure the device coherent type in test_hmm, two module
parameters must be passed, corresponding to the SP start address of
each of the two devices: spm_addr_dev0 & spm_addr_dev1. If no
parameters are passed, the private device type is configured.

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
---
 lib/test_hmm.c  | 73 -
 lib/test_hmm_uapi.h |  1 +
 2 files changed, 53 insertions(+), 21 deletions(-)

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 915ef6b5b0d4..afb30af9f3ff 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -37,6 +37,16 @@
 #define DEVMEM_CHUNK_SIZE  (256 * 1024 * 1024U)
 #define DEVMEM_CHUNKS_RESERVE  16
 
+static unsigned long spm_addr_dev0;
+module_param(spm_addr_dev0, long, 0644);
+MODULE_PARM_DESC(spm_addr_dev0,
+   "Specify start address for SPM (special purpose memory) used 
for device 0. By setting this Coherent device type will be used. Make sure 
spm_addr_dev1 is set too. Minimum SPM size should be DEVMEM_CHUNK_SIZE.");
+
+static unsigned long spm_addr_dev1;
+module_param(spm_addr_dev1, long, 0644);
+MODULE_PARM_DESC(spm_addr_dev1,
+   "Specify start address for SPM (special purpose memory) used 
for device 1. By setting this Coherent device type will be used. Make sure 
spm_addr_dev0 is set too. Minimum SPM size should be DEVMEM_CHUNK_SIZE.");
+
 static const struct dev_pagemap_ops dmirror_devmem_ops;
 static const struct mmu_interval_notifier_ops dmirror_min_ops;
 static dev_t dmirror_dev;
@@ -455,28 +465,44 @@ static int dmirror_write(struct dmirror *dmirror, struct 
hmm_dmirror_cmd *cmd)
return ret;
 }
 
-static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
+static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
   struct page **ppage)
 {
struct dmirror_chunk *devmem;
-   struct resource *res;
+   struct resource *res = NULL;
unsigned long pfn;
unsigned long pfn_first;
unsigned long pfn_last;
void *ptr;
+   int ret = -ENOMEM;
 
devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
if (!devmem)
-   return false;
+   return ret;
 
-   res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
- "hmm_dmirror");
-   if (IS_ERR(res))
+   switch (mdevice->zone_device_type) {
+   case HMM_DMIRROR_MEMORY_DEVICE_PRIVATE:
+   res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
+ "hmm_dmirror");
+   if (IS_ERR_OR_NULL(res))
+   goto err_devmem;
+   devmem->pagemap.range.start = res->start;
+   devmem->pagemap.range.end = res->end;
+   devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
+   break;
+   case HMM_DMIRROR_MEMORY_DEVICE_COHERENT:
+   devmem->pagemap.range.start = (MINOR(mdevice->cdevice.dev) - 2) ?
+   spm_addr_dev0 :
+   spm_addr_dev1;
+   devmem->pagemap.range.end = devmem->pagemap.range.start +
+   DEVMEM_CHUNK_SIZE - 1;
+   devmem->pagemap.type = MEMORY_DEVICE_COHERENT;
+   break;
+   default:
+   ret = -EINVAL;
goto err_devmem;
+   }
 
-   devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
-   devmem->pagemap.range.start = res->start;
-   devmem->pagemap.range.end = res->end;
devmem->pagemap.nr_range = 1;
	devmem->pagemap.ops = &dmirror_devmem_ops;
devmem->pagemap.owner = mdevice;
@@ -497,10 +523,14 @@ static bool dmirror_allocate_chunk(struct dmirror_device 
*mdevice,
mdevice->devmem_capacity = new_capacity;
mdevice->devmem_chunks = new_chunks;
}
-
	ptr = memremap_pages(&devmem->pagemap, numa_node_id());
-   if (IS_ERR(ptr))
+   if (IS_ERR_OR_NULL(ptr)) {
+   if (ptr)
+   ret = PTR_ERR(ptr);
+   else
+   ret = -EFAULT;
goto err_release;
+   }
 
devmem->mdevice = mdevice;
pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT;
@@ -529,15 +559,17 @@ static bool dmirror_allocate_chunk(struct dmirror_device 
*mdevice,
}
	spin_unlock(&mdevice->lock);
 
-   return true;
+   return 0;
 
 err_release:
	mutex_unlock(&mdevice->devmem_lock);
-   release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range));
+   if (res && devmem->pagemap.type == MEMORY_DEVICE_PRIVATE)
+   release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range));

[PATCH v8 13/15] tools: update test_hmm script to support SP config

2022-07-07 Thread Alex Sierra
Add two more parameters to set the spm_addr_dev0 & spm_addr_dev1
addresses. These two parameters configure the start SP
addresses for each device in the test_hmm driver.
Consequently, this configures the zone device type as coherent.

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
---
 tools/testing/selftests/vm/test_hmm.sh | 24 +---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/vm/test_hmm.sh 
b/tools/testing/selftests/vm/test_hmm.sh
index 0647b525a625..539c9371e592 100755
--- a/tools/testing/selftests/vm/test_hmm.sh
+++ b/tools/testing/selftests/vm/test_hmm.sh
@@ -40,11 +40,26 @@ check_test_requirements()
 
 load_driver()
 {
-   modprobe $DRIVER > /dev/null 2>&1
+   if [ $# -eq 0 ]; then
+   modprobe $DRIVER > /dev/null 2>&1
+   else
+   if [ $# -eq 2 ]; then
+   modprobe $DRIVER spm_addr_dev0=$1 spm_addr_dev1=$2 > /dev/null 2>&1
+   else
+   echo "Missing module parameters. Make sure pass"\
+   "spm_addr_dev0 and spm_addr_dev1"
+   usage
+   fi
+   fi
if [ $? == 0 ]; then
major=$(awk "\$2==\"HMM_DMIRROR\" {print \$1}" /proc/devices)
mknod /dev/hmm_dmirror0 c $major 0
mknod /dev/hmm_dmirror1 c $major 1
+   if [ $# -eq 2 ]; then
+   mknod /dev/hmm_dmirror2 c $major 2
+   mknod /dev/hmm_dmirror3 c $major 3
+   fi
fi
 }
 
@@ -58,7 +73,7 @@ run_smoke()
 {
echo "Running smoke test. Note, this test provides basic coverage."
 
-   load_driver
+   load_driver $1 $2
$(dirname "${BASH_SOURCE[0]}")/hmm-tests
unload_driver
 }
@@ -75,6 +90,9 @@ usage()
echo "# Smoke testing"
echo "./${TEST_NAME}.sh smoke"
echo
+   echo "# Smoke testing with SPM enabled"
+   echo "./${TEST_NAME}.sh smoke  "
+   echo
exit 0
 }
 
@@ -84,7 +102,7 @@ function run_test()
usage
else
if [ "$1" = "smoke" ]; then
-   run_smoke
+   run_smoke $2 $3
else
usage
fi
-- 
2.32.0



[PATCH v8 11/15] lib: add support for device coherent type in test_hmm

2022-07-07 Thread Alex Sierra
Device coherent type uses device memory that is coherently accessible by
the CPU. This could be shown as an SP (special purpose) memory range
at the BIOS-e820 memory enumeration. If no SP memory is supported in the
system, this could be faked by setting CONFIG_EFI_FAKE_MEMMAP.

Currently, test_hmm only supports two different SP ranges of at least
256MB size. This could be specified in the kernel parameter variable
efi_fake_mem. Ex. two SP ranges of 1GB starting at physical addresses
0x100000000 & 0x140000000:
efi_fake_mem=1G@0x100000000:0x40000,1G@0x140000000:0x40000

Private and coherent device mirror instances can be created in the same
probe. This is done by passing the module parameters spm_addr_dev0 &
spm_addr_dev1. In this case, it will create four instances of
device_mirror. The first two correspond to private device type, the
last two to coherent type. Then, they can be easily accessed from user
space through /dev/hmm_dmirror<num_device>. Usually num_device 0 and 1
are for private, and 2 and 3 for coherent types. If no module
parameters are passed, only two instances of private type device_mirror
will be created.
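
From user space the numbering above looks like this (illustrative
sketch assuming <fcntl.h>; error handling omitted):

  	int fd_private  = open("/dev/hmm_dmirror0", O_RDWR); /* private  */
  	int fd_coherent = open("/dev/hmm_dmirror2", O_RDWR); /* coherent;
  				exists only when spm_addr_dev0/1 are set */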

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
---
 lib/test_hmm.c  | 253 +---
 lib/test_hmm_uapi.h |   4 +
 2 files changed, 196 insertions(+), 61 deletions(-)

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index afb30af9f3ff..7930853e7fc5 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -32,11 +32,22 @@
 
 #include "test_hmm_uapi.h"
 
-#define DMIRROR_NDEVICES   2
+#define DMIRROR_NDEVICES   4
 #define DMIRROR_RANGE_FAULT_TIMEOUT1000
 #define DEVMEM_CHUNK_SIZE  (256 * 1024 * 1024U)
 #define DEVMEM_CHUNKS_RESERVE  16
 
+/*
+ * For device_private pages, dpage is just a dummy struct page
+ * representing a piece of device memory. dmirror_devmem_alloc_page
+ * allocates a real system memory page as backing storage to fake a
+ * real device. zone_device_data points to that backing page. But
+ * for device_coherent memory, the struct page represents real
+ * physical CPU-accessible memory that we can use directly.
+ */
+#define BACKING_PAGE(page) (is_device_private_page((page)) ? \
+  (page)->zone_device_data : (page))
+
 static unsigned long spm_addr_dev0;
 module_param(spm_addr_dev0, long, 0644);
 MODULE_PARM_DESC(spm_addr_dev0,
@@ -125,6 +136,21 @@ static int dmirror_bounce_init(struct dmirror_bounce 
*bounce,
return 0;
 }
 
+static bool dmirror_is_private_zone(struct dmirror_device *mdevice)
+{
+   return (mdevice->zone_device_type ==
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? true : false;
+}
+
+static enum migrate_vma_direction
+dmirror_select_device(struct dmirror *dmirror)
+{
+   return (dmirror->mdevice->zone_device_type ==
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ?
+   MIGRATE_VMA_SELECT_DEVICE_PRIVATE :
+   MIGRATE_VMA_SELECT_DEVICE_COHERENT;
+}
+
 static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
 {
vfree(bounce->ptr);
@@ -575,16 +601,19 @@ static int dmirror_allocate_chunk(struct dmirror_device 
*mdevice,
 static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
 {
struct page *dpage = NULL;
-   struct page *rpage;
+   struct page *rpage = NULL;
 
/*
-* This is a fake device so we alloc real system memory to store
-* our device memory.
+* For ZONE_DEVICE private type, this is a fake device so we allocate
+* real system memory to store our device memory.
+* For ZONE_DEVICE coherent type we use the actual dpage to store the
+* data and ignore rpage.
 */
-   rpage = alloc_page(GFP_HIGHUSER);
-   if (!rpage)
-   return NULL;
-
+   if (dmirror_is_private_zone(mdevice)) {
+   rpage = alloc_page(GFP_HIGHUSER);
+   if (!rpage)
+   return NULL;
+   }
	spin_lock(&mdevice->lock);
 
if (mdevice->free_pages) {
@@ -603,7 +632,8 @@ static struct page *dmirror_devmem_alloc_page(struct 
dmirror_device *mdevice)
return dpage;
 
 error:
-   __free_page(rpage);
+   if (rpage)
+   __free_page(rpage);
return NULL;
 }
 
@@ -629,12 +659,16 @@ static void dmirror_migrate_alloc_and_copy(struct 
migrate_vma *args,
 * unallocated pte_none() or read-only zero page.
 */
spage = migrate_pfn_to_page(*src);
+   if (WARN(spage && is_zone_device_page(spage),
+"page already in device spage pfn: 0x%lx\n",
+page_to_pfn(spage)))
+   continue;
 
dpage = dmirror_devmem_alloc_page(mdevice);
if (!dpage)
continue;
 
-   rpage = dpage->zone_device_data;

[PATCH v8 09/15] lib: test_hmm add ioctl to get zone device type

2022-07-07 Thread Alex Sierra
A new ioctl cmd is added to query the zone device type. This will be
used once test_hmm adds the zone device coherent type.

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
---
 lib/test_hmm.c  | 11 +--
 lib/test_hmm_uapi.h | 14 ++
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index cfe632047839..915ef6b5b0d4 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -87,6 +87,7 @@ struct dmirror_chunk {
 struct dmirror_device {
struct cdev cdevice;
struct hmm_devmem   *devmem;
+   unsigned intzone_device_type;
 
unsigned intdevmem_capacity;
unsigned intdevmem_count;
@@ -1260,14 +1261,20 @@ static void dmirror_device_remove(struct dmirror_device 
*mdevice)
 static int __init hmm_dmirror_init(void)
 {
int ret;
-   int id;
+   int id = 0;
+   int ndevices = 0;
 
	ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES,
  "HMM_DMIRROR");
if (ret)
goto err_unreg;
 
-   for (id = 0; id < DMIRROR_NDEVICES; id++) {
+   memset(dmirror_devices, 0, DMIRROR_NDEVICES * sizeof(dmirror_devices[0]));
+   dmirror_devices[ndevices++].zone_device_type =
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
+   dmirror_devices[ndevices++].zone_device_type =
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
+   for (id = 0; id < ndevices; id++) {
ret = dmirror_device_init(dmirror_devices + id, id);
if (ret)
goto err_chrdev;
diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h
index f14dea5dcd06..0511af7464ee 100644
--- a/lib/test_hmm_uapi.h
+++ b/lib/test_hmm_uapi.h
@@ -31,10 +31,11 @@ struct hmm_dmirror_cmd {
 /* Expose the address space of the calling process through hmm device file */
 #define HMM_DMIRROR_READ   _IOWR('H', 0x00, struct hmm_dmirror_cmd)
 #define HMM_DMIRROR_WRITE  _IOWR('H', 0x01, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_MIGRATE_IOWR('H', 0x02, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_SNAPSHOT   _IOWR('H', 0x03, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_EXCLUSIVE  _IOWR('H', 0x04, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_CHECK_EXCLUSIVE_IOWR('H', 0x05, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_MIGRATE_TO_DEV _IOWR('H', 0x02, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_MIGRATE_TO_SYS _IOWR('H', 0x03, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_SNAPSHOT   _IOWR('H', 0x04, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_EXCLUSIVE  _IOWR('H', 0x05, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_CHECK_EXCLUSIVE_IOWR('H', 0x06, struct hmm_dmirror_cmd)
 
 /*
  * Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT.
@@ -62,4 +63,9 @@ enum {
HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE = 0x30,
 };
 
+enum {
+   /* 0 is reserved to catch uninitialized type fields */
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE = 1,
+};
+
 #endif /* _LIB_TEST_HMM_UAPI_H */
-- 
2.32.0



[PATCH v8 07/15] mm/gup: migrate device coherent pages when pinning instead of failing

2022-07-07 Thread Alex Sierra
From: Alistair Popple 

Currently any attempt to pin a device coherent page will fail. This is
because device coherent pages need to be managed by a device driver, and
pinning them would prevent a driver from migrating them off the device.

However this is no reason to fail pinning of these pages. They are
coherent and accessible from the CPU, so they can be migrated just like
ZONE_MOVABLE pages are when pinned. So instead of failing all attempts
to pin them, first try migrating them out of ZONE_DEVICE.
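
For a caller nothing changes except that the pin now succeeds; a hedged
sketch of a kernel user (mydrv_pin_user_buf is an illustrative name,
pin_user_pages_fast() is the real API):

  /* Long-term pin of a user buffer. If the range is backed by device
   * coherent pages, GUP now migrates them to system memory under the
   * covers instead of returning an error. */
  static int mydrv_pin_user_buf(unsigned long start, int nr_pages,
  			      struct page **pages)
  {
  	return pin_user_pages_fast(start, nr_pages,
  				   FOLL_WRITE | FOLL_LONGTERM, pages);
  }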

Signed-off-by: Alistair Popple 
Acked-by: Felix Kuehling 
[hch: rebased to the split device memory checks,
  moved migrate_device_page to migrate_device.c]
Signed-off-by: Christoph Hellwig 
---
 mm/gup.c| 47 +++-
 mm/internal.h   |  1 +
 mm/migrate_device.c | 53 +
 3 files changed, 96 insertions(+), 5 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index b65fe8bf5af4..9b6b9923d22d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1891,9 +1891,43 @@ static long check_and_migrate_movable_pages(unsigned 
long nr_pages,
continue;
prev_folio = folio;
 
-   if (folio_is_longterm_pinnable(folio))
+   /*
+* Device private pages will get faulted in during gup so it
+* shouldn't be possible to see one here.
+*/
+   if (WARN_ON_ONCE(folio_is_device_private(folio))) {
+   ret = -EFAULT;
+   goto unpin_pages;
+   }
+
+   /*
+* Device coherent pages are managed by a driver and should not
+* be pinned indefinitely as it prevents the driver moving the
+* page. So when trying to pin with FOLL_LONGTERM instead try
+* to migrate the page out of device memory.
+*/
+   if (folio_is_device_coherent(folio)) {
+   WARN_ON_ONCE(PageCompound(&folio->page));
+
+   /*
+* Migration will fail if the page is pinned, so convert
+* the pin on the source page to a normal reference.
+*/
+   if (gup_flags & FOLL_PIN) {
+   get_page(&folio->page);
+   unpin_user_page(&folio->page);
+   }
+
+   pages[i] = migrate_device_page(&folio->page, gup_flags);
+   if (!pages[i]) {
+   ret = -EBUSY;
+   goto unpin_pages;
+   }
continue;
+   }
 
+   if (folio_is_longterm_pinnable(folio))
+   continue;
/*
 * Try to move out any movable page before pinning the range.
 */
@@ -1929,10 +1963,13 @@ static long check_and_migrate_movable_pages(unsigned 
long nr_pages,
return nr_pages;
 
 unpin_pages:
-   if (gup_flags & FOLL_PIN) {
-   unpin_user_pages(pages, nr_pages);
-   } else {
-   for (i = 0; i < nr_pages; i++)
+   for (i = 0; i < nr_pages; i++) {
+   if (!pages[i])
+   continue;
+
+   if (gup_flags & FOLL_PIN)
+   unpin_user_page(pages[i]);
+   else
put_page(pages[i]);
}
 
diff --git a/mm/internal.h b/mm/internal.h
index c0f8fbe0445b..eeab4ee7a4a3 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -853,6 +853,7 @@ int numa_migrate_prep(struct page *page, struct 
vm_area_struct *vma,
  unsigned long addr, int page_nid, int *flags);
 
 void free_zone_device_page(struct page *page);
+struct page *migrate_device_page(struct page *page, unsigned int gup_flags);
 
 /*
  * mm/gup.c
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index cf9668376c5a..5decd26dd551 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -794,3 +794,56 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
}
 }
 EXPORT_SYMBOL(migrate_vma_finalize);
+
+/*
+ * Migrate a device coherent page back to normal memory. The caller should have
+ * a reference on page which will be copied to the new page if migration is
+ * successful or dropped on failure.
+ */
+struct page *migrate_device_page(struct page *page, unsigned int gup_flags)
+{
+   unsigned long src_pfn, dst_pfn = 0;
+   struct migrate_vma args;
+   struct page *dpage;
+
+   lock_page(page);
+   src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE;
+   args.src = &src_pfn;
+   args.dst = &dst_pfn;
+   args.cpages = 1;
+   args.npages = 1;
+   args.vma = NULL;
+   migrate_vma_setup(&args);
+   if (!(src_pfn & MIGRATE_PFN_MIGRATE))
+   return NULL;
+
+   dpage = alloc_pages(GFP_USER | __GFP_NOWARN, 0);
+
+   /*
+* 

[PATCH v8 08/15] drm/amdkfd: add SPM support for SVM

2022-07-07 Thread Alex Sierra
When the CPU is connected through XGMI, it has coherent
access to the VRAM resource. In this case the resource
is taken from a table at the device gmc aperture base.
This resource is used along with the device type, which could
be DEVICE_PRIVATE or DEVICE_COHERENT, to create the device
page map region.
Also, the MIGRATE_VMA_SELECT_DEVICE_COHERENT flag is selected for
the coherent type case during migration to device.

Signed-off-by: Alex Sierra 
Reviewed-by: Felix Kuehling 
Signed-off-by: Christoph Hellwig 
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 34 +++-
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index e44376c2ecdc..f73e3e340413 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -671,13 +671,15 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct 
svm_range *prange,
migrate.vma = vma;
migrate.start = start;
migrate.end = end;
-   migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
migrate.pgmap_owner = SVM_ADEV_PGMAP_OWNER(adev);
+   if (adev->gmc.xgmi.connected_to_cpu)
+   migrate.flags = MIGRATE_VMA_SELECT_DEVICE_COHERENT;
+   else
+   migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
 
buf = kvcalloc(npages,
   2 * sizeof(*migrate.src) + sizeof(uint64_t) + 
sizeof(dma_addr_t),
   GFP_KERNEL);
-
if (!buf)
goto out;
 
@@ -947,7 +949,7 @@ int svm_migrate_init(struct amdgpu_device *adev)
 {
struct kfd_dev *kfddev = adev->kfd.dev;
struct dev_pagemap *pgmap;
-   struct resource *res;
+   struct resource *res = NULL;
unsigned long size;
void *r;
 
@@ -962,28 +964,34 @@ int svm_migrate_init(struct amdgpu_device *adev)
 * should remove reserved size
 */
size = ALIGN(adev->gmc.real_vram_size, 2ULL << 20);
-   res = devm_request_free_mem_region(adev->dev, &iomem_resource, size);
-   if (IS_ERR(res))
-   return -ENOMEM;
+   if (adev->gmc.xgmi.connected_to_cpu) {
+   pgmap->range.start = adev->gmc.aper_base;
+   pgmap->range.end = adev->gmc.aper_base + adev->gmc.aper_size - 1;
+   pgmap->type = MEMORY_DEVICE_COHERENT;
+   } else {
+   res = devm_request_free_mem_region(adev->dev, &iomem_resource, size);
+   if (IS_ERR(res))
+   return -ENOMEM;
+   pgmap->range.start = res->start;
+   pgmap->range.end = res->end;
+   pgmap->type = MEMORY_DEVICE_PRIVATE;
+   }
 
-   pgmap->type = MEMORY_DEVICE_PRIVATE;
pgmap->nr_range = 1;
-   pgmap->range.start = res->start;
-   pgmap->range.end = res->end;
	pgmap->ops = &svm_migrate_pgmap_ops;
pgmap->owner = SVM_ADEV_PGMAP_OWNER(adev);
-   pgmap->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
-
+   pgmap->flags = 0;
/* Device manager releases device-specific resources, memory region and
 * pgmap when driver disconnects from device.
 */
r = devm_memremap_pages(adev->dev, pgmap);
if (IS_ERR(r)) {
pr_err("failed to register HMM device memory\n");
-
+   if (pgmap->type == MEMORY_DEVICE_PRIVATE)
+   devm_release_mem_region(adev->dev, res->start,
+   res->end - res->start + 1);
/* Disable SVM support capability */
pgmap->type = 0;
-   devm_release_mem_region(adev->dev, res->start, resource_size(res));
return PTR_ERR(r);
}
 
-- 
2.32.0



[PATCH v8 06/15] mm: remove the vma check in migrate_vma_setup()

2022-07-07 Thread Alex Sierra
From: Alistair Popple 

migrate_vma_setup() checks that a valid vma is passed so that the page
tables can be walked to find the pfns associated with a given address
range. However, in some cases the pfns are already known, such as when
migrating device coherent pages during pin_user_pages(), meaning a
valid vma isn't required.
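
A minimal sketch of the vma-less usage this enables (mirroring the
migrate_device_page() helper added earlier in this series): the caller
already holds the page and fills in the source pfn itself, so no page
table walk is needed.

  	struct migrate_vma args = {};
  	unsigned long src_pfn, dst_pfn = 0;

  	src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE;
  	args.src = &src_pfn;
  	args.dst = &dst_pfn;
  	args.cpages = 1;
  	args.npages = 1;
  	args.vma = NULL;	/* skip migrate_vma_collect() entirely */
  	migrate_vma_setup(&args);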

Signed-off-by: Alistair Popple 
Acked-by: Felix Kuehling 
Signed-off-by: Christoph Hellwig 
---
 mm/migrate_device.c | 34 +-
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 18bc6483f63a..cf9668376c5a 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -486,24 +486,24 @@ int migrate_vma_setup(struct migrate_vma *args)
 
args->start &= PAGE_MASK;
args->end &= PAGE_MASK;
-   if (!args->vma || is_vm_hugetlb_page(args->vma) ||
-   (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
-   return -EINVAL;
-   if (nr_pages <= 0)
-   return -EINVAL;
-   if (args->start < args->vma->vm_start ||
-   args->start >= args->vma->vm_end)
-   return -EINVAL;
-   if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
-   return -EINVAL;
if (!args->src || !args->dst)
return -EINVAL;
-
-   memset(args->src, 0, sizeof(*args->src) * nr_pages);
-   args->cpages = 0;
-   args->npages = 0;
-
-   migrate_vma_collect(args);
+   if (args->vma) {
+   if (is_vm_hugetlb_page(args->vma) ||
+   (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
+   return -EINVAL;
+   if (args->start < args->vma->vm_start ||
+   args->start >= args->vma->vm_end)
+   return -EINVAL;
+   if (args->end <= args->vma->vm_start ||
+   args->end > args->vma->vm_end)
+   return -EINVAL;
+   memset(args->src, 0, sizeof(*args->src) * nr_pages);
+   args->cpages = 0;
+   args->npages = 0;
+
+   migrate_vma_collect(args);
+   }
 
if (args->cpages)
migrate_vma_unmap(args);
@@ -685,7 +685,7 @@ void migrate_vma_pages(struct migrate_vma *migrate)
continue;
}
 
-   if (!page) {
+   if (!page && migrate->vma) {
if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
continue;
if (!notified) {
-- 
2.32.0



[PATCH v8 01/15] mm: rename is_pinnable_pages to is_longterm_pinnable_pages

2022-07-07 Thread Alex Sierra
is_pinnable_page() and folio_is_pinnable() were renamed to
is_longterm_pinnable_page() and folio_is_longterm_pinnable()
respectively. These functions are used in the FOLL_LONGTERM flag
context.

Signed-off-by: Alex Sierra 
---
 include/linux/mm.h | 8 
 mm/gup.c   | 4 ++--
 mm/gup_test.c  | 2 +-
 mm/hugetlb.c   | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index cf3d0d673f6b..3b31b33bd5be 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1592,7 +1592,7 @@ static inline bool page_needs_cow_for_dma(struct 
vm_area_struct *vma,
 
 /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */
 #ifdef CONFIG_MIGRATION
-static inline bool is_pinnable_page(struct page *page)
+static inline bool is_longterm_pinnable_page(struct page *page)
 {
 #ifdef CONFIG_CMA
int mt = get_pageblock_migratetype(page);
@@ -1603,15 +1603,15 @@ static inline bool is_pinnable_page(struct page *page)
return !is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page));
 }
 #else
-static inline bool is_pinnable_page(struct page *page)
+static inline bool is_longterm_pinnable_page(struct page *page)
 {
return true;
 }
 #endif
 
-static inline bool folio_is_pinnable(struct folio *folio)
+static inline bool folio_is_longterm_pinnable(struct folio *folio)
 {
-   return is_pinnable_page(>page);
+   return is_longterm_pinnable_page(>page);
 }
 
 static inline void set_page_zone(struct page *page, enum zone_type zone)
diff --git a/mm/gup.c b/mm/gup.c
index 551264407624..b65fe8bf5af4 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -133,7 +133,7 @@ struct folio *try_grab_folio(struct page *page, int refs, 
unsigned int flags)
 * path.
 */
if (unlikely((flags & FOLL_LONGTERM) &&
-!is_pinnable_page(page)))
+!is_longterm_pinnable_page(page)))
return NULL;
 
/*
@@ -1891,7 +1891,7 @@ static long check_and_migrate_movable_pages(unsigned long 
nr_pages,
continue;
prev_folio = folio;
 
-   if (folio_is_pinnable(folio))
+   if (folio_is_longterm_pinnable(folio))
continue;
 
/*
diff --git a/mm/gup_test.c b/mm/gup_test.c
index d974dec19e1c..12b0a91767d3 100644
--- a/mm/gup_test.c
+++ b/mm/gup_test.c
@@ -53,7 +53,7 @@ static void verify_dma_pinned(unsigned int cmd, struct page 
**pages,
dump_page(page, "gup_test failure");
break;
} else if (cmd == PIN_LONGTERM_BENCHMARK &&
-   WARN(!is_pinnable_page(page),
+   WARN(!is_longterm_pinnable_page(page),
 "pages[%lu] is NOT pinnable but pinned\n",
 i)) {
dump_page(page, "gup_test failure");
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a57e1be41401..368fd33787b0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1135,7 +1135,7 @@ static struct page *dequeue_huge_page_node_exact(struct 
hstate *h, int nid)
 
lockdep_assert_held(_lock);
	list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
-   if (pin && !is_pinnable_page(page))
+   if (pin && !is_longterm_pinnable_page(page))
continue;
 
if (PageHWPoison(page))
-- 
2.32.0



[PATCH v8 04/15] mm: handling Non-LRU pages returned by vm_normal_pages

2022-07-07 Thread Alex Sierra
With DEVICE_COHERENT, we'll soon have vm_normal_pages() return
device-managed anonymous pages that are not LRU pages. Although they
behave like normal pages for purposes of mapping in CPU page tables and
for COW, they do not support LRU lists, NUMA migration or THP.

Callers of follow_page() currently don't expect ZONE_DEVICE pages;
however, with DEVICE_COHERENT we might now return ZONE_DEVICE pages.
Check for ZONE_DEVICE pages in applicable users of follow_page() as
well.
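
The recurring caller pattern applied by the diff below, shown once in
condensed form:

  	page = vm_normal_page(vma, addr, pte);
  	if (!page || is_zone_device_page(page))
  		return NULL;	/* treat device pages as "no page" here */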

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling  (v2)
Reviewed-by: Alistair Popple  (v6)
---
 fs/proc/task_mmu.c |  2 +-
 mm/huge_memory.c   |  2 +-
 mm/khugepaged.c|  9 ++---
 mm/ksm.c   |  6 +++---
 mm/madvise.c   |  4 ++--
 mm/memory.c| 10 +-
 mm/mempolicy.c |  2 +-
 mm/migrate.c   |  4 ++--
 mm/mlock.c |  2 +-
 mm/mprotect.c  |  2 +-
 10 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 2d04e3470d4c..2dd8c8a66924 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1792,7 +1792,7 @@ static struct page *can_gather_numa_stats(pte_t pte, 
struct vm_area_struct *vma,
return NULL;
 
page = vm_normal_page(vma, addr, pte);
-   if (!page)
+   if (!page || is_zone_device_page(page))
return NULL;
 
if (PageReserved(page))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 834f288b3769..c47e95b02244 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2910,7 +2910,7 @@ static int split_huge_pages_pid(int pid, unsigned long 
vaddr_start,
 
if (IS_ERR(page))
continue;
-   if (!page)
+   if (!page || is_zone_device_page(page))
continue;
 
if (!is_transparent_hugepage(page))
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 16be62d493cd..671ac7800e53 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -618,7 +618,7 @@ static int __collapse_huge_page_isolate(struct 
vm_area_struct *vma,
goto out;
}
page = vm_normal_page(vma, address, pteval);
-   if (unlikely(!page)) {
+   if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out;
}
@@ -1267,7 +1267,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
writable = true;
 
page = vm_normal_page(vma, _address, pteval);
-   if (unlikely(!page)) {
+   if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out_unmap;
}
@@ -1479,7 +1479,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, 
unsigned long addr)
goto abort;
 
page = vm_normal_page(vma, addr, *pte);
-
+   if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+   page = NULL;
/*
 * Note that uprobe, debugger, or MAP_PRIVATE may change the
 * page table, but the new page will not be a subpage of hpage.
@@ -1497,6 +1498,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, 
unsigned long addr)
if (pte_none(*pte))
continue;
page = vm_normal_page(vma, addr, *pte);
+   if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+   goto abort;
page_remove_rmap(page, vma, false);
}
 
diff --git a/mm/ksm.c b/mm/ksm.c
index 54f78c9eecae..831b18a7a50b 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -475,7 +475,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned 
long addr)
cond_resched();
page = follow_page(vma, addr,
FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
-   if (IS_ERR_OR_NULL(page))
+   if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
break;
if (PageKsm(page))
ret = handle_mm_fault(vma, addr,
@@ -560,7 +560,7 @@ static struct page *get_mergeable_page(struct rmap_item 
*rmap_item)
goto out;
 
page = follow_page(vma, addr, FOLL_GET);
-   if (IS_ERR_OR_NULL(page))
+   if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
goto out;
if (PageAnon(page)) {
flush_anon_page(vma, page, addr);
@@ -2308,7 +2308,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct 
page **page)
if (ksm_test_exit(mm))
break;
*page = follow_page(vma, ksm_scan.address, FOLL_GET);
-   if (IS_ERR_OR_NULL(*page)) {
+   if (IS_ERR_OR_NULL(*page) || is_zone_device_page(*page)) {

[PATCH v8 05/15] mm: add device coherent vma selection for memory migration

2022-07-07 Thread Alex Sierra
This case is used to migrate pages from device memory back to system
memory. Device coherent type memory is cache coherent from the device
and CPU point of view.
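
A hedged sketch of a driver using the new selection flag to pull its
own pages back to system memory (the owner cookie name is illustrative):

  	struct migrate_vma migrate = {
  		.vma		= vma,
  		.start		= start,
  		.end		= end,
  		.src		= src_pfns,
  		.dst		= dst_pfns,
  		.pgmap_owner	= mydev_owner_cookie,	/* illustrative */
  		.flags		= MIGRATE_VMA_SELECT_DEVICE_COHERENT,
  	};

  	if (migrate_vma_setup(&migrate))
  		return -EFAULT;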

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
Reviewed-by: David Hildenbrand 
---
 include/linux/migrate.h |  1 +
 mm/migrate_device.c | 12 +---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 069a89e847f3..b84908debe5c 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -148,6 +148,7 @@ static inline unsigned long migrate_pfn(unsigned long pfn)
 enum migrate_vma_direction {
MIGRATE_VMA_SELECT_SYSTEM = 1 << 0,
MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1,
+   MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2,
 };
 
 struct migrate_vma {
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index a4847ad65da3..18bc6483f63a 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -148,15 +148,21 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
if (is_writable_device_private_entry(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {
-   if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
-   goto next;
pfn = pte_pfn(pte);
-   if (is_zero_pfn(pfn)) {
+   if (is_zero_pfn(pfn) &&
+   (migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
mpfn = MIGRATE_PFN_MIGRATE;
migrate->cpages++;
goto next;
}
page = vm_normal_page(migrate->vma, addr, pte);
+   if (page && !is_zone_device_page(page) &&
+   !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
+   goto next;
+   else if (page && is_device_coherent_page(page) &&
+   (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
+page->pgmap->owner != migrate->pgmap_owner))
+   goto next;
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
-- 
2.32.0



[PATCH v8 02/15] mm: move page zone helpers into new header-specific file

2022-07-07 Thread Alex Sierra
[WHY]
Have a cleaner way to expose all page zone helpers in one header
file, rather than split them between mm.h and memremap.h files.

Signed-off-by: Alex Sierra 
---
 drivers/infiniband/core/rw.c  |   1 -
 drivers/nvme/target/io-cmd-bdev.c |   1 -
 include/linux/memremap.h  | 113 +
 include/linux/mm.h|  79 +---
 include/linux/page_zone.h | 194 ++
 mm/memcontrol.c   |   1 -
 6 files changed, 196 insertions(+), 193 deletions(-)
 create mode 100644 include/linux/page_zone.h

diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index 4d98f931a13d..5a3bd41b331c 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -2,7 +2,6 @@
 /*
  * Copyright (c) 2016 HGST, a Western Digital Company.
  */
-#include 
 #include 
 #include 
 #include 
diff --git a/drivers/nvme/target/io-cmd-bdev.c 
b/drivers/nvme/target/io-cmd-bdev.c
index 27a72504d31c..16a8b7665fe4 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -6,7 +6,6 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include 
 #include 
-#include 
 #include 
 #include "nvmet.h"
 
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 8af304f6b504..0f22f6f42e7d 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -2,70 +2,14 @@
 #ifndef _LINUX_MEMREMAP_H_
 #define _LINUX_MEMREMAP_H_
 
-#include 
 #include 
 #include 
 #include 
+#include 
 
 struct resource;
 struct device;
 
-/**
- * struct vmem_altmap - pre-allocated storage for vmemmap_populate
- * @base_pfn: base of the entire dev_pagemap mapping
- * @reserve: pages mapped, but reserved for driver use (relative to @base)
- * @free: free pages set aside in the mapping for memmap storage
- * @align: pages reserved to meet allocation alignments
- * @alloc: track pages consumed, private to vmemmap_populate()
- */
-struct vmem_altmap {
-   unsigned long base_pfn;
-   const unsigned long end_pfn;
-   const unsigned long reserve;
-   unsigned long free;
-   unsigned long align;
-   unsigned long alloc;
-};
-
-/*
- * Specialize ZONE_DEVICE memory into multiple types each has a different
- * usage.
- *
- * MEMORY_DEVICE_PRIVATE:
- * Device memory that is not directly addressable by the CPU: CPU can neither
- * read nor write private memory. In this case, we do still have struct pages
- * backing the device memory. Doing so simplifies the implementation, but it is
- * important to remember that there are certain points at which the struct page
- * must be treated as an opaque object, rather than a "normal" struct page.
- *
- * A more complete discussion of unaddressable memory may be found in
- * include/linux/hmm.h and Documentation/vm/hmm.rst.
- *
- * MEMORY_DEVICE_FS_DAX:
- * Host memory that has similar access semantics as System RAM i.e. DMA
- * coherent and supports page pinning. In support of coordinating page
- * pinning vs other operations MEMORY_DEVICE_FS_DAX arranges for a
- * wakeup event whenever a page is unpinned and becomes idle. This
- * wakeup is used to coordinate physical address space management (ex:
- * fs truncate/hole punch) vs pinned pages (ex: device dma).
- *
- * MEMORY_DEVICE_GENERIC:
- * Host memory that has similar access semantics as System RAM i.e. DMA
- * coherent and supports page pinning. This is for example used by DAX devices
- * that expose memory using a character device.
- *
- * MEMORY_DEVICE_PCI_P2PDMA:
- * Device memory residing in a PCI BAR intended for use with Peer-to-Peer
- * transactions.
- */
-enum memory_type {
-   /* 0 is reserved to catch uninitialized type fields */
-   MEMORY_DEVICE_PRIVATE = 1,
-   MEMORY_DEVICE_FS_DAX,
-   MEMORY_DEVICE_GENERIC,
-   MEMORY_DEVICE_PCI_P2PDMA,
-};
-
 struct dev_pagemap_ops {
/*
 * Called once the page refcount reaches 0.  The reference count will be
@@ -83,42 +27,6 @@ struct dev_pagemap_ops {
 
 #define PGMAP_ALTMAP_VALID (1 << 0)
 
-/**
- * struct dev_pagemap - metadata for ZONE_DEVICE mappings
- * @altmap: pre-allocated/reserved memory for vmemmap allocations
- * @ref: reference count that pins the devm_memremap_pages() mapping
- * @done: completion for @ref
- * @type: memory type: see MEMORY_* in memory_hotplug.h
- * @flags: PGMAP_* flags to specify detailed behavior
- * @vmemmap_shift: structural definition of how the vmemmap page metadata
- *  is populated, specifically the metadata page order.
- * A zero value (default) uses base pages as the vmemmap metadata
- * representation. A bigger value will set up compound struct pages
- * of the requested order value.
- * @ops: method table
- * @owner: an opaque pointer identifying the entity that manages this
- * instance.  Used by various helpers to make sure that no
- * foreign ZONE_DEVICE memory is accessed.
- * @nr_range: number of r

[PATCH v8 03/15] mm: add zone device coherent type memory support

2022-07-07 Thread Alex Sierra
Device memory that is cache coherent from device and CPU point of view.
This is used on platforms that have an advanced system bus (like CAPI
or CXL). Any page of a process can be migrated to such memory. However,
no one should be allowed to pin such memory so that it can always be
evicted.

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
[hch: rebased on top of the refcount changes,
  removed is_dev_private_or_coherent_page]
Signed-off-by: Christoph Hellwig 
Acked-by: David Hildenbrand 
---
 include/linux/mm.h|  4 +++-
 include/linux/page_zone.h | 19 +++
 mm/memcontrol.c   |  7 ---
 mm/memory-failure.c   |  8 ++--
 mm/memremap.c | 10 ++
 mm/migrate_device.c   | 16 +++-
 mm/rmap.c |  5 +++--
 7 files changed, 52 insertions(+), 17 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e551616cd208..83902102ffe6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1523,7 +1523,9 @@ static inline bool is_longterm_pinnable_page(struct page 
*page)
if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE)
return false;
 #endif
-   return !is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page));
+   return !(is_device_coherent_page(page) ||
+is_zone_movable_page(page) ||
+is_zero_pfn(page_to_pfn(page)));
 }
 #else
 static inline bool is_longterm_pinnable_page(struct page *page)
diff --git a/include/linux/page_zone.h b/include/linux/page_zone.h
index 2a7a347173ee..a5d90ef13394 100644
--- a/include/linux/page_zone.h
+++ b/include/linux/page_zone.h
@@ -59,6 +59,13 @@
  * A more complete discussion of unaddressable memory may be found in
  * include/linux/hmm.h and Documentation/vm/hmm.rst.
  *
+ * MEMORY_DEVICE_COHERENT:
+ * Device memory that is cache coherent from device and CPU point of view. This
+ * is used on platforms that have an advanced system bus (like CAPI or CXL). A
+ * driver can hotplug the device memory using ZONE_DEVICE and with that memory
+ * type. Any page of a process can be migrated to such memory. However no one
+ * should be allowed to pin such memory so that it can always be evicted.
+ *
  * MEMORY_DEVICE_FS_DAX:
  * Host memory that has similar access semantics as System RAM i.e. DMA
  * coherent and supports page pinning. In support of coordinating page
@@ -79,6 +86,7 @@
 enum memory_type {
/* 0 is reserved to catch uninitialized type fields */
MEMORY_DEVICE_PRIVATE = 1,
+   MEMORY_DEVICE_COHERENT,
MEMORY_DEVICE_FS_DAX,
MEMORY_DEVICE_GENERIC,
MEMORY_DEVICE_PCI_P2PDMA,
@@ -191,4 +199,15 @@ static inline bool is_pci_p2pdma_page(const struct page 
*page)
page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
 }
 
+static inline bool is_device_coherent_page(const struct page *page)
+{
+   return is_zone_device_page(page) &&
+   page->pgmap->type == MEMORY_DEVICE_COHERENT;
+}
+
+static inline bool folio_is_device_coherent(const struct folio *folio)
+{
+   return is_device_coherent_page(>page);
+}
+
 #endif /* _PAGE_ZONE_H_ */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a2df2f193f06..a9853f2b0d9b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5664,8 +5664,8 @@ static int mem_cgroup_move_account(struct page *page,
  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
  * target for charge migration. if @target is not NULL, the entry is stored
  * in target->ent.
- *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is MEMORY_DEVICE_PRIVATE
- * (so ZONE_DEVICE page and thus not on the lru).
+ *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is device memory and
+ *   thus not on the lru.
 * For now such a page is charged like a regular page would be, as for all
  * intent and purposes it is just special memory taking the place of a
  * regular page.
@@ -5703,7 +5703,8 @@ static enum mc_target_type get_mctgt_type(struct 
vm_area_struct *vma,
 */
if (page_memcg(page) == mc.from) {
ret = MC_TARGET_PAGE;
-   if (is_device_private_page(page))
+   if (is_device_private_page(page) ||
+   is_device_coherent_page(page))
ret = MC_TARGET_DEVICE;
if (target)
target->page = page;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index da39ec8afca8..79f175eeb190 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1685,12 +1685,16 @@ static int memory_failure_dev_pagemap(unsigned long 
pfn, int flags,
goto unlock;
}
 
-   if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+   switch (pgmap->type) {
+   case MEMORY_DEVICE_PRIVATE:

[PATCH v8 00/15] Add MEMORY_DEVICE_COHERENT for coherent device memory mapping

2022-07-07 Thread Alex Sierra
This is our MEMORY_DEVICE_COHERENT patch series rebased and updated
for current 5.19.0-rc5

Changes since the last version:
- Fixed problems with migration during long-term pinning in
get_user_pages
- Open coded vm_normal_lru_pages as suggested in previous code review
- Update hmm_gup_test with more get_user_pages calls, include
hmm_cow_in_device in hmm-test.

This patch series introduces MEMORY_DEVICE_COHERENT, a type of memory
owned by a device that can be mapped into CPU page tables like
MEMORY_DEVICE_GENERIC and can also be migrated like
MEMORY_DEVICE_PRIVATE.

This patch series is mostly self-contained except for a few places where
it needs to update other subsystems to handle the new memory type.

System stability and performance are not affected according to our
ongoing testing, including xfstests.

How it works: The system BIOS advertises the GPU device memory
(aka VRAM) as SPM (special purpose memory) in the UEFI system address
map.

The amdgpu driver registers the memory with devmap as
MEMORY_DEVICE_COHERENT using devm_memremap_pages. The initial user for
this hardware page migration capability is the Frontier supercomputer
project. This functionality is not AMD-specific. We expect other GPU
vendors to find this functionality useful, and possibly other hardware
types in the future.
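
In driver terms that registration step is small; a minimal sketch
(names prefixed mydev_ are illustrative, the devmap calls are the
standard API):

  static int mydev_register_coherent_mem(struct device *dev,
  				       struct dev_pagemap *pgmap,
  				       u64 base, u64 size)
  {
  	void *r;

  	pgmap->type = MEMORY_DEVICE_COHERENT;	/* CPU-coherent device memory */
  	pgmap->range.start = base;		/* SPM range from the UEFI map */
  	pgmap->range.end = base + size - 1;
  	pgmap->nr_range = 1;
  	pgmap->ops = &mydev_pgmap_ops;		/* must provide page_free() */
  	pgmap->owner = pgmap;			/* any stable owner cookie */

  	r = devm_memremap_pages(dev, pgmap);
  	return IS_ERR(r) ? PTR_ERR(r) : 0;
  }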

Our test nodes in the lab are similar to the Frontier configuration,
with .5 TB of system memory plus 256 GB of device memory split across
4 GPUs, all in a single coherent address space. Page migration is
expected to improve application efficiency significantly. We will
report empirical results as they become available.

Coherent device type pages at gup are now migrated back to system
memory if they are being pinned long-term (FOLL_LONGTERM). The reason
is that long-term pinning would interfere with the device memory
manager owning the device-coherent pages (e.g. evictions in TTM).
This series incorporates Alistair Popple's patches to do this
migration from pin_user_pages() calls. hmm_gup_test has been added to
hmm-test to test different get user pages calls.

This series includes handling of device-managed anonymous pages
returned by vm_normal_pages. Although they behave like normal pages
for purposes of mapping in CPU page tables and for COW, they do not
support LRU lists, NUMA migration or THP.

Earlier revisions also introduced a FOLL_LRU flag that added the same
behaviour to follow_page and related APIs, letting callers specify that
they expect to put pages on an LRU list; as of v7 that flag was dropped
in favor of explicit checks in each caller (see the changelog below).

v2:
- Rebase to latest 5.18-rc7.
- Drop patch "mm: add device coherent checker to remove migration pte"
and modify try_to_migrate_one, to let DEVICE_COHERENT pages fall
through to normal page path. Based on Alistair Popple's comment.
- Fix comment formatting.
- Reword comment in vm_normal_page about pte_devmap().
- Merge "drm/amdkfd: coherent type as sys mem on migration to ram" to
"drm/amdkfd: add SPM support for SVM".

v3:
- Rebase to latest 5.18.0.
- Patch "mm: handling Non-LRU pages returned by vm_normal_pages"
reordered.
- Add WARN_ON_ONCE for thp device coherent case.

v4:
- Rebase to latest 5.18.0
- Fix consistency between pages with the FOLL_LRU flag set and pte_devmap
at follow_page_pte.

v5:
- Remove unused zone_device_type from lib/test_hmm and
selftest/vm/hmm-test.c.

v6:
- Rebase to 5.19.0-rc4
- Rename is_pinnable_page to is_longterm_pinnable_page and add a
coherent device checker.
- Add a new gup test to hmm-test to cover fast pinnable case with
FOLL_LONGTERM.

v7:
- Reorder patch series.
- Remove FOLL_LRU and check on each caller for LRU pages handling
instead.

v8:
- Add "mm: move page zone helpers into new header-specific file"
patch. The intention is to centralize all page zone helpers and keep
them independent from mm.h and memremap.h.

Alex Sierra (13):
  mm: rename is_pinnable_pages to is_longterm_pinnable_pages
  mm: move page zone helpers into new header-specific file
  mm: add zone device coherent type memory support
  mm: handling Non-LRU pages returned by vm_normal_pages
  mm: add device coherent vma selection for memory migration
  drm/amdkfd: add SPM support for SVM
  lib: test_hmm add ioctl to get zone device type
  lib: test_hmm add module param for zone device type
  lib: add support for device coherent type in test_hmm
  tools: update hmm-test to support device coherent type
  tools: update test_hmm script to support SP config
  tools: add hmm gup tests for device coherent type
  tools: add selftests to hmm for COW in device memory

Alistair Popple (2):
  mm: remove the vma check in migrate_vma_setup()
  mm/gup: migrate device coherent pages when pinning instead of failing

 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |  34 ++-
 drivers/infiniband/core/rw.c |   1 -
 drivers/nvme/target/io-cmd-bdev.c|   1 -
 fs/proc/task_mmu.c   |   2 +-
 include/linux/memremap.h | 113 +---
 include/linux/migrate.h  |   1

[PATCH v7 14/14] tools: add selftests to hmm for COW in device memory

2022-06-28 Thread Alex Sierra
The objective is to test the device migration mechanism on pages marked
for COW, for both private and coherent device types. In case of writing to
COW private page(s), a page fault will migrate pages back to system
memory first. Then, these pages will be duplicated. In case of COW
device coherent type, pages are duplicated directly from device
memory.

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
---
 tools/testing/selftests/vm/hmm-tests.c | 80 ++
 1 file changed, 80 insertions(+)

diff --git a/tools/testing/selftests/vm/hmm-tests.c 
b/tools/testing/selftests/vm/hmm-tests.c
index bb38b9777610..716b62c05e3d 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -1874,4 +1874,84 @@ TEST_F(hmm, hmm_gup_test)
close(gup_fd);
hmm_buffer_free(buffer);
 }
+
+/*
+ * Test copy-on-write in device pages.
+ * In case of writing to COW private page(s), a page fault will migrate pages
+ * back to system memory first. Then, these pages will be duplicated. In case
+ * of COW device coherent type, pages are duplicated directly from device
+ * memory.
+ */
+TEST_F(hmm, hmm_cow_in_device)
+{
+   struct hmm_buffer *buffer;
+   unsigned long npages;
+   unsigned long size;
+   unsigned long i;
+   int *ptr;
+   int ret;
+   unsigned char *m;
+   pid_t pid;
+   int status;
+
+   npages = 4;
+   size = npages << self->page_shift;
+
+   buffer = malloc(sizeof(*buffer));
+   ASSERT_NE(buffer, NULL);
+
+   buffer->fd = -1;
+   buffer->size = size;
+   buffer->mirror = malloc(size);
+   ASSERT_NE(buffer->mirror, NULL);
+
+   buffer->ptr = mmap(NULL, size,
+  PROT_READ | PROT_WRITE,
+  MAP_PRIVATE | MAP_ANONYMOUS,
+  buffer->fd, 0);
+   ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+   /* Initialize buffer in system memory. */
+   for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+   ptr[i] = i;
+
+   /* Migrate memory to device. */
+
+   ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+   ASSERT_EQ(ret, 0);
+   ASSERT_EQ(buffer->cpages, npages);
+
+   pid = fork();
+   if (pid == -1)
+   ASSERT_EQ(pid, 0);
+   if (!pid) {
+   /* Child process waits for SIGTERM from the parent. */
+   while (1) {
+   }
+   perror("Should not reach this\n");
+   exit(0);
+   }
+   /* Parent process writes to COW page(s) and gets a
+* new copy in system. In case of device private pages,
+* this write causes a migration to system mem first.
+*/
+   for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+   ptr[i] = i;
+
+   /* Terminate child and wait */
+   EXPECT_EQ(0, kill(pid, SIGTERM));
+   EXPECT_EQ(pid, waitpid(pid, &status, 0));
+   EXPECT_NE(0, WIFSIGNALED(status));
+   EXPECT_EQ(SIGTERM, WTERMSIG(status));
+
+   /* Take snapshot to CPU pagetables */
+   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+   ASSERT_EQ(ret, 0);
+   ASSERT_EQ(buffer->cpages, npages);
+   m = buffer->mirror;
+   for (i = 0; i < npages; i++)
+   ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[i]);
+
+   hmm_buffer_free(buffer);
+}
 TEST_HARNESS_MAIN
-- 
2.32.0



[PATCH v7 13/14] tools: add hmm gup tests for device coherent type

2022-06-28 Thread Alex Sierra
The intention is to test the hmm device coherent type under different
get_user_pages paths. Also, test gup with the FOLL_LONGTERM flag set on
device coherent pages. These pages should get migrated back to system
memory.

Signed-off-by: Alex Sierra 
Reviewed-by: Alistair Popple 
---
 tools/testing/selftests/vm/hmm-tests.c | 110 +
 1 file changed, 110 insertions(+)

diff --git a/tools/testing/selftests/vm/hmm-tests.c 
b/tools/testing/selftests/vm/hmm-tests.c
index 4b547188ec40..bb38b9777610 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -36,6 +36,7 @@
  * in the usual include/uapi/... directory.
  */
 #include "../../../../lib/test_hmm_uapi.h"
+#include "../../../../mm/gup_test.h"
 
 struct hmm_buffer {
	void		*ptr;
@@ -59,6 +60,9 @@ enum {
 #define NTIMES 10
 
 #define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1)))
+/* Just the flags we need, copied from mm.h: */
+#define FOLL_WRITE 0x01/* check pte is writable */
+#define FOLL_LONGTERM   0x10000 /* mapping lifetime is indefinite */
 
 FIXTURE(hmm)
 {
@@ -1764,4 +1768,110 @@ TEST_F(hmm, exclusive_cow)
hmm_buffer_free(buffer);
 }
 
+static int gup_test_exec(int gup_fd, unsigned long addr, int cmd,
+int npages, int size, int flags)
+{
+   struct gup_test gup = {
+   .nr_pages_per_call  = npages,
+   .addr   = addr,
+   .gup_flags  = FOLL_WRITE | flags,
+   .size   = size,
+   };
+
+   if (ioctl(gup_fd, cmd, &gup)) {
+   perror("ioctl on error\n");
+   return errno;
+   }
+
+   return 0;
+}
+
+/*
+ * Test get user device pages through gup_test. Setting PIN_LONGTERM flag.
+ * This should trigger a migration back to system memory for both, private
+ * and coherent type pages.
+ * This test makes use of gup_test module. Make sure GUP_TEST_CONFIG is added
+ * to your configuration before you run it.
+ */
+TEST_F(hmm, hmm_gup_test)
+{
+   struct hmm_buffer *buffer;
+   int gup_fd;
+   unsigned long npages;
+   unsigned long size;
+   unsigned long i;
+   int *ptr;
+   int ret;
+   unsigned char *m;
+
+   gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
+   if (gup_fd == -1)
+   SKIP(return, "Skipping test, could not find gup_test driver");
+
+   npages = 4;
+   size = npages << self->page_shift;
+
+   buffer = malloc(sizeof(*buffer));
+   ASSERT_NE(buffer, NULL);
+
+   buffer->fd = -1;
+   buffer->size = size;
+   buffer->mirror = malloc(size);
+   ASSERT_NE(buffer->mirror, NULL);
+
+   buffer->ptr = mmap(NULL, size,
+  PROT_READ | PROT_WRITE,
+  MAP_PRIVATE | MAP_ANONYMOUS,
+  buffer->fd, 0);
+   ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+   /* Initialize buffer in system memory. */
+   for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+   ptr[i] = i;
+
+   /* Migrate memory to device. */
+   ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+   ASSERT_EQ(ret, 0);
+   ASSERT_EQ(buffer->cpages, npages);
+   /* Check what the device read. */
+   for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+   ASSERT_EQ(ptr[i], i);
+
+   ASSERT_EQ(gup_test_exec(gup_fd,
+   (unsigned long)buffer->ptr,
+   GUP_BASIC_TEST, 1, self->page_size, 0), 0);
+   ASSERT_EQ(gup_test_exec(gup_fd,
+   (unsigned long)buffer->ptr + 1 * 
self->page_size,
+   GUP_FAST_BENCHMARK, 1, self->page_size, 0), 0);
+   ASSERT_EQ(gup_test_exec(gup_fd,
+   (unsigned long)buffer->ptr + 2 * 
self->page_size,
+   PIN_FAST_BENCHMARK, 1, self->page_size, 
FOLL_LONGTERM), 0);
+   ASSERT_EQ(gup_test_exec(gup_fd,
+   (unsigned long)buffer->ptr + 3 * 
self->page_size,
+   PIN_LONGTERM_BENCHMARK, 1, self->page_size, 0), 
0);
+
+   /* Take snapshot to CPU pagetables */
+   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+   ASSERT_EQ(ret, 0);
+   ASSERT_EQ(buffer->cpages, npages);
+   m = buffer->mirror;
+   if (hmm_is_coherent_type(variant->device_number)) {
+   ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | 
HMM_DMIRROR_PROT_WRITE, m[0]);
+   ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | 
HMM_DMIRROR_PROT_WRITE, m[1]);
+   } else {
+   ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[0]);
+   ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[1]);
+ 

[PATCH v7 11/14] tools: update hmm-test to support device coherent type

2022-06-28 Thread Alex Sierra
Test cases such as migrate_fault and migrate_multiple were modified to
explicitly migrate from device to system memory without the need for
page faults, when using the device coherent type.

The snapshot test case was updated to read the memory device type first
and, based on that, check the proper returned results. A
migrate_ping_pong test case was added to test explicit migration from
device to system memory for both private and coherent zone types.

Helpers to migrate from device to system memory and vice versa
were also added.

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
---
 tools/testing/selftests/vm/hmm-tests.c | 121 -
 1 file changed, 100 insertions(+), 21 deletions(-)

diff --git a/tools/testing/selftests/vm/hmm-tests.c 
b/tools/testing/selftests/vm/hmm-tests.c
index 203323967b50..4b547188ec40 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -46,6 +46,13 @@ struct hmm_buffer {
	uint64_t	faults;
 };
 
+enum {
+   HMM_PRIVATE_DEVICE_ONE,
+   HMM_PRIVATE_DEVICE_TWO,
+   HMM_COHERENCE_DEVICE_ONE,
+   HMM_COHERENCE_DEVICE_TWO,
+};
+
 #define TWOMEG (1 << 21)
 #define HMM_BUFFER_SIZE (1024 << 12)
#define HMM_PATH_MAX	64
@@ -60,6 +67,21 @@ FIXTURE(hmm)
	unsigned int	page_shift;
 };
 
+FIXTURE_VARIANT(hmm)
+{
+   int device_number;
+};
+
+FIXTURE_VARIANT_ADD(hmm, hmm_device_private)
+{
+   .device_number = HMM_PRIVATE_DEVICE_ONE,
+};
+
+FIXTURE_VARIANT_ADD(hmm, hmm_device_coherent)
+{
+   .device_number = HMM_COHERENCE_DEVICE_ONE,
+};
+
 FIXTURE(hmm2)
 {
int fd0;
@@ -68,6 +90,24 @@ FIXTURE(hmm2)
	unsigned int	page_shift;
 };
 
+FIXTURE_VARIANT(hmm2)
+{
+   int device_number0;
+   int device_number1;
+};
+
+FIXTURE_VARIANT_ADD(hmm2, hmm2_device_private)
+{
+   .device_number0 = HMM_PRIVATE_DEVICE_ONE,
+   .device_number1 = HMM_PRIVATE_DEVICE_TWO,
+};
+
+FIXTURE_VARIANT_ADD(hmm2, hmm2_device_coherent)
+{
+   .device_number0 = HMM_COHERENCE_DEVICE_ONE,
+   .device_number1 = HMM_COHERENCE_DEVICE_TWO,
+};
+
 static int hmm_open(int unit)
 {
char pathname[HMM_PATH_MAX];
@@ -81,12 +121,19 @@ static int hmm_open(int unit)
return fd;
 }
 
+static bool hmm_is_coherent_type(int dev_num)
+{
+   return (dev_num >= HMM_COHERENCE_DEVICE_ONE);
+}
+
 FIXTURE_SETUP(hmm)
 {
self->page_size = sysconf(_SC_PAGE_SIZE);
self->page_shift = ffs(self->page_size) - 1;
 
-   self->fd = hmm_open(0);
+   self->fd = hmm_open(variant->device_number);
+   if (self->fd < 0 && hmm_is_coherent_type(variant->device_number))
+   SKIP(exit(0), "DEVICE_COHERENT not available");
ASSERT_GE(self->fd, 0);
 }
 
@@ -95,9 +142,11 @@ FIXTURE_SETUP(hmm2)
self->page_size = sysconf(_SC_PAGE_SIZE);
self->page_shift = ffs(self->page_size) - 1;
 
-   self->fd0 = hmm_open(0);
+   self->fd0 = hmm_open(variant->device_number0);
+   if (self->fd0 < 0 && hmm_is_coherent_type(variant->device_number0))
+   SKIP(exit(0), "DEVICE_COHERENT not available");
ASSERT_GE(self->fd0, 0);
-   self->fd1 = hmm_open(1);
+   self->fd1 = hmm_open(variant->device_number1);
ASSERT_GE(self->fd1, 0);
 }
 
@@ -211,6 +260,20 @@ static void hmm_nanosleep(unsigned int n)
	nanosleep(&t, NULL);
 }
 
+static int hmm_migrate_sys_to_dev(int fd,
+  struct hmm_buffer *buffer,
+  unsigned long npages)
+{
+   return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_DEV, buffer, npages);
+}
+
+static int hmm_migrate_dev_to_sys(int fd,
+  struct hmm_buffer *buffer,
+  unsigned long npages)
+{
+   return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_SYS, buffer, npages);
+}
+
 /*
  * Simple NULL test of device open/close.
  */
@@ -875,7 +938,7 @@ TEST_F(hmm, migrate)
ptr[i] = i;
 
/* Migrate memory to device. */
-   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);
+   ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
ASSERT_EQ(ret, 0);
ASSERT_EQ(buffer->cpages, npages);
 
@@ -923,7 +986,7 @@ TEST_F(hmm, migrate_fault)
ptr[i] = i;
 
/* Migrate memory to device. */
-   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);
+   ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
ASSERT_EQ(ret, 0);
ASSERT_EQ(buffer->cpages, npages);
 
@@ -936,7 +999,7 @@ TEST_F(hmm, migrate_fault)
ASSERT_EQ(ptr[i], i);
 
/* Migrate memory to the device again. */
-   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRA

[PATCH v7 05/14] mm: remove the vma check in migrate_vma_setup()

2022-06-28 Thread Alex Sierra
From: Alistair Popple 

migrate_vma_setup() checks that a valid vma is passed so that the page
tables can be walked to find the pfns associated with a given address
range. However in some cases the pfns are already known, such as when
migrating device coherent pages during pin_user_pages(), meaning a valid
vma isn't required.

Signed-off-by: Alistair Popple 
Acked-by: Felix Kuehling 
Signed-off-by: Christoph Hellwig 
---
 mm/migrate_device.c | 34 +-
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 18bc6483f63a..cf9668376c5a 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -486,24 +486,24 @@ int migrate_vma_setup(struct migrate_vma *args)
 
args->start &= PAGE_MASK;
args->end &= PAGE_MASK;
-   if (!args->vma || is_vm_hugetlb_page(args->vma) ||
-   (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
-   return -EINVAL;
-   if (nr_pages <= 0)
-   return -EINVAL;
-   if (args->start < args->vma->vm_start ||
-   args->start >= args->vma->vm_end)
-   return -EINVAL;
-   if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
-   return -EINVAL;
if (!args->src || !args->dst)
return -EINVAL;
-
-   memset(args->src, 0, sizeof(*args->src) * nr_pages);
-   args->cpages = 0;
-   args->npages = 0;
-
-   migrate_vma_collect(args);
+   if (args->vma) {
+   if (is_vm_hugetlb_page(args->vma) ||
+   (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
+   return -EINVAL;
+   if (args->start < args->vma->vm_start ||
+   args->start >= args->vma->vm_end)
+   return -EINVAL;
+   if (args->end <= args->vma->vm_start ||
+   args->end > args->vma->vm_end)
+   return -EINVAL;
+   memset(args->src, 0, sizeof(*args->src) * nr_pages);
+   args->cpages = 0;
+   args->npages = 0;
+
+   migrate_vma_collect(args);
+   }
 
if (args->cpages)
migrate_vma_unmap(args);
@@ -685,7 +685,7 @@ void migrate_vma_pages(struct migrate_vma *migrate)
continue;
}
 
-   if (!page) {
+   if (!page && migrate->vma) {
if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
continue;
if (!notified) {
-- 
2.32.0



[PATCH v7 09/14] lib: test_hmm add module param for zone device type

2022-06-28 Thread Alex Sierra
In order to configure the device coherent type in test_hmm, two module
parameters must be passed, corresponding to the SP start address of each
of the two devices: spm_addr_dev0 & spm_addr_dev1. If no parameters are
passed, the private device type is configured.
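
For example, a hypothetical invocation with two reserved SP ranges (the
addresses are illustrative, matching the example given in patch 10/14 of
this series):

	modprobe test_hmm spm_addr_dev0=0x100000000 spm_addr_dev1=0x140000000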

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
---
 lib/test_hmm.c  | 73 -
 lib/test_hmm_uapi.h |  1 +
 2 files changed, 53 insertions(+), 21 deletions(-)

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 915ef6b5b0d4..afb30af9f3ff 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -37,6 +37,16 @@
 #define DEVMEM_CHUNK_SIZE  (256 * 1024 * 1024U)
 #define DEVMEM_CHUNKS_RESERVE  16
 
+static unsigned long spm_addr_dev0;
+module_param(spm_addr_dev0, long, 0644);
+MODULE_PARM_DESC(spm_addr_dev0,
+   "Specify start address for SPM (special purpose memory) used 
for device 0. By setting this Coherent device type will be used. Make sure 
spm_addr_dev1 is set too. Minimum SPM size should be DEVMEM_CHUNK_SIZE.");
+
+static unsigned long spm_addr_dev1;
+module_param(spm_addr_dev1, long, 0644);
+MODULE_PARM_DESC(spm_addr_dev1,
+   "Specify start address for SPM (special purpose memory) used 
for device 1. By setting this Coherent device type will be used. Make sure 
spm_addr_dev0 is set too. Minimum SPM size should be DEVMEM_CHUNK_SIZE.");
+
 static const struct dev_pagemap_ops dmirror_devmem_ops;
 static const struct mmu_interval_notifier_ops dmirror_min_ops;
 static dev_t dmirror_dev;
@@ -455,28 +465,44 @@ static int dmirror_write(struct dmirror *dmirror, struct 
hmm_dmirror_cmd *cmd)
return ret;
 }
 
-static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
+static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
   struct page **ppage)
 {
struct dmirror_chunk *devmem;
-   struct resource *res;
+   struct resource *res = NULL;
unsigned long pfn;
unsigned long pfn_first;
unsigned long pfn_last;
void *ptr;
+   int ret = -ENOMEM;
 
devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
if (!devmem)
-   return false;
+   return ret;
 
-   res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
- "hmm_dmirror");
-   if (IS_ERR(res))
+   switch (mdevice->zone_device_type) {
+   case HMM_DMIRROR_MEMORY_DEVICE_PRIVATE:
+   res = request_free_mem_region(&iomem_resource, 
DEVMEM_CHUNK_SIZE,
+ "hmm_dmirror");
+   if (IS_ERR_OR_NULL(res))
+   goto err_devmem;
+   devmem->pagemap.range.start = res->start;
+   devmem->pagemap.range.end = res->end;
+   devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
+   break;
+   case HMM_DMIRROR_MEMORY_DEVICE_COHERENT:
+   devmem->pagemap.range.start = (MINOR(mdevice->cdevice.dev) - 2) 
?
+   spm_addr_dev0 :
+   spm_addr_dev1;
+   devmem->pagemap.range.end = devmem->pagemap.range.start +
+   DEVMEM_CHUNK_SIZE - 1;
+   devmem->pagemap.type = MEMORY_DEVICE_COHERENT;
+   break;
+   default:
+   ret = -EINVAL;
goto err_devmem;
+   }
 
-   devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
-   devmem->pagemap.range.start = res->start;
-   devmem->pagemap.range.end = res->end;
devmem->pagemap.nr_range = 1;
	devmem->pagemap.ops = &dmirror_devmem_ops;
devmem->pagemap.owner = mdevice;
@@ -497,10 +523,14 @@ static bool dmirror_allocate_chunk(struct dmirror_device 
*mdevice,
mdevice->devmem_capacity = new_capacity;
mdevice->devmem_chunks = new_chunks;
}
-
	ptr = memremap_pages(&devmem->pagemap, numa_node_id());
-   if (IS_ERR(ptr))
+   if (IS_ERR_OR_NULL(ptr)) {
+   if (ptr)
+   ret = PTR_ERR(ptr);
+   else
+   ret = -EFAULT;
goto err_release;
+   }
 
devmem->mdevice = mdevice;
pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT;
@@ -529,15 +559,17 @@ static bool dmirror_allocate_chunk(struct dmirror_device 
*mdevice,
}
	spin_unlock(&mdevice->lock);
 
-   return true;
+   return 0;
 
 err_release:
	mutex_unlock(&mdevice->devmem_lock);
-   release_mem_region(devmem->pagemap.range.start, 
range_len(>pagemap.range));
+   if (res && devmem->pagemap.type == MEMORY_DEVICE_PRIVATE)
+   release_mem_region(devme

[PATCH v7 12/14] tools: update test_hmm script to support SP config

2022-06-28 Thread Alex Sierra
Add two more parameters to set the spm_addr_dev0 & spm_addr_dev1
addresses. These two parameters configure the start SP
addresses for each device in the test_hmm driver.
Consequently, this configures the zone device type as coherent.
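
A smoke run with SP configured would then look like this (addresses are
illustrative):

	./test_hmm.sh smoke 0x100000000 0x140000000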

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
---
 tools/testing/selftests/vm/test_hmm.sh | 24 +---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/vm/test_hmm.sh 
b/tools/testing/selftests/vm/test_hmm.sh
index 0647b525a625..539c9371e592 100755
--- a/tools/testing/selftests/vm/test_hmm.sh
+++ b/tools/testing/selftests/vm/test_hmm.sh
@@ -40,11 +40,26 @@ check_test_requirements()
 
 load_driver()
 {
-   modprobe $DRIVER > /dev/null 2>&1
+   if [ $# -eq 0 ]; then
+   modprobe $DRIVER > /dev/null 2>&1
+   else
+   if [ $# -eq 2 ]; then
+   modprobe $DRIVER spm_addr_dev0=$1 spm_addr_dev1=$2
+   > /dev/null 2>&1
+   else
+   echo "Missing module parameters. Make sure pass"\
+   "spm_addr_dev0 and spm_addr_dev1"
+   usage
+   fi
+   fi
if [ $? == 0 ]; then
major=$(awk "\$2==\"HMM_DMIRROR\" {print \$1}" /proc/devices)
mknod /dev/hmm_dmirror0 c $major 0
mknod /dev/hmm_dmirror1 c $major 1
+   if [ $# -eq 2 ]; then
+   mknod /dev/hmm_dmirror2 c $major 2
+   mknod /dev/hmm_dmirror3 c $major 3
+   fi
fi
 }
 
@@ -58,7 +73,7 @@ run_smoke()
 {
echo "Running smoke test. Note, this test provides basic coverage."
 
-   load_driver
+   load_driver $1 $2
$(dirname "${BASH_SOURCE[0]}")/hmm-tests
unload_driver
 }
@@ -75,6 +90,9 @@ usage()
echo "# Smoke testing"
echo "./${TEST_NAME}.sh smoke"
echo
+   echo "# Smoke testing with SPM enabled"
+   echo "./${TEST_NAME}.sh smoke <spm_addr_dev0> <spm_addr_dev1>"
+   echo
exit 0
 }
 
@@ -84,7 +102,7 @@ function run_test()
usage
else
if [ "$1" = "smoke" ]; then
-   run_smoke
+   run_smoke $2 $3
else
usage
fi
-- 
2.32.0



[PATCH v7 08/14] lib: test_hmm add ioctl to get zone device type

2022-06-28 Thread Alex Sierra
A new ioctl cmd is added to query the zone device type. This will be
used once test_hmm adds the zone device coherent type.

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
---
 lib/test_hmm.c  | 11 +--
 lib/test_hmm_uapi.h | 14 ++
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index cfe632047839..915ef6b5b0d4 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -87,6 +87,7 @@ struct dmirror_chunk {
 struct dmirror_device {
struct cdev cdevice;
struct hmm_devmem   *devmem;
+   unsigned int	zone_device_type;
 
	unsigned int	devmem_capacity;
	unsigned int	devmem_count;
@@ -1260,14 +1261,20 @@ static void dmirror_device_remove(struct dmirror_device 
*mdevice)
 static int __init hmm_dmirror_init(void)
 {
int ret;
-   int id;
+   int id = 0;
+   int ndevices = 0;
 
	ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES,
  "HMM_DMIRROR");
if (ret)
goto err_unreg;
 
-   for (id = 0; id < DMIRROR_NDEVICES; id++) {
+   memset(dmirror_devices, 0, DMIRROR_NDEVICES * 
sizeof(dmirror_devices[0]));
+   dmirror_devices[ndevices++].zone_device_type =
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
+   dmirror_devices[ndevices++].zone_device_type =
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
+   for (id = 0; id < ndevices; id++) {
ret = dmirror_device_init(dmirror_devices + id, id);
if (ret)
goto err_chrdev;
diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h
index f14dea5dcd06..0511af7464ee 100644
--- a/lib/test_hmm_uapi.h
+++ b/lib/test_hmm_uapi.h
@@ -31,10 +31,11 @@ struct hmm_dmirror_cmd {
 /* Expose the address space of the calling process through hmm device file */
 #define HMM_DMIRROR_READ   _IOWR('H', 0x00, struct hmm_dmirror_cmd)
 #define HMM_DMIRROR_WRITE  _IOWR('H', 0x01, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_MIGRATE_IOWR('H', 0x02, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_SNAPSHOT   _IOWR('H', 0x03, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_EXCLUSIVE  _IOWR('H', 0x04, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_CHECK_EXCLUSIVE_IOWR('H', 0x05, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_MIGRATE_TO_DEV _IOWR('H', 0x02, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_MIGRATE_TO_SYS _IOWR('H', 0x03, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_SNAPSHOT   _IOWR('H', 0x04, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_EXCLUSIVE  _IOWR('H', 0x05, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_CHECK_EXCLUSIVE_IOWR('H', 0x06, struct hmm_dmirror_cmd)
 
 /*
  * Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT.
@@ -62,4 +63,9 @@ enum {
HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE = 0x30,
 };
 
+enum {
+   /* 0 is reserved to catch uninitialized type fields */
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE = 1,
+};
+
 #endif /* _LIB_TEST_HMM_UAPI_H */
-- 
2.32.0



[PATCH v7 07/14] drm/amdkfd: add SPM support for SVM

2022-06-28 Thread Alex Sierra
When the CPU is connected through XGMI, it has coherent
access to the VRAM resource. In this case that resource
is taken from a table in the device gmc aperture base.
This resource is used along with the device type, which could
be DEVICE_PRIVATE or DEVICE_COHERENT, to create the device
page map region.
Also, the MIGRATE_VMA_SELECT_DEVICE_COHERENT flag is selected for
the coherent type case during migration to ram (the flag is set in
svm_migrate_vma_to_ram below).

Signed-off-by: Alex Sierra 
Reviewed-by: Felix Kuehling 
Signed-off-by: Christoph Hellwig 
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 34 +++-
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index e44376c2ecdc..f73e3e340413 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -671,13 +671,15 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct 
svm_range *prange,
migrate.vma = vma;
migrate.start = start;
migrate.end = end;
-   migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
migrate.pgmap_owner = SVM_ADEV_PGMAP_OWNER(adev);
+   if (adev->gmc.xgmi.connected_to_cpu)
+   migrate.flags = MIGRATE_VMA_SELECT_DEVICE_COHERENT;
+   else
+   migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
 
buf = kvcalloc(npages,
   2 * sizeof(*migrate.src) + sizeof(uint64_t) + 
sizeof(dma_addr_t),
   GFP_KERNEL);
-
if (!buf)
goto out;
 
@@ -947,7 +949,7 @@ int svm_migrate_init(struct amdgpu_device *adev)
 {
struct kfd_dev *kfddev = adev->kfd.dev;
struct dev_pagemap *pgmap;
-   struct resource *res;
+   struct resource *res = NULL;
unsigned long size;
void *r;
 
@@ -962,28 +964,34 @@ int svm_migrate_init(struct amdgpu_device *adev)
 * should remove reserved size
 */
size = ALIGN(adev->gmc.real_vram_size, 2ULL << 20);
-   res = devm_request_free_mem_region(adev->dev, &iomem_resource, size);
-   if (IS_ERR(res))
-   return -ENOMEM;
+   if (adev->gmc.xgmi.connected_to_cpu) {
+   pgmap->range.start = adev->gmc.aper_base;
+   pgmap->range.end = adev->gmc.aper_base + adev->gmc.aper_size - 
1;
+   pgmap->type = MEMORY_DEVICE_COHERENT;
+   } else {
+   res = devm_request_free_mem_region(adev->dev, &iomem_resource, 
size);
+   if (IS_ERR(res))
+   return -ENOMEM;
+   pgmap->range.start = res->start;
+   pgmap->range.end = res->end;
+   pgmap->type = MEMORY_DEVICE_PRIVATE;
+   }
 
-   pgmap->type = MEMORY_DEVICE_PRIVATE;
pgmap->nr_range = 1;
-   pgmap->range.start = res->start;
-   pgmap->range.end = res->end;
	pgmap->ops = &svm_migrate_pgmap_ops;
pgmap->owner = SVM_ADEV_PGMAP_OWNER(adev);
-   pgmap->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
-
+   pgmap->flags = 0;
/* Device manager releases device-specific resources, memory region and
 * pgmap when driver disconnects from device.
 */
r = devm_memremap_pages(adev->dev, pgmap);
if (IS_ERR(r)) {
pr_err("failed to register HMM device memory\n");
-
/* Disable SVM support capability */
pgmap->type = 0;
-   devm_release_mem_region(adev->dev, res->start, 
resource_size(res));
+   if (pgmap->type == MEMORY_DEVICE_PRIVATE)
+   devm_release_mem_region(adev->dev, res->start,
+   res->end - res->start + 1);
return PTR_ERR(r);
}
 
-- 
2.32.0



[PATCH v7 06/14] mm/gup: migrate device coherent pages when pinning instead of failing

2022-06-28 Thread Alex Sierra
From: Alistair Popple 

Currently any attempt to pin a device coherent page will fail. This is
because device coherent pages need to be managed by a device driver, and
pinning them would prevent a driver from migrating them off the device.

However this is no reason to fail pinning of these pages. These are
coherent and accessible from the CPU, so they can be migrated just like
ZONE_MOVABLE pages. So instead of failing all attempts to pin them,
first try migrating them out of ZONE_DEVICE.

Signed-off-by: Alistair Popple 
Acked-by: Felix Kuehling 
[hch: rebased to the split device memory checks,
  moved migrate_device_page to migrate_device.c]
Signed-off-by: Christoph Hellwig 
---
 mm/gup.c| 47 +++-
 mm/internal.h   |  1 +
 mm/migrate_device.c | 53 +
 3 files changed, 96 insertions(+), 5 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index b65fe8bf5af4..9b6b9923d22d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1891,9 +1891,43 @@ static long check_and_migrate_movable_pages(unsigned 
long nr_pages,
continue;
prev_folio = folio;
 
-   if (folio_is_longterm_pinnable(folio))
+   /*
+* Device private pages will get faulted in during gup so it
+* shouldn't be possible to see one here.
+*/
+   if (WARN_ON_ONCE(folio_is_device_private(folio))) {
+   ret = -EFAULT;
+   goto unpin_pages;
+   }
+
+   /*
+* Device coherent pages are managed by a driver and should not
+* be pinned indefinitely as it prevents the driver moving the
+* page. So when trying to pin with FOLL_LONGTERM instead try
+* to migrate the page out of device memory.
+*/
+   if (folio_is_device_coherent(folio)) {
+   WARN_ON_ONCE(PageCompound(&folio->page));
+
+   /*
+* Migration will fail if the page is pinned, so convert
+* the pin on the source page to a normal reference.
+*/
+   if (gup_flags & FOLL_PIN) {
+   get_page(&folio->page);
+   unpin_user_page(&folio->page);
+   }
+
+   pages[i] = migrate_device_page(&folio->page, gup_flags);
+   if (!pages[i]) {
+   ret = -EBUSY;
+   goto unpin_pages;
+   }
continue;
+   }
 
+   if (folio_is_longterm_pinnable(folio))
+   continue;
/*
 * Try to move out any movable page before pinning the range.
 */
@@ -1929,10 +1963,13 @@ static long check_and_migrate_movable_pages(unsigned 
long nr_pages,
return nr_pages;
 
 unpin_pages:
-   if (gup_flags & FOLL_PIN) {
-   unpin_user_pages(pages, nr_pages);
-   } else {
-   for (i = 0; i < nr_pages; i++)
+   for (i = 0; i < nr_pages; i++) {
+   if (!pages[i])
+   continue;
+
+   if (gup_flags & FOLL_PIN)
+   unpin_user_page(pages[i]);
+   else
put_page(pages[i]);
}
 
diff --git a/mm/internal.h b/mm/internal.h
index c0f8fbe0445b..eeab4ee7a4a3 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -853,6 +853,7 @@ int numa_migrate_prep(struct page *page, struct 
vm_area_struct *vma,
  unsigned long addr, int page_nid, int *flags);
 
 void free_zone_device_page(struct page *page);
+struct page *migrate_device_page(struct page *page, unsigned int gup_flags);
 
 /*
  * mm/gup.c
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index cf9668376c5a..5decd26dd551 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -794,3 +794,56 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
}
 }
 EXPORT_SYMBOL(migrate_vma_finalize);
+
+/*
+ * Migrate a device coherent page back to normal memory.  The caller should 
have
+ * a reference on page which will be copied to the new page if migration is
+ * successful or dropped on failure.
+ */
+struct page *migrate_device_page(struct page *page, unsigned int gup_flags)
+{
+   unsigned long src_pfn, dst_pfn = 0;
+   struct migrate_vma args;
+   struct page *dpage;
+
+   lock_page(page);
+   src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE;
+   args.src = &src_pfn;
+   args.dst = &dst_pfn;
+   args.cpages = 1;
+   args.npages = 1;
+   args.vma = NULL;
+   migrate_vma_setup(&args);
+   if (!(src_pfn & MIGRATE_PFN_MIGRATE))
+   return NULL;
+
+   dpage = alloc_pages(GFP_USER | __GFP_NOWARN, 0);
+
+   /*
+* 

[PATCH v7 10/14] lib: add support for device coherent type in test_hmm

2022-06-28 Thread Alex Sierra
Device Coherent type uses device memory that is coherently accessible by
the CPU. This could be shown as an SP (special purpose) memory range
in the BIOS-e820 memory enumeration. If no SP memory is supported in the
system, it can be faked by setting CONFIG_EFI_FAKE_MEMMAP.

Currently, test_hmm only supports two different SP ranges of at least
256MB size. These can be specified in the kernel parameter variable
efi_fake_mem. Ex. Two SP ranges of 1GB starting at 0x100000000 &
0x140000000 physical address. Ex.
efi_fake_mem=1G@0x100000000:0x40000,1G@0x140000000:0x40000

Private and coherent device mirror instances can be created in the same
probe. This is done by passing the module parameters spm_addr_dev0 &
spm_addr_dev1. In this case, it will create four instances of
device_mirror. The first two correspond to the private device type, the
last two to the coherent type. Then, they can be easily accessed from
user space through /dev/hmm_dmirror<num_device>. Usually num_device 0
and 1 are for private, and 2 and 3 for coherent types. If no module
parameters are passed, only two instances of the private type
device_mirror will be created.
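
Per the mknod calls in the test script patch of this series, the
resulting device nodes would be:

	/dev/hmm_dmirror0	private device 0
	/dev/hmm_dmirror1	private device 1
	/dev/hmm_dmirror2	coherent device 0
	/dev/hmm_dmirror3	coherent device 1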

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
---
 lib/test_hmm.c  | 253 +---
 lib/test_hmm_uapi.h |   4 +
 2 files changed, 196 insertions(+), 61 deletions(-)

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index afb30af9f3ff..7930853e7fc5 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -32,11 +32,22 @@
 
 #include "test_hmm_uapi.h"
 
-#define DMIRROR_NDEVICES   2
+#define DMIRROR_NDEVICES   4
 #define DMIRROR_RANGE_FAULT_TIMEOUT1000
 #define DEVMEM_CHUNK_SIZE  (256 * 1024 * 1024U)
 #define DEVMEM_CHUNKS_RESERVE  16
 
+/*
+ * For device_private pages, dpage is just a dummy struct page
+ * representing a piece of device memory. dmirror_devmem_alloc_page
+ * allocates a real system memory page as backing storage to fake a
+ * real device. zone_device_data points to that backing page. But
+ * for device_coherent memory, the struct page represents real
+ * physical CPU-accessible memory that we can use directly.
+ */
+#define BACKING_PAGE(page) (is_device_private_page((page)) ? \
+  (page)->zone_device_data : (page))
+
 static unsigned long spm_addr_dev0;
 module_param(spm_addr_dev0, long, 0644);
 MODULE_PARM_DESC(spm_addr_dev0,
@@ -125,6 +136,21 @@ static int dmirror_bounce_init(struct dmirror_bounce 
*bounce,
return 0;
 }
 
+static bool dmirror_is_private_zone(struct dmirror_device *mdevice)
+{
+   return (mdevice->zone_device_type ==
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? true : false;
+}
+
+static enum migrate_vma_direction
+dmirror_select_device(struct dmirror *dmirror)
+{
+   return (dmirror->mdevice->zone_device_type ==
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ?
+   MIGRATE_VMA_SELECT_DEVICE_PRIVATE :
+   MIGRATE_VMA_SELECT_DEVICE_COHERENT;
+}
+
 static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
 {
vfree(bounce->ptr);
@@ -575,16 +601,19 @@ static int dmirror_allocate_chunk(struct dmirror_device 
*mdevice,
 static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
 {
struct page *dpage = NULL;
-   struct page *rpage;
+   struct page *rpage = NULL;
 
/*
-* This is a fake device so we alloc real system memory to store
-* our device memory.
+* For ZONE_DEVICE private type, this is a fake device so we allocate
+* real system memory to store our device memory.
+* For ZONE_DEVICE coherent type we use the actual dpage to store the
+* data and ignore rpage.
 */
-   rpage = alloc_page(GFP_HIGHUSER);
-   if (!rpage)
-   return NULL;
-
+   if (dmirror_is_private_zone(mdevice)) {
+   rpage = alloc_page(GFP_HIGHUSER);
+   if (!rpage)
+   return NULL;
+   }
	spin_lock(&mdevice->lock);
 
if (mdevice->free_pages) {
@@ -603,7 +632,8 @@ static struct page *dmirror_devmem_alloc_page(struct 
dmirror_device *mdevice)
return dpage;
 
 error:
-   __free_page(rpage);
+   if (rpage)
+   __free_page(rpage);
return NULL;
 }
 
@@ -629,12 +659,16 @@ static void dmirror_migrate_alloc_and_copy(struct 
migrate_vma *args,
 * unallocated pte_none() or read-only zero page.
 */
spage = migrate_pfn_to_page(*src);
+   if (WARN(spage && is_zone_device_page(spage),
+"page already in device spage pfn: 0x%lx\n",
+page_to_pfn(spage)))
+   continue;
 
dpage = dmirror_devmem_alloc_page(mdevice);
if (!dpage)
continue;
 
-   rpage = dpage->

[PATCH v7 03/14] mm: handling Non-LRU pages returned by vm_normal_pages

2022-06-28 Thread Alex Sierra
With DEVICE_COHERENT, we'll soon have vm_normal_pages() return
device-managed anonymous pages that are not LRU pages. Although they
behave like normal pages for purposes of mapping in CPU page tables and
for COW, they do not support LRU lists, NUMA migration or THP.

Callers of follow_page() that expect LRU pages are also checked for
device zone pages due to the DEVICE_COHERENT type.

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling  (v2)
Reviewed-by: Alistair Popple  (v6)
---
 fs/proc/task_mmu.c | 2 +-
 mm/huge_memory.c   | 2 +-
 mm/khugepaged.c| 9 ++---
 mm/ksm.c   | 6 +++---
 mm/madvise.c   | 4 ++--
 mm/memory.c| 9 -
 mm/mempolicy.c | 2 +-
 mm/migrate.c   | 4 ++--
 mm/mlock.c | 2 +-
 mm/mprotect.c  | 2 +-
 10 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 2d04e3470d4c..2dd8c8a66924 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1792,7 +1792,7 @@ static struct page *can_gather_numa_stats(pte_t pte, 
struct vm_area_struct *vma,
return NULL;
 
page = vm_normal_page(vma, addr, pte);
-   if (!page)
+   if (!page || is_zone_device_page(page))
return NULL;
 
if (PageReserved(page))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 834f288b3769..c47e95b02244 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2910,7 +2910,7 @@ static int split_huge_pages_pid(int pid, unsigned long 
vaddr_start,
 
if (IS_ERR(page))
continue;
-   if (!page)
+   if (!page || is_zone_device_page(page))
continue;
 
if (!is_transparent_hugepage(page))
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 16be62d493cd..671ac7800e53 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -618,7 +618,7 @@ static int __collapse_huge_page_isolate(struct 
vm_area_struct *vma,
goto out;
}
page = vm_normal_page(vma, address, pteval);
-   if (unlikely(!page)) {
+   if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out;
}
@@ -1267,7 +1267,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
writable = true;
 
page = vm_normal_page(vma, _address, pteval);
-   if (unlikely(!page)) {
+   if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out_unmap;
}
@@ -1479,7 +1479,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, 
unsigned long addr)
goto abort;
 
page = vm_normal_page(vma, addr, *pte);
-
+   if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+   page = NULL;
/*
 * Note that uprobe, debugger, or MAP_PRIVATE may change the
 * page table, but the new page will not be a subpage of hpage.
@@ -1497,6 +1498,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, 
unsigned long addr)
if (pte_none(*pte))
continue;
page = vm_normal_page(vma, addr, *pte);
+   if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+   goto abort;
page_remove_rmap(page, vma, false);
}
 
diff --git a/mm/ksm.c b/mm/ksm.c
index 54f78c9eecae..831b18a7a50b 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -475,7 +475,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned 
long addr)
cond_resched();
page = follow_page(vma, addr,
FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
-   if (IS_ERR_OR_NULL(page))
+   if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
break;
if (PageKsm(page))
ret = handle_mm_fault(vma, addr,
@@ -560,7 +560,7 @@ static struct page *get_mergeable_page(struct rmap_item 
*rmap_item)
goto out;
 
page = follow_page(vma, addr, FOLL_GET);
-   if (IS_ERR_OR_NULL(page))
+   if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
goto out;
if (PageAnon(page)) {
flush_anon_page(vma, page, addr);
@@ -2308,7 +2308,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct 
page **page)
if (ksm_test_exit(mm))
break;
*page = follow_page(vma, ksm_scan.address, FOLL_GET);
-   if (IS_ERR_OR_NULL(*page)) {
+   if (IS_ERR_OR_NULL(*page) || 
is_zone_device_page(*page)) {
ksm_scan.addr

[PATCH v7 04/14] mm: add device coherent vma selection for memory migration

2022-06-28 Thread Alex Sierra
This case is used to migrate pages from device memory back to system
memory. Device coherent type memory is cache coherent from both the
device and CPU points of view.
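
As a minimal sketch of how a driver selects the new flag when migrating
its pages back to system memory (mirroring the amdkfd hunk elsewhere in
this series; all names come from that driver):

	migrate.start = start;
	migrate.end = end;
	migrate.pgmap_owner = SVM_ADEV_PGMAP_OWNER(adev);
	if (adev->gmc.xgmi.connected_to_cpu)	/* CPU-coherent VRAM */
		migrate.flags = MIGRATE_VMA_SELECT_DEVICE_COHERENT;
	else
		migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;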

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
---
 include/linux/migrate.h |  1 +
 mm/migrate_device.c | 12 +---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 069a89e847f3..b84908debe5c 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -148,6 +148,7 @@ static inline unsigned long migrate_pfn(unsigned long pfn)
 enum migrate_vma_direction {
MIGRATE_VMA_SELECT_SYSTEM = 1 << 0,
MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1,
+   MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2,
 };
 
 struct migrate_vma {
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index a4847ad65da3..18bc6483f63a 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -148,15 +148,21 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
if (is_writable_device_private_entry(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {
-   if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
-   goto next;
pfn = pte_pfn(pte);
-   if (is_zero_pfn(pfn)) {
+   if (is_zero_pfn(pfn) &&
+   (migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
mpfn = MIGRATE_PFN_MIGRATE;
migrate->cpages++;
goto next;
}
page = vm_normal_page(migrate->vma, addr, pte);
+   if (page && !is_zone_device_page(page) &&
+   !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
+   goto next;
+   else if (page && is_device_coherent_page(page) &&
+   (!(migrate->flags & 
MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
+page->pgmap->owner != migrate->pgmap_owner))
+   goto next;
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
-- 
2.32.0



[PATCH v7 01/14] mm: rename is_pinnable_pages to is_pinnable_longterm_pages

2022-06-28 Thread Alex Sierra
is_pinnable_page() and folio_is_pinnable() were renamed to
is_longterm_pinnable_page() and folio_is_longterm_pinnable()
respectively. These functions are used in the FOLL_LONGTERM flag
context.

Signed-off-by: Alex Sierra 
---
 include/linux/memremap.h | 24 
 include/linux/mm.h   | 24 
 mm/gup.c |  4 ++--
 mm/gup_test.c|  4 ++--
 mm/hugetlb.c |  2 +-
 5 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 8af304f6b504..c272bd0af3c1 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -150,6 +150,30 @@ static inline bool is_pci_p2pdma_page(const struct page 
*page)
page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
 }
 
+/* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */
+#ifdef CONFIG_MIGRATION
+static inline bool is_longterm_pinnable_page(struct page *page)
+{
+#ifdef CONFIG_CMA
+   int mt = get_pageblock_migratetype(page);
+
+   if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE)
+   return false;
+#endif
+   return !(is_zone_movable_page(page) ||
+is_zero_pfn(page_to_pfn(page)));
+}
+#else
+static inline bool is_longterm_pinnable_page(struct page *page)
+{
+   return true;
+}
+#endif
+static inline bool folio_is_longterm_pinnable(struct folio *folio)
+{
+   return is_longterm_pinnable_page(&folio->page);
+}
+
 #ifdef CONFIG_ZONE_DEVICE
 void *memremap_pages(struct dev_pagemap *pgmap, int nid);
 void memunmap_pages(struct dev_pagemap *pgmap);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index cf3d0d673f6b..bc0f201a4cff 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1590,30 +1590,6 @@ static inline bool page_needs_cow_for_dma(struct 
vm_area_struct *vma,
return page_maybe_dma_pinned(page);
 }
 
-/* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */
-#ifdef CONFIG_MIGRATION
-static inline bool is_pinnable_page(struct page *page)
-{
-#ifdef CONFIG_CMA
-   int mt = get_pageblock_migratetype(page);
-
-   if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE)
-   return false;
-#endif
-   return !is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page));
-}
-#else
-static inline bool is_pinnable_page(struct page *page)
-{
-   return true;
-}
-#endif
-
-static inline bool folio_is_pinnable(struct folio *folio)
-{
-   return is_pinnable_page(&folio->page);
-}
-
 static inline void set_page_zone(struct page *page, enum zone_type zone)
 {
page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
diff --git a/mm/gup.c b/mm/gup.c
index 551264407624..b65fe8bf5af4 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -133,7 +133,7 @@ struct folio *try_grab_folio(struct page *page, int refs, 
unsigned int flags)
 * path.
 */
if (unlikely((flags & FOLL_LONGTERM) &&
-!is_pinnable_page(page)))
+!is_longterm_pinnable_page(page)))
return NULL;
 
/*
@@ -1891,7 +1891,7 @@ static long check_and_migrate_movable_pages(unsigned long 
nr_pages,
continue;
prev_folio = folio;
 
-   if (folio_is_pinnable(folio))
+   if (folio_is_longterm_pinnable(folio))
continue;
 
/*
diff --git a/mm/gup_test.c b/mm/gup_test.c
index d974dec19e1c..9d705ba6737e 100644
--- a/mm/gup_test.c
+++ b/mm/gup_test.c
@@ -1,5 +1,5 @@
 #include 
-#include 
+#include 
 #include 
 #include 
 #include 
@@ -53,7 +53,7 @@ static void verify_dma_pinned(unsigned int cmd, struct page 
**pages,
dump_page(page, "gup_test failure");
break;
} else if (cmd == PIN_LONGTERM_BENCHMARK &&
-   WARN(!is_pinnable_page(page),
+   WARN(!is_longterm_pinnable_page(page),
 "pages[%lu] is NOT pinnable but pinned\n",
 i)) {
dump_page(page, "gup_test failure");
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a57e1be41401..368fd33787b0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1135,7 +1135,7 @@ static struct page *dequeue_huge_page_node_exact(struct 
hstate *h, int nid)
 
	lockdep_assert_held(&hugetlb_lock);
	list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
-   if (pin && !is_pinnable_page(page))
+   if (pin && !is_longterm_pinnable_page(page))
continue;
 
if (PageHWPoison(page))
-- 
2.32.0



[PATCH v7 02/14] mm: add zone device coherent type memory support

2022-06-28 Thread Alex Sierra
Device memory that is cache coherent from device and CPU point of view.
This is used on platforms that have an advanced system bus (like CAPI
or CXL). Any page of a process can be migrated to such memory. However,
no one should be allowed to pin such memory so that it can always be
evicted.

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
[hch: rebased ontop of the refcount changes,
  removed is_dev_private_or_coherent_page]
Signed-off-by: Christoph Hellwig 
---
 include/linux/memremap.h | 22 +-
 mm/memcontrol.c  |  7 ---
 mm/memory-failure.c  |  8 ++--
 mm/memremap.c| 10 ++
 mm/migrate_device.c  | 16 +++-
 mm/rmap.c|  5 +++--
 6 files changed, 51 insertions(+), 17 deletions(-)

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index c272bd0af3c1..6fc0ced64b2d 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -41,6 +41,13 @@ struct vmem_altmap {
  * A more complete discussion of unaddressable memory may be found in
  * include/linux/hmm.h and Documentation/vm/hmm.rst.
  *
+ * MEMORY_DEVICE_COHERENT:
+ * Device memory that is cache coherent from device and CPU point of view. This
+ * is used on platforms that have an advanced system bus (like CAPI or CXL). A
+ * driver can hotplug the device memory using ZONE_DEVICE and with that memory
+ * type. Any page of a process can be migrated to such memory. However no one
+ * should be allowed to pin such memory so that it can always be evicted.
+ *
  * MEMORY_DEVICE_FS_DAX:
  * Host memory that has similar access semantics as System RAM i.e. DMA
  * coherent and supports page pinning. In support of coordinating page
@@ -61,6 +68,7 @@ struct vmem_altmap {
 enum memory_type {
/* 0 is reserved to catch uninitialized type fields */
MEMORY_DEVICE_PRIVATE = 1,
+   MEMORY_DEVICE_COHERENT,
MEMORY_DEVICE_FS_DAX,
MEMORY_DEVICE_GENERIC,
MEMORY_DEVICE_PCI_P2PDMA,
@@ -143,6 +151,17 @@ static inline bool folio_is_device_private(const struct 
folio *folio)
	return is_device_private_page(&folio->page);
 }
 
+static inline bool is_device_coherent_page(const struct page *page)
+{
+   return is_zone_device_page(page) &&
+   page->pgmap->type == MEMORY_DEVICE_COHERENT;
+}
+
+static inline bool folio_is_device_coherent(const struct folio *folio)
+{
+   return is_device_coherent_page(&folio->page);
+}
+
 static inline bool is_pci_p2pdma_page(const struct page *page)
 {
return IS_ENABLED(CONFIG_PCI_P2PDMA) &&
@@ -160,7 +179,8 @@ static inline bool is_longterm_pinnable_page(struct page 
*page)
if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE)
return false;
 #endif
-   return !(is_zone_movable_page(page) ||
+   return !(is_device_coherent_page(page) ||
+is_zone_movable_page(page) ||
 is_zero_pfn(page_to_pfn(page)));
 }
 #else
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 618c366a2f07..5d37a85c67da 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5665,8 +5665,8 @@ static int mem_cgroup_move_account(struct page *page,
  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
  * target for charge migration. if @target is not NULL, the entry is stored
  * in target->ent.
- *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is 
MEMORY_DEVICE_PRIVATE
- * (so ZONE_DEVICE page and thus not on the lru).
+ *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is device memory and
+ *   thus not on the lru.
  * For now we such page is charge like a regular page would be as for all
  * intent and purposes it is just special memory taking the place of a
  * regular page.
@@ -5704,7 +5704,8 @@ static enum mc_target_type get_mctgt_type(struct 
vm_area_struct *vma,
 */
if (page_memcg(page) == mc.from) {
ret = MC_TARGET_PAGE;
-   if (is_device_private_page(page))
+   if (is_device_private_page(page) ||
+   is_device_coherent_page(page))
ret = MC_TARGET_DEVICE;
if (target)
target->page = page;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index da39ec8afca8..79f175eeb190 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1685,12 +1685,16 @@ static int memory_failure_dev_pagemap(unsigned long 
pfn, int flags,
goto unlock;
}
 
-   if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+   switch (pgmap->type) {
+   case MEMORY_DEVICE_PRIVATE:
+   case MEMORY_DEVICE_COHERENT:
/*
-* TODO: Handle HMM pages which may need coordination
+* TODO: Handle device pages which may need coordination
  

[PATCH v7 00/14] Add MEMORY_DEVICE_COHERENT for coherent device memory mapping

2022-06-28 Thread Alex Sierra
This is our MEMORY_DEVICE_COHERENT patch series rebased and updated
for current 5.19.0-rc4

Changes since the last version:
- Fixed problems with migration during long-term pinning in
get_user_pages
- Open coded vm_normal_lru_pages as suggested in previous code review
- Update hmm_gup_test with more get_user_pages calls, include
hmm_cow_in_device in hmm-test.

This patch series introduces MEMORY_DEVICE_COHERENT, a type of memory
owned by a device that can be mapped into CPU page tables like
MEMORY_DEVICE_GENERIC and can also be migrated like
MEMORY_DEVICE_PRIVATE.

This patch series is mostly self-contained except for a few places where
it needs to update other subsystems to handle the new memory type.

System stability and performance are not affected according to our
ongoing testing, including xfstests.

How it works: The system BIOS advertises the GPU device memory
(aka VRAM) as SPM (special purpose memory) in the UEFI system address
map.

The amdgpu driver registers the memory with devmap as
MEMORY_DEVICE_COHERENT using devm_memremap_pages. The initial user for
this hardware page migration capability is the Frontier supercomputer
project. This functionality is not AMD-specific. We expect other GPU
vendors to find this functionality useful, and possibly other hardware
types in the future.

Our test nodes in the lab are similar to the Frontier configuration,
with 0.5 TB of system memory plus 256 GB of device memory split across
4 GPUs, all in a single coherent address space. Page migration is
expected to improve application efficiency significantly. We will
report empirical results as they become available.

Coherent device type pages at gup are now migrated back to system
memory if they are being pinned long-term (FOLL_LONGTERM). The reason
is that long-term pinning would interfere with the device memory
manager owning the device-coherent pages (e.g. evictions in TTM).
This series incorporates Alistair Popple's patches to do this
migration from pin_user_pages() calls. hmm_gup_test has been added to
hmm-test to exercise the different get_user_pages calls.

This series includes handling of device-managed anonymous pages
returned by vm_normal_pages. Although they behave like normal pages
for purposes of mapping in CPU page tables and for COW, they do not
support LRU lists, NUMA migration or THP.

We also introduced a FOLL_LRU flag that adds the same behaviour to
follow_page and related APIs, to allow callers to specify that they
expect to put pages on an LRU list.

v2:
- Rebase to latest 5.18-rc7.
- Drop patch "mm: add device coherent checker to remove migration pte"
and modify try_to_migrate_one, to let DEVICE_COHERENT pages fall
through to normal page path. Based on Alistair Popple's comment.
- Fix comment formatting.
- Reword comment in vm_normal_page about pte_devmap().
- Merge "drm/amdkfd: coherent type as sys mem on migration to ram" to
"drm/amdkfd: add SPM support for SVM".

v3:
- Rebase to latest 5.18.0.
- Patch "mm: handling Non-LRU pages returned by vm_normal_pages"
reordered.
- Add WARN_ON_ONCE for thp device coherent case.

v4:
- Rebase to latest 5.18.0
- Fix consistency between pages with the FOLL_LRU flag set and pte_devmap
at follow_page_pte.

v5:
- Remove unused zone_device_type from lib/test_hmm and
selftest/vm/hmm-test.c.

v6:
- Rebase to 5.19.0-rc4
- Rename is_pinnable_page to is_longterm_pinnable_page and add a
coherent device checker.
- Add a new gup test to hmm-test to cover fast pinnable case with
FOLL_LONGTERM.

v7:
- Reorder patch series.
- Remove FOLL_LRU and check on each caller for LRU pages handling
instead.

Alex Sierra (12):
  mm: rename is_pinnable_pages to is_pinnable_longterm_pages
  mm: add zone device coherent type memory support
  mm: handling Non-LRU pages returned by vm_normal_pages
  mm: add device coherent vma selection for memory migration
  drm/amdkfd: add SPM support for SVM
  lib: test_hmm add ioctl to get zone device type
  lib: test_hmm add module param for zone device type
  lib: add support for device coherent type in test_hmm
  tools: update hmm-test to support device coherent type
  tools: update test_hmm script to support SP config
  tools: add hmm gup tests for device coherent type
  tools: add selftests to hmm for COW in device memory

Alistair Popple (2):
  mm: remove the vma check in migrate_vma_setup()
  mm/gup: migrate device coherent pages when pinning instead of failing

 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |  34 ++-
 fs/proc/task_mmu.c   |   2 +-
 include/linux/memremap.h |  44 +++
 include/linux/migrate.h  |   1 +
 include/linux/mm.h   |  24 --
 lib/test_hmm.c   | 337 +--
 lib/test_hmm_uapi.h  |  19 +-
 mm/gup.c |  49 +++-
 mm/gup_test.c|   4 +-
 mm/huge_memory.c |   2 +-
 mm/hugetlb.c

[PATCH 3/3] drm/amdgpu: add debugfs for kfd system and ttm mem used

2022-06-27 Thread Alex Sierra
This keeps track of kfd system mem used and kfd ttm mem used.
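
For reference, once the patch is applied the counters can be read back
with (the path assumes the "kfd" debugfs root used by the registration
below):

	cat /sys/kernel/debug/kfd/mem_limit

which prints, following the seq_printf format in this patch:

	System mem used <n>M out of <m>M
	TTM mem used <n>M out of <m>M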

Signed-off-by: Alex Sierra 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  3 +++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 19 +++
 drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c  |  2 ++
 3 files changed, 24 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index e6244182a3a4..53cdf7f00b3f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -172,6 +172,9 @@ int amdgpu_queue_mask_bit_to_set_resource_bit(struct 
amdgpu_device *adev,
 struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context,
struct mm_struct *mm,
struct svm_range_bo *svm_bo);
+#if defined(CONFIG_DEBUG_FS)
+int kfd_debugfs_kfd_mem_limits(struct seq_file *m, void *data);
+#endif
 #if IS_ENABLED(CONFIG_HSA_AMD)
 bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm);
 struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 9719577ecc6d..c48557b683c6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2935,3 +2935,22 @@ bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device 
*adev, struct kgd_mem *
}
return false;
 }
+
+#if defined(CONFIG_DEBUG_FS)
+
+int kfd_debugfs_kfd_mem_limits(struct seq_file *m, void *data)
+{
+
+	spin_lock(&kfd_mem_limit.mem_limit_lock);
+	seq_printf(m, "System mem used %lldM out of %lluM\n",
+		  (kfd_mem_limit.system_mem_used >> 20),
+		  (kfd_mem_limit.max_system_mem_limit >> 20));
+	seq_printf(m, "TTM mem used %lldM out of %lluM\n",
+		  (kfd_mem_limit.ttm_mem_used >> 20),
+		  (kfd_mem_limit.max_ttm_mem_limit >> 20));
+	spin_unlock(&kfd_mem_limit.mem_limit_lock);
+
+   return 0;
+}
+
+#endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
index 581c3a30fee1..ad5a40a685ac 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
@@ -101,6 +101,8 @@ void kfd_debugfs_init(void)
kfd_debugfs_rls_by_device, &kfd_debugfs_fops);
 	debugfs_create_file("hang_hws", S_IFREG | 0200, debugfs_root,
 			    kfd_debugfs_hang_hws_read, 
&kfd_debugfs_hang_hws_fops);
+	debugfs_create_file("mem_limit", S_IFREG | 0200, debugfs_root,
+			    kfd_debugfs_kfd_mem_limits, &kfd_debugfs_fops);
 }
 
 void kfd_debugfs_fini(void)
-- 
2.32.0



[PATCH 2/3] drm/amdkfd: track unified memory reservation with xnack off

2022-06-27 Thread Alex Sierra
[WHY]
Unified memory with XNACK off should be tracked, as userptr mappings
and legacy allocations are, to avoid oversubscribing system memory
when XNACK is off.
[HOW]
Expose the functions reserve_mem_limit and unreserve_mem_limit to the
SVM API and call them on every prange creation and free.
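
As a rough sketch (not the actual kfd_svm.c hunks; field names follow
kfd_svm.c, the helper itself is illustrative), a prange creation path
with XNACK off would account its size like this. Note adev is NULL
because a prange is not tied to a single GPU, which is why the
accounting functions below gain NULL-adev handling:

  static int svm_range_reserve_mem(struct svm_range *prange)
  {
          uint64_t size = (prange->last - prange->start + 1) << PAGE_SHIFT;

          /* tracked like a userptr mapping, against system memory */
          return amdgpu_amdkfd_reserve_mem_limit(NULL, size,
                          KFD_IOC_ALLOC_MEM_FLAGS_USERPTR);
  }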

Signed-off-by: Alex Sierra 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  4 ++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 25 
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c  | 58 +--
 3 files changed, 58 insertions(+), 29 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index b25b41f50213..e6244182a3a4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -305,6 +305,10 @@ bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device 
*adev, struct kgd_mem *
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
 int amdgpu_amdkfd_criu_resume(void *p);
 bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev);
+int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
+   uint64_t size, u32 alloc_flag);
+void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
+   uint64_t size, u32 alloc_flag);
 
 #if IS_ENABLED(CONFIG_HSA_AMD)
 void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 9142f6cc3f4d..9719577ecc6d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -128,7 +128,7 @@ void amdgpu_amdkfd_reserve_system_mem(uint64_t size)
  *
  * Return: returns -ENOMEM in case of error, ZERO otherwise
  */
-static int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
+int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag)
 {
uint64_t reserved_for_pt =
@@ -168,7 +168,7 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
 kfd_mem_limit.max_system_mem_limit && !no_system_mem_limit) ||
(kfd_mem_limit.ttm_mem_used + ttm_mem_needed >
 kfd_mem_limit.max_ttm_mem_limit) ||
-   (adev->kfd.vram_used + vram_needed >
+   (adev && adev->kfd.vram_used + vram_needed >
 adev->gmc.real_vram_size -
 	     atomic64_read(&adev->vram_pin_size) -
 reserved_for_pt)) {
@@ -179,7 +179,10 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
/* Update memory accounting by decreasing available system
 * memory, TTM memory and GPU memory as computed above
 */
-   adev->kfd.vram_used += vram_needed;
+   WARN_ONCE(vram_needed && !adev,
+ "adev reference can't be null when vram is used");
+   if (adev)
+   adev->kfd.vram_used += vram_needed;
kfd_mem_limit.system_mem_used += system_mem_needed;
kfd_mem_limit.ttm_mem_used += ttm_mem_needed;
 
@@ -188,7 +191,7 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
return ret;
 }
 
-static void unreserve_mem_limit(struct amdgpu_device *adev,
+void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag)
 {
	spin_lock(&kfd_mem_limit.mem_limit_lock);
@@ -197,7 +200,10 @@ static void unreserve_mem_limit(struct amdgpu_device *adev,
kfd_mem_limit.system_mem_used -= size;
kfd_mem_limit.ttm_mem_used -= size;
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
-   adev->kfd.vram_used -= ALIGN(size, VRAM_ALLOCATION_ALIGN);
+   WARN_ONCE(!adev,
+ "adev reference can't be null when alloc mem flags 
vram is set");
+   if (adev)
+   adev->kfd.vram_used -= ALIGN(size, 
VRAM_ALLOCATION_ALIGN);
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
kfd_mem_limit.system_mem_used -= size;
} else if (!(alloc_flag &
@@ -206,11 +212,8 @@ static void unreserve_mem_limit(struct amdgpu_device *adev,
pr_err("%s: Invalid BO type %#x\n", __func__, alloc_flag);
goto release;
}
-
-   WARN_ONCE(adev->kfd.vram_used < 0,
+   WARN_ONCE(adev && adev->kfd.vram_used < 0,
  "KFD VRAM memory accounting unbalanced");
-   WARN_ONCE(kfd_mem_limit.ttm_mem_used < 0,
- "KFD TTM memory accounting unbalanced");
WARN_ONCE(kfd_mem_limit.system_mem_used < 0,
  "KFD system memory accounting unbalanced");
 
@@ -224,7 +227,7 @@ void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo)
u32 alloc_flags = bo->kfd_

[PATCH 1/3] drm/amdgpu: remove acc_size from reserve/unreserve mem

2022-06-27 Thread Alex Sierra
TTM used to track the "acc_size" of all BOs internally. We needed to
keep track of it in our memory reservation to avoid TTM running out
of memory in its own accounting. However, that "acc_size" accounting
has since been removed from TTM. Therefore we don't really need to
track it any more.

Signed-off-by: Alex Sierra 
Reviewed-by: Philip Yang 
Reviewed-by: Felix Kuehling 
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 60 ++-
 1 file changed, 17 insertions(+), 43 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 5ba9070d8722..9142f6cc3f4d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -114,21 +114,12 @@ void amdgpu_amdkfd_reserve_system_mem(uint64_t size)
  * compromise that should work in most cases without reserving too
  * much memory for page tables unnecessarily (factor 16K, >> 14).
  */
-#define ESTIMATE_PT_SIZE(mem_size) max(((mem_size) >> 14), 
AMDGPU_VM_RESERVED_VRAM)
-
-static size_t amdgpu_amdkfd_acc_size(uint64_t size)
-{
-   size >>= PAGE_SHIFT;
-   size *= sizeof(dma_addr_t) + sizeof(void *);
 
-   return __roundup_pow_of_two(sizeof(struct amdgpu_bo)) +
-   __roundup_pow_of_two(sizeof(struct ttm_tt)) +
-   PAGE_ALIGN(size);
-}
+#define ESTIMATE_PT_SIZE(mem_size) max(((mem_size) >> 14), 
AMDGPU_VM_RESERVED_VRAM)
 
 /**
  * amdgpu_amdkfd_reserve_mem_limit() - Decrease available memory by size
- * of buffer including any reserved for control structures
+ * of buffer.
  *
  * @adev: Device to which allocated BO belongs to
  * @size: Size of buffer, in bytes, encapsulated by B0. This should be
@@ -142,19 +133,16 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
 {
uint64_t reserved_for_pt =
ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
-   size_t acc_size, system_mem_needed, ttm_mem_needed, vram_needed;
+   size_t system_mem_needed, ttm_mem_needed, vram_needed;
int ret = 0;
 
-   acc_size = amdgpu_amdkfd_acc_size(size);
-
+   system_mem_needed = 0;
+   ttm_mem_needed = 0;
vram_needed = 0;
if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
-   system_mem_needed = acc_size + size;
-   ttm_mem_needed = acc_size + size;
+   system_mem_needed = size;
+   ttm_mem_needed = size;
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
-   system_mem_needed = acc_size;
-   ttm_mem_needed = acc_size;
-
/*
 * Conservatively round up the allocation requirement to 2 MB
 * to avoid fragmentation caused by 4K allocations in the tail
@@ -162,14 +150,10 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
 */
vram_needed = ALIGN(size, VRAM_ALLOCATION_ALIGN);
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
-   system_mem_needed = acc_size + size;
-   ttm_mem_needed = acc_size;
-   } else if (alloc_flag &
-  (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
-   KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) {
-   system_mem_needed = acc_size;
-   ttm_mem_needed = acc_size;
-   } else {
+   system_mem_needed = size;
+   } else if (!(alloc_flag &
+   (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
+KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))) {
pr_err("%s: Invalid BO type %#x\n", __func__, alloc_flag);
return -ENOMEM;
}
@@ -207,28 +191,18 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
 static void unreserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag)
 {
-   size_t acc_size;
-
-   acc_size = amdgpu_amdkfd_acc_size(size);
-
	spin_lock(&kfd_mem_limit.mem_limit_lock);
 
if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
-   kfd_mem_limit.system_mem_used -= (acc_size + size);
-   kfd_mem_limit.ttm_mem_used -= (acc_size + size);
+   kfd_mem_limit.system_mem_used -= size;
+   kfd_mem_limit.ttm_mem_used -= size;
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
-   kfd_mem_limit.system_mem_used -= acc_size;
-   kfd_mem_limit.ttm_mem_used -= acc_size;
adev->kfd.vram_used -= ALIGN(size, VRAM_ALLOCATION_ALIGN);
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
-   kfd_mem_limit.system_mem_used -= (acc_size + size);
-   kfd_mem_limit.ttm_mem_used -= acc_size;
-   } else if (alloc_flag &
-  (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
-  

[PATCH v6 12/14] tools: update test_hmm script to support SP config

2022-06-27 Thread Alex Sierra
Add two more parameters to set the spm_addr_dev0 & spm_addr_dev1
addresses. These two parameters configure the start SP
addresses for each device in the test_hmm driver.
Consequently, this configures the zone device type as coherent.

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
---
 tools/testing/selftests/vm/test_hmm.sh | 24 +---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/vm/test_hmm.sh 
b/tools/testing/selftests/vm/test_hmm.sh
index 0647b525a625..539c9371e592 100755
--- a/tools/testing/selftests/vm/test_hmm.sh
+++ b/tools/testing/selftests/vm/test_hmm.sh
@@ -40,11 +40,26 @@ check_test_requirements()
 
 load_driver()
 {
-   modprobe $DRIVER > /dev/null 2>&1
+   if [ $# -eq 0 ]; then
+   modprobe $DRIVER > /dev/null 2>&1
+   else
+   if [ $# -eq 2 ]; then
+   modprobe $DRIVER spm_addr_dev0=$1 spm_addr_dev1=$2
+   > /dev/null 2>&1
+   else
+   echo "Missing module parameters. Make sure pass"\
+   "spm_addr_dev0 and spm_addr_dev1"
+   usage
+   fi
+   fi
if [ $? == 0 ]; then
major=$(awk "\$2==\"HMM_DMIRROR\" {print \$1}" /proc/devices)
mknod /dev/hmm_dmirror0 c $major 0
mknod /dev/hmm_dmirror1 c $major 1
+   if [ $# -eq 2 ]; then
+   mknod /dev/hmm_dmirror2 c $major 2
+   mknod /dev/hmm_dmirror3 c $major 3
+   fi
fi
 }
 
@@ -58,7 +73,7 @@ run_smoke()
 {
echo "Running smoke test. Note, this test provides basic coverage."
 
-   load_driver
+   load_driver $1 $2
$(dirname "${BASH_SOURCE[0]}")/hmm-tests
unload_driver
 }
@@ -75,6 +90,9 @@ usage()
echo "# Smoke testing"
echo "./${TEST_NAME}.sh smoke"
echo
+   echo "# Smoke testing with SPM enabled"
+   echo "./${TEST_NAME}.sh smoke  "
+   echo
exit 0
 }
 
@@ -84,7 +102,7 @@ function run_test()
usage
else
if [ "$1" = "smoke" ]; then
-   run_smoke
+   run_smoke $2 $3
else
usage
fi
-- 
2.32.0



[PATCH v6 10/14] lib: add support for device coherent type in test_hmm

2022-06-27 Thread Alex Sierra
Device Coherent type uses device memory that is coherently accessible
by the CPU. This could be shown as an SP (special purpose) memory range
in the BIOS-e820 memory enumeration. If no SP memory is supported in
the system, it can be faked by setting CONFIG_EFI_FAKE_MEMMAP.

Currently, test_hmm only supports two different SP ranges of at least
256MB size. This could be specified in the kernel parameter variable
efi_fake_mem. Ex. Two SP ranges of 1GB starting at 0x100000000 &
0x140000000 physical address. Ex.
efi_fake_mem=1G@0x100000000:0x40000,1G@0x140000000:0x40000

Private and coherent device mirror instances can be created in the same
probe. This is done by passing the module parameters spm_addr_dev0 &
spm_addr_dev1. In this case, it will create four instances of
device_mirror. The first two correspond to private device type, the
last two to coherent type. Then, they can be easily accessed from user
space through /dev/hmm_dmirror<num>. Usually num_device 0 and 1
are for private, and 2 and 3 for coherent types. If no module
parameters are passed, only two instances of private type device_mirror
will be created.
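
For example, a test that wants the coherent instances could open the
extra device nodes directly (a minimal sketch; minor numbers per the
description above):

  #include <fcntl.h>
  #include <stdio.h>

  /* Minors 0/1 are private, 2/3 are coherent when the SPM params are set. */
  static int open_coherent_dmirror(void)
  {
          int fd = open("/dev/hmm_dmirror2", O_RDWR, 0);

          if (fd < 0)
                  perror("coherent dmirror instance not available");
          return fd;
  }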

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
---
 lib/test_hmm.c  | 253 +---
 lib/test_hmm_uapi.h |   4 +
 2 files changed, 196 insertions(+), 61 deletions(-)

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index afb30af9f3ff..7930853e7fc5 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -32,11 +32,22 @@
 
 #include "test_hmm_uapi.h"
 
-#define DMIRROR_NDEVICES   2
+#define DMIRROR_NDEVICES   4
 #define DMIRROR_RANGE_FAULT_TIMEOUT1000
 #define DEVMEM_CHUNK_SIZE  (256 * 1024 * 1024U)
 #define DEVMEM_CHUNKS_RESERVE  16
 
+/*
+ * For device_private pages, dpage is just a dummy struct page
+ * representing a piece of device memory. dmirror_devmem_alloc_page
+ * allocates a real system memory page as backing storage to fake a
+ * real device. zone_device_data points to that backing page. But
+ * for device_coherent memory, the struct page represents real
+ * physical CPU-accessible memory that we can use directly.
+ */
+#define BACKING_PAGE(page) (is_device_private_page((page)) ? \
+  (page)->zone_device_data : (page))
+
 static unsigned long spm_addr_dev0;
 module_param(spm_addr_dev0, long, 0644);
 MODULE_PARM_DESC(spm_addr_dev0,
@@ -125,6 +136,21 @@ static int dmirror_bounce_init(struct dmirror_bounce 
*bounce,
return 0;
 }
 
+static bool dmirror_is_private_zone(struct dmirror_device *mdevice)
+{
+   return (mdevice->zone_device_type ==
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? true : false;
+}
+
+static enum migrate_vma_direction
+dmirror_select_device(struct dmirror *dmirror)
+{
+   return (dmirror->mdevice->zone_device_type ==
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ?
+   MIGRATE_VMA_SELECT_DEVICE_PRIVATE :
+   MIGRATE_VMA_SELECT_DEVICE_COHERENT;
+}
+
 static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
 {
vfree(bounce->ptr);
@@ -575,16 +601,19 @@ static int dmirror_allocate_chunk(struct dmirror_device 
*mdevice,
 static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
 {
struct page *dpage = NULL;
-   struct page *rpage;
+   struct page *rpage = NULL;
 
/*
-* This is a fake device so we alloc real system memory to store
-* our device memory.
+* For ZONE_DEVICE private type, this is a fake device so we allocate
+* real system memory to store our device memory.
+* For ZONE_DEVICE coherent type we use the actual dpage to store the
+* data and ignore rpage.
 */
-   rpage = alloc_page(GFP_HIGHUSER);
-   if (!rpage)
-   return NULL;
-
+   if (dmirror_is_private_zone(mdevice)) {
+   rpage = alloc_page(GFP_HIGHUSER);
+   if (!rpage)
+   return NULL;
+   }
 	spin_lock(&mdevice->lock);
 
if (mdevice->free_pages) {
@@ -603,7 +632,8 @@ static struct page *dmirror_devmem_alloc_page(struct 
dmirror_device *mdevice)
return dpage;
 
 error:
-   __free_page(rpage);
+   if (rpage)
+   __free_page(rpage);
return NULL;
 }
 
@@ -629,12 +659,16 @@ static void dmirror_migrate_alloc_and_copy(struct 
migrate_vma *args,
 * unallocated pte_none() or read-only zero page.
 */
spage = migrate_pfn_to_page(*src);
+   if (WARN(spage && is_zone_device_page(spage),
+"page already in device spage pfn: 0x%lx\n",
+page_to_pfn(spage)))
+   continue;
 
dpage = dmirror_devmem_alloc_page(mdevice);
if (!dpage)
continue;
 
-   rpage = dpage->

[PATCH v6 13/14] tools: add hmm gup tests for device coherent type

2022-06-27 Thread Alex Sierra
The intention is to test hmm device coherent type under different get
user pages paths. Also, test gup with the FOLL_LONGTERM flag set on
device coherent pages. These pages should get migrated back to system
memory.

Signed-off-by: Alex Sierra 
Reviewed-by: Alistair Popple 
---
 tools/testing/selftests/vm/hmm-tests.c | 110 +
 1 file changed, 110 insertions(+)

diff --git a/tools/testing/selftests/vm/hmm-tests.c 
b/tools/testing/selftests/vm/hmm-tests.c
index 4b547188ec40..bb38b9777610 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -36,6 +36,7 @@
  * in the usual include/uapi/... directory.
  */
 #include "../../../../lib/test_hmm_uapi.h"
+#include "../../../../mm/gup_test.h"
 
 struct hmm_buffer {
void*ptr;
@@ -59,6 +60,9 @@ enum {
 #define NTIMES 10
 
 #define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1)))
+/* Just the flags we need, copied from mm.h: */
+#define FOLL_WRITE 0x01 /* check pte is writable */
+#define FOLL_LONGTERM  0x10000 /* mapping lifetime is indefinite */
 
 FIXTURE(hmm)
 {
@@ -1764,4 +1768,110 @@ TEST_F(hmm, exclusive_cow)
hmm_buffer_free(buffer);
 }
 
+static int gup_test_exec(int gup_fd, unsigned long addr, int cmd,
+int npages, int size, int flags)
+{
+   struct gup_test gup = {
+   .nr_pages_per_call  = npages,
+   .addr   = addr,
+   .gup_flags  = FOLL_WRITE | flags,
+   .size   = size,
+   };
+
+	if (ioctl(gup_fd, cmd, &gup)) {
+   perror("ioctl on error\n");
+   return errno;
+   }
+
+   return 0;
+}
+
+/*
+ * Test get user device pages through gup_test. Setting PIN_LONGTERM flag.
+ * This should trigger a migration back to system memory for both, private
+ * and coherent type pages.
+ * This test makes use of gup_test module. Make sure GUP_TEST_CONFIG is added
+ * to your configuration before you run it.
+ */
+TEST_F(hmm, hmm_gup_test)
+{
+   struct hmm_buffer *buffer;
+   int gup_fd;
+   unsigned long npages;
+   unsigned long size;
+   unsigned long i;
+   int *ptr;
+   int ret;
+   unsigned char *m;
+
+   gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
+   if (gup_fd == -1)
+   SKIP(return, "Skipping test, could not find gup_test driver");
+
+   npages = 4;
+   size = npages << self->page_shift;
+
+   buffer = malloc(sizeof(*buffer));
+   ASSERT_NE(buffer, NULL);
+
+   buffer->fd = -1;
+   buffer->size = size;
+   buffer->mirror = malloc(size);
+   ASSERT_NE(buffer->mirror, NULL);
+
+   buffer->ptr = mmap(NULL, size,
+  PROT_READ | PROT_WRITE,
+  MAP_PRIVATE | MAP_ANONYMOUS,
+  buffer->fd, 0);
+   ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+   /* Initialize buffer in system memory. */
+   for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+   ptr[i] = i;
+
+   /* Migrate memory to device. */
+   ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+   ASSERT_EQ(ret, 0);
+   ASSERT_EQ(buffer->cpages, npages);
+   /* Check what the device read. */
+   for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+   ASSERT_EQ(ptr[i], i);
+
+   ASSERT_EQ(gup_test_exec(gup_fd,
+   (unsigned long)buffer->ptr,
+   GUP_BASIC_TEST, 1, self->page_size, 0), 0);
+   ASSERT_EQ(gup_test_exec(gup_fd,
+   (unsigned long)buffer->ptr + 1 * 
self->page_size,
+   GUP_FAST_BENCHMARK, 1, self->page_size, 0), 0);
+   ASSERT_EQ(gup_test_exec(gup_fd,
+   (unsigned long)buffer->ptr + 2 * 
self->page_size,
+   PIN_FAST_BENCHMARK, 1, self->page_size, 
FOLL_LONGTERM), 0);
+   ASSERT_EQ(gup_test_exec(gup_fd,
+   (unsigned long)buffer->ptr + 3 * 
self->page_size,
+   PIN_LONGTERM_BENCHMARK, 1, self->page_size, 0), 
0);
+
+   /* Take snapshot to CPU pagetables */
+   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+   ASSERT_EQ(ret, 0);
+   ASSERT_EQ(buffer->cpages, npages);
+   m = buffer->mirror;
+   if (hmm_is_coherent_type(variant->device_number)) {
+   ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | 
HMM_DMIRROR_PROT_WRITE, m[0]);
+   ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | 
HMM_DMIRROR_PROT_WRITE, m[1]);
+   } else {
+   ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[0]);
+   ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[1]);
+ 

[PATCH v6 11/14] tools: update hmm-test to support device coherent type

2022-06-27 Thread Alex Sierra
Test cases such as migrate_fault and migrate_multiple were modified to
explicitly migrate from device to system memory without the need for
page faults, when using the device coherent type.

The snapshot test case was updated to read the memory device type first
and, based on that, check the proper returned results. A
migrate_ping_pong test case was added to test explicit migration from
device to system memory for both private and coherent zone types.

Helpers to migrate from device to system memory and vice versa
were also added.
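
A minimal sketch of one ping-pong round using the new helpers (shown
in the diff below); buffer setup and assertions as in the existing
tests:

  static int ping_pong_once(int fd, struct hmm_buffer *buffer,
                            unsigned long npages)
  {
          int ret;

          /* Explicitly migrate to device memory... */
          ret = hmm_migrate_sys_to_dev(fd, buffer, npages);
          if (ret)
                  return ret;
          /* ...and explicitly back to system memory, no fault needed. */
          return hmm_migrate_dev_to_sys(fd, buffer, npages);
  }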

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
---
 tools/testing/selftests/vm/hmm-tests.c | 121 -
 1 file changed, 100 insertions(+), 21 deletions(-)

diff --git a/tools/testing/selftests/vm/hmm-tests.c 
b/tools/testing/selftests/vm/hmm-tests.c
index 203323967b50..4b547188ec40 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -46,6 +46,13 @@ struct hmm_buffer {
uint64_tfaults;
 };
 
+enum {
+   HMM_PRIVATE_DEVICE_ONE,
+   HMM_PRIVATE_DEVICE_TWO,
+   HMM_COHERENCE_DEVICE_ONE,
+   HMM_COHERENCE_DEVICE_TWO,
+};
+
 #define TWOMEG (1 << 21)
 #define HMM_BUFFER_SIZE (1024 << 12)
 #define HMM_PATH_MAX64
@@ -60,6 +67,21 @@ FIXTURE(hmm)
unsigned intpage_shift;
 };
 
+FIXTURE_VARIANT(hmm)
+{
+   int device_number;
+};
+
+FIXTURE_VARIANT_ADD(hmm, hmm_device_private)
+{
+   .device_number = HMM_PRIVATE_DEVICE_ONE,
+};
+
+FIXTURE_VARIANT_ADD(hmm, hmm_device_coherent)
+{
+   .device_number = HMM_COHERENCE_DEVICE_ONE,
+};
+
 FIXTURE(hmm2)
 {
int fd0;
@@ -68,6 +90,24 @@ FIXTURE(hmm2)
unsigned intpage_shift;
 };
 
+FIXTURE_VARIANT(hmm2)
+{
+   int device_number0;
+   int device_number1;
+};
+
+FIXTURE_VARIANT_ADD(hmm2, hmm2_device_private)
+{
+   .device_number0 = HMM_PRIVATE_DEVICE_ONE,
+   .device_number1 = HMM_PRIVATE_DEVICE_TWO,
+};
+
+FIXTURE_VARIANT_ADD(hmm2, hmm2_device_coherent)
+{
+   .device_number0 = HMM_COHERENCE_DEVICE_ONE,
+   .device_number1 = HMM_COHERENCE_DEVICE_TWO,
+};
+
 static int hmm_open(int unit)
 {
char pathname[HMM_PATH_MAX];
@@ -81,12 +121,19 @@ static int hmm_open(int unit)
return fd;
 }
 
+static bool hmm_is_coherent_type(int dev_num)
+{
+   return (dev_num >= HMM_COHERENCE_DEVICE_ONE);
+}
+
 FIXTURE_SETUP(hmm)
 {
self->page_size = sysconf(_SC_PAGE_SIZE);
self->page_shift = ffs(self->page_size) - 1;
 
-   self->fd = hmm_open(0);
+   self->fd = hmm_open(variant->device_number);
+   if (self->fd < 0 && hmm_is_coherent_type(variant->device_number))
+   SKIP(exit(0), "DEVICE_COHERENT not available");
ASSERT_GE(self->fd, 0);
 }
 
@@ -95,9 +142,11 @@ FIXTURE_SETUP(hmm2)
self->page_size = sysconf(_SC_PAGE_SIZE);
self->page_shift = ffs(self->page_size) - 1;
 
-   self->fd0 = hmm_open(0);
+   self->fd0 = hmm_open(variant->device_number0);
+   if (self->fd0 < 0 && hmm_is_coherent_type(variant->device_number0))
+   SKIP(exit(0), "DEVICE_COHERENT not available");
ASSERT_GE(self->fd0, 0);
-   self->fd1 = hmm_open(1);
+   self->fd1 = hmm_open(variant->device_number1);
ASSERT_GE(self->fd1, 0);
 }
 
@@ -211,6 +260,20 @@ static void hmm_nanosleep(unsigned int n)
	nanosleep(&t, NULL);
 }
 
+static int hmm_migrate_sys_to_dev(int fd,
+  struct hmm_buffer *buffer,
+  unsigned long npages)
+{
+   return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_DEV, buffer, npages);
+}
+
+static int hmm_migrate_dev_to_sys(int fd,
+  struct hmm_buffer *buffer,
+  unsigned long npages)
+{
+   return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_SYS, buffer, npages);
+}
+
 /*
  * Simple NULL test of device open/close.
  */
@@ -875,7 +938,7 @@ TEST_F(hmm, migrate)
ptr[i] = i;
 
/* Migrate memory to device. */
-   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);
+   ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
ASSERT_EQ(ret, 0);
ASSERT_EQ(buffer->cpages, npages);
 
@@ -923,7 +986,7 @@ TEST_F(hmm, migrate_fault)
ptr[i] = i;
 
/* Migrate memory to device. */
-   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);
+   ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
ASSERT_EQ(ret, 0);
ASSERT_EQ(buffer->cpages, npages);
 
@@ -936,7 +999,7 @@ TEST_F(hmm, migrate_fault)
ASSERT_EQ(ptr[i], i);
 
/* Migrate memory to the device again. */
-   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRA

[PATCH v6 14/14] tools: add selftests to hmm for COW in device memory

2022-06-27 Thread Alex Sierra
The objective is to test device migration mechanism in pages marked
as COW, for private and coherent device type. In case of writing to
COW private page(s), a page fault will migrate pages back to system
memory first. Then, these pages will be duplicated. In case of COW
device coherent type, pages are duplicated directly from device
memory.

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
---
 tools/testing/selftests/vm/hmm-tests.c | 80 ++
 1 file changed, 80 insertions(+)

diff --git a/tools/testing/selftests/vm/hmm-tests.c 
b/tools/testing/selftests/vm/hmm-tests.c
index bb38b9777610..716b62c05e3d 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -1874,4 +1874,84 @@ TEST_F(hmm, hmm_gup_test)
close(gup_fd);
hmm_buffer_free(buffer);
 }
+
+/*
+ * Test copy-on-write in device pages.
+ * In case of writing to COW private page(s), a page fault will migrate pages
+ * back to system memory first. Then, these pages will be duplicated. In case
+ * of COW device coherent type, pages are duplicated directly from device
+ * memory.
+ */
+TEST_F(hmm, hmm_cow_in_device)
+{
+   struct hmm_buffer *buffer;
+   unsigned long npages;
+   unsigned long size;
+   unsigned long i;
+   int *ptr;
+   int ret;
+   unsigned char *m;
+   pid_t pid;
+   int status;
+
+   npages = 4;
+   size = npages << self->page_shift;
+
+   buffer = malloc(sizeof(*buffer));
+   ASSERT_NE(buffer, NULL);
+
+   buffer->fd = -1;
+   buffer->size = size;
+   buffer->mirror = malloc(size);
+   ASSERT_NE(buffer->mirror, NULL);
+
+   buffer->ptr = mmap(NULL, size,
+  PROT_READ | PROT_WRITE,
+  MAP_PRIVATE | MAP_ANONYMOUS,
+  buffer->fd, 0);
+   ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+   /* Initialize buffer in system memory. */
+   for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+   ptr[i] = i;
+
+   /* Migrate memory to device. */
+
+   ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+   ASSERT_EQ(ret, 0);
+   ASSERT_EQ(buffer->cpages, npages);
+
+   pid = fork();
+   if (pid == -1)
+   ASSERT_EQ(pid, 0);
+   if (!pid) {
+		/* Child process waits for SIGTERM from the parent. */
+   while (1) {
+   }
+   perror("Should not reach this\n");
+   exit(0);
+   }
+   /* Parent process writes to COW pages(s) and gets a
+* new copy in system. In case of device private pages,
+* this write causes a migration to system mem first.
+*/
+   for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+   ptr[i] = i;
+
+   /* Terminate child and wait */
+   EXPECT_EQ(0, kill(pid, SIGTERM));
+   EXPECT_EQ(pid, waitpid(pid, , 0));
+   EXPECT_NE(0, WIFSIGNALED(status));
+   EXPECT_EQ(SIGTERM, WTERMSIG(status));
+
+   /* Take snapshot to CPU pagetables */
+   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+   ASSERT_EQ(ret, 0);
+   ASSERT_EQ(buffer->cpages, npages);
+   m = buffer->mirror;
+   for (i = 0; i < npages; i++)
+   ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[i]);
+
+   hmm_buffer_free(buffer);
+}
 TEST_HARNESS_MAIN
-- 
2.32.0



[PATCH v6 07/14] drm/amdkfd: add SPM support for SVM

2022-06-27 Thread Alex Sierra
When the CPU is connected through XGMI, it has coherent
access to the VRAM resource. In this case that resource
is taken from a table in the device gmc aperture base.
This resource is used along with the device type, which could
be DEVICE_PRIVATE or DEVICE_COHERENT, to create the device
page map region.
Also, the MIGRATE_VMA_SELECT_DEVICE_COHERENT flag is selected for
the coherent type case during migration to device.

Signed-off-by: Alex Sierra 
Reviewed-by: Felix Kuehling 
Signed-off-by: Christoph Hellwig 
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 34 +++-
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index e44376c2ecdc..f73e3e340413 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -671,13 +671,15 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct 
svm_range *prange,
migrate.vma = vma;
migrate.start = start;
migrate.end = end;
-   migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
migrate.pgmap_owner = SVM_ADEV_PGMAP_OWNER(adev);
+   if (adev->gmc.xgmi.connected_to_cpu)
+   migrate.flags = MIGRATE_VMA_SELECT_DEVICE_COHERENT;
+   else
+   migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
 
buf = kvcalloc(npages,
   2 * sizeof(*migrate.src) + sizeof(uint64_t) + 
sizeof(dma_addr_t),
   GFP_KERNEL);
-
if (!buf)
goto out;
 
@@ -947,7 +949,7 @@ int svm_migrate_init(struct amdgpu_device *adev)
 {
struct kfd_dev *kfddev = adev->kfd.dev;
struct dev_pagemap *pgmap;
-   struct resource *res;
+   struct resource *res = NULL;
unsigned long size;
void *r;
 
@@ -962,28 +964,34 @@ int svm_migrate_init(struct amdgpu_device *adev)
 * should remove reserved size
 */
size = ALIGN(adev->gmc.real_vram_size, 2ULL << 20);
-	res = devm_request_free_mem_region(adev->dev, &iomem_resource, size);
-   if (IS_ERR(res))
-   return -ENOMEM;
+   if (adev->gmc.xgmi.connected_to_cpu) {
+   pgmap->range.start = adev->gmc.aper_base;
+   pgmap->range.end = adev->gmc.aper_base + adev->gmc.aper_size - 
1;
+   pgmap->type = MEMORY_DEVICE_COHERENT;
+   } else {
+		res = devm_request_free_mem_region(adev->dev, &iomem_resource, 
size);
+   if (IS_ERR(res))
+   return -ENOMEM;
+   pgmap->range.start = res->start;
+   pgmap->range.end = res->end;
+   pgmap->type = MEMORY_DEVICE_PRIVATE;
+   }
 
-   pgmap->type = MEMORY_DEVICE_PRIVATE;
pgmap->nr_range = 1;
-   pgmap->range.start = res->start;
-   pgmap->range.end = res->end;
 	pgmap->ops = &svm_migrate_pgmap_ops;
pgmap->owner = SVM_ADEV_PGMAP_OWNER(adev);
-   pgmap->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
-
+   pgmap->flags = 0;
/* Device manager releases device-specific resources, memory region and
 * pgmap when driver disconnects from device.
 */
r = devm_memremap_pages(adev->dev, pgmap);
if (IS_ERR(r)) {
pr_err("failed to register HMM device memory\n");
-
/* Disable SVM support capability */
pgmap->type = 0;
-   devm_release_mem_region(adev->dev, res->start, 
resource_size(res));
+   if (pgmap->type == MEMORY_DEVICE_PRIVATE)
+   devm_release_mem_region(adev->dev, res->start,
+   res->end - res->start + 1);
return PTR_ERR(r);
}
 
-- 
2.32.0



[PATCH v6 08/14] lib: test_hmm add ioctl to get zone device type

2022-06-27 Thread Alex Sierra
A new ioctl command was added to query the zone device type. This will
be used once test_hmm adds the zone device coherent type.

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
---
 lib/test_hmm.c  | 11 +--
 lib/test_hmm_uapi.h | 14 ++
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index cfe632047839..915ef6b5b0d4 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -87,6 +87,7 @@ struct dmirror_chunk {
 struct dmirror_device {
struct cdev cdevice;
struct hmm_devmem   *devmem;
+   unsigned intzone_device_type;
 
unsigned intdevmem_capacity;
unsigned intdevmem_count;
@@ -1260,14 +1261,20 @@ static void dmirror_device_remove(struct dmirror_device 
*mdevice)
 static int __init hmm_dmirror_init(void)
 {
int ret;
-   int id;
+   int id = 0;
+   int ndevices = 0;
 
	ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES,
  "HMM_DMIRROR");
if (ret)
goto err_unreg;
 
-   for (id = 0; id < DMIRROR_NDEVICES; id++) {
+   memset(dmirror_devices, 0, DMIRROR_NDEVICES * 
sizeof(dmirror_devices[0]));
+   dmirror_devices[ndevices++].zone_device_type =
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
+   dmirror_devices[ndevices++].zone_device_type =
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
+   for (id = 0; id < ndevices; id++) {
ret = dmirror_device_init(dmirror_devices + id, id);
if (ret)
goto err_chrdev;
diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h
index f14dea5dcd06..0511af7464ee 100644
--- a/lib/test_hmm_uapi.h
+++ b/lib/test_hmm_uapi.h
@@ -31,10 +31,11 @@ struct hmm_dmirror_cmd {
 /* Expose the address space of the calling process through hmm device file */
 #define HMM_DMIRROR_READ   _IOWR('H', 0x00, struct hmm_dmirror_cmd)
 #define HMM_DMIRROR_WRITE  _IOWR('H', 0x01, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_MIGRATE_IOWR('H', 0x02, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_SNAPSHOT   _IOWR('H', 0x03, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_EXCLUSIVE  _IOWR('H', 0x04, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_CHECK_EXCLUSIVE_IOWR('H', 0x05, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_MIGRATE_TO_DEV _IOWR('H', 0x02, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_MIGRATE_TO_SYS _IOWR('H', 0x03, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_SNAPSHOT   _IOWR('H', 0x04, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_EXCLUSIVE  _IOWR('H', 0x05, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_CHECK_EXCLUSIVE_IOWR('H', 0x06, struct hmm_dmirror_cmd)
 
 /*
  * Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT.
@@ -62,4 +63,9 @@ enum {
HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE = 0x30,
 };
 
+enum {
+   /* 0 is reserved to catch uninitialized type fields */
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE = 1,
+};
+
 #endif /* _LIB_TEST_HMM_UAPI_H */
-- 
2.32.0



[PATCH v6 06/14] mm: add device coherent checker to is_pinnable_page

2022-06-27 Thread Alex Sierra
An is_device_coherent check was added to is_pinnable_page, which was
renamed to is_longterm_pinnable_page. The reason is that device
coherent pages are not supported for long-term pinning.

Signed-off-by: Alex Sierra 
---
 include/linux/memremap.h | 25 +
 include/linux/mm.h   | 24 
 mm/gup.c |  5 ++---
 mm/gup_test.c|  4 ++--
 mm/hugetlb.c |  2 +-
 5 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 9f752ebed613..6fc0ced64b2d 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -169,6 +169,31 @@ static inline bool is_pci_p2pdma_page(const struct page 
*page)
page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
 }
 
+/* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */
+#ifdef CONFIG_MIGRATION
+static inline bool is_longterm_pinnable_page(struct page *page)
+{
+#ifdef CONFIG_CMA
+   int mt = get_pageblock_migratetype(page);
+
+   if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE)
+   return false;
+#endif
+   return !(is_device_coherent_page(page) ||
+is_zone_movable_page(page) ||
+is_zero_pfn(page_to_pfn(page)));
+}
+#else
+static inline bool is_longterm_pinnable_page(struct page *page)
+{
+   return true;
+}
+#endif
+static inline bool folio_is_longterm_pinnable(struct folio *folio)
+{
+	return is_longterm_pinnable_page(&folio->page);
+}
+
 #ifdef CONFIG_ZONE_DEVICE
 void *memremap_pages(struct dev_pagemap *pgmap, int nid);
 void memunmap_pages(struct dev_pagemap *pgmap);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f6f5d48c1934..b91a4a1f260b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1590,30 +1590,6 @@ static inline bool page_needs_cow_for_dma(struct 
vm_area_struct *vma,
return page_maybe_dma_pinned(page);
 }
 
-/* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */
-#ifdef CONFIG_MIGRATION
-static inline bool is_pinnable_page(struct page *page)
-{
-#ifdef CONFIG_CMA
-   int mt = get_pageblock_migratetype(page);
-
-   if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE)
-   return false;
-#endif
-   return !is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page));
-}
-#else
-static inline bool is_pinnable_page(struct page *page)
-{
-   return true;
-}
-#endif
-
-static inline bool folio_is_pinnable(struct folio *folio)
-{
-	return is_pinnable_page(&folio->page);
-}
-
 static inline void set_page_zone(struct page *page, enum zone_type zone)
 {
page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
diff --git a/mm/gup.c b/mm/gup.c
index c29a7b5fbbfd..ada73b775a82 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -133,8 +133,7 @@ struct folio *try_grab_folio(struct page *page, int refs, 
unsigned int flags)
 * path.
 */
if (unlikely((flags & FOLL_LONGTERM) &&
-(!is_pinnable_page(page) ||
-		     is_device_coherent_page(page))))
+   !is_longterm_pinnable_page(page)))
return NULL;
 
/*
@@ -1931,7 +1930,7 @@ static long check_and_migrate_movable_pages(unsigned long 
nr_pages,
continue;
}
 
-   if (folio_is_pinnable(folio))
+   if (folio_is_longterm_pinnable(folio))
continue;
/*
 * Try to move out any movable page before pinning the range.
diff --git a/mm/gup_test.c b/mm/gup_test.c
index d974dec19e1c..9d705ba6737e 100644
--- a/mm/gup_test.c
+++ b/mm/gup_test.c
@@ -1,5 +1,5 @@
 #include 
-#include 
+#include 
 #include 
 #include 
 #include 
@@ -53,7 +53,7 @@ static void verify_dma_pinned(unsigned int cmd, struct page 
**pages,
dump_page(page, "gup_test failure");
break;
} else if (cmd == PIN_LONGTERM_BENCHMARK &&
-   WARN(!is_pinnable_page(page),
+   WARN(!is_longterm_pinnable_page(page),
 "pages[%lu] is NOT pinnable but pinned\n",
 i)) {
dump_page(page, "gup_test failure");
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a57e1be41401..368fd33787b0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1135,7 +1135,7 @@ static struct page *dequeue_huge_page_node_exact(struct 
hstate *h, int nid)
 
 	lockdep_assert_held(&hugetlb_lock);
 	list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
-   if (pin && !is_pinnable_page(page))
+   if (pin && !is_longterm_pinnable_page(page))
continue;
 
if (PageHWPoison(page))
-- 
2.32.0



[PATCH v6 09/14] lib: test_hmm add module param for zone device type

2022-06-27 Thread Alex Sierra
In order to configure device coherent memory in test_hmm, two module
parameters should be passed, corresponding to the SP start address of
each device: spm_addr_dev0 & spm_addr_dev1 (e.g. modprobe test_hmm
spm_addr_dev0=0x100000000 spm_addr_dev1=0x140000000). If no parameters
are passed, the private device type is configured.

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
---
 lib/test_hmm.c  | 73 -
 lib/test_hmm_uapi.h |  1 +
 2 files changed, 53 insertions(+), 21 deletions(-)

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 915ef6b5b0d4..afb30af9f3ff 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -37,6 +37,16 @@
 #define DEVMEM_CHUNK_SIZE  (256 * 1024 * 1024U)
 #define DEVMEM_CHUNKS_RESERVE  16
 
+static unsigned long spm_addr_dev0;
+module_param(spm_addr_dev0, long, 0644);
+MODULE_PARM_DESC(spm_addr_dev0,
+   "Specify start address for SPM (special purpose memory) used 
for device 0. By setting this Coherent device type will be used. Make sure 
spm_addr_dev1 is set too. Minimum SPM size should be DEVMEM_CHUNK_SIZE.");
+
+static unsigned long spm_addr_dev1;
+module_param(spm_addr_dev1, long, 0644);
+MODULE_PARM_DESC(spm_addr_dev1,
+   "Specify start address for SPM (special purpose memory) used 
for device 1. By setting this Coherent device type will be used. Make sure 
spm_addr_dev0 is set too. Minimum SPM size should be DEVMEM_CHUNK_SIZE.");
+
 static const struct dev_pagemap_ops dmirror_devmem_ops;
 static const struct mmu_interval_notifier_ops dmirror_min_ops;
 static dev_t dmirror_dev;
@@ -455,28 +465,44 @@ static int dmirror_write(struct dmirror *dmirror, struct 
hmm_dmirror_cmd *cmd)
return ret;
 }
 
-static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
+static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
   struct page **ppage)
 {
struct dmirror_chunk *devmem;
-   struct resource *res;
+   struct resource *res = NULL;
unsigned long pfn;
unsigned long pfn_first;
unsigned long pfn_last;
void *ptr;
+   int ret = -ENOMEM;
 
devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
if (!devmem)
-   return false;
+   return ret;
 
-	res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
- "hmm_dmirror");
-   if (IS_ERR(res))
+   switch (mdevice->zone_device_type) {
+   case HMM_DMIRROR_MEMORY_DEVICE_PRIVATE:
+		res = request_free_mem_region(&iomem_resource, 
DEVMEM_CHUNK_SIZE,
+ "hmm_dmirror");
+   if (IS_ERR_OR_NULL(res))
+   goto err_devmem;
+   devmem->pagemap.range.start = res->start;
+   devmem->pagemap.range.end = res->end;
+   devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
+   break;
+   case HMM_DMIRROR_MEMORY_DEVICE_COHERENT:
+   devmem->pagemap.range.start = (MINOR(mdevice->cdevice.dev) - 2) 
?
+   spm_addr_dev0 :
+   spm_addr_dev1;
+   devmem->pagemap.range.end = devmem->pagemap.range.start +
+   DEVMEM_CHUNK_SIZE - 1;
+   devmem->pagemap.type = MEMORY_DEVICE_COHERENT;
+   break;
+   default:
+   ret = -EINVAL;
goto err_devmem;
+   }
 
-   devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
-   devmem->pagemap.range.start = res->start;
-   devmem->pagemap.range.end = res->end;
devmem->pagemap.nr_range = 1;
 	devmem->pagemap.ops = &dmirror_devmem_ops;
devmem->pagemap.owner = mdevice;
@@ -497,10 +523,14 @@ static bool dmirror_allocate_chunk(struct dmirror_device 
*mdevice,
mdevice->devmem_capacity = new_capacity;
mdevice->devmem_chunks = new_chunks;
}
-
	ptr = memremap_pages(&devmem->pagemap, numa_node_id());
-   if (IS_ERR(ptr))
+   if (IS_ERR_OR_NULL(ptr)) {
+   if (ptr)
+   ret = PTR_ERR(ptr);
+   else
+   ret = -EFAULT;
goto err_release;
+   }
 
devmem->mdevice = mdevice;
pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT;
@@ -529,15 +559,17 @@ static bool dmirror_allocate_chunk(struct dmirror_device 
*mdevice,
}
 	spin_unlock(&mdevice->lock);
 
-   return true;
+   return 0;
 
 err_release:
mutex_unlock(>devmem_lock);
-   release_mem_region(devmem->pagemap.range.start, 
range_len(&devmem->pagemap.range));
+   if (res && devmem->pagemap.type == MEMORY_DEVICE_PRIVATE)
+   release_mem_region(devme

[PATCH v6 03/14] mm: add device coherent vma selection for memory migration

2022-06-27 Thread Alex Sierra
This case is used to migrate pages from device memory back to system
memory. Device coherent type memory is cache coherent from the device
and CPU point of view.
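
On the driver side, selecting coherent pages for migration back to
system memory looks roughly like this (a sketch mirroring the amdkfd
svm_migrate_vma_to_ram() change later in this series; the helper name
is illustrative):

  static void setup_migrate_to_ram(struct migrate_vma *migrate,
                                   struct vm_area_struct *vma,
                                   unsigned long start, unsigned long end,
                                   void *pgmap_owner)
  {
          migrate->vma = vma;
          migrate->start = start;
          migrate->end = end;
          /* only collect pages owned by this driver's pagemap */
          migrate->pgmap_owner = pgmap_owner;
          migrate->flags = MIGRATE_VMA_SELECT_DEVICE_COHERENT;
  }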

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
---
 include/linux/migrate.h |  1 +
 mm/migrate_device.c | 12 +---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 069a89e847f3..b84908debe5c 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -148,6 +148,7 @@ static inline unsigned long migrate_pfn(unsigned long pfn)
 enum migrate_vma_direction {
MIGRATE_VMA_SELECT_SYSTEM = 1 << 0,
MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1,
+   MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2,
 };
 
 struct migrate_vma {
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index a4847ad65da3..18bc6483f63a 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -148,15 +148,21 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
if (is_writable_device_private_entry(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {
-   if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
-   goto next;
pfn = pte_pfn(pte);
-   if (is_zero_pfn(pfn)) {
+   if (is_zero_pfn(pfn) &&
+   (migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
mpfn = MIGRATE_PFN_MIGRATE;
migrate->cpages++;
goto next;
}
page = vm_normal_page(migrate->vma, addr, pte);
+   if (page && !is_zone_device_page(page) &&
+   !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
+   goto next;
+   else if (page && is_device_coherent_page(page) &&
+   (!(migrate->flags & 
MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
+page->pgmap->owner != migrate->pgmap_owner))
+   goto next;
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
-- 
2.32.0



[PATCH v6 04/14] mm: remove the vma check in migrate_vma_setup()

2022-06-27 Thread Alex Sierra
From: Alistair Popple 

migrate_vma_setup() checks that a valid vma is passed so that the page
tables can be walked to find the pfns associated with a given address
range. However in some cases the pfns are already known, such as when
migrating device coherent pages during pin_user_pages() meaning a valid
vma isn't required.
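
A condensed sketch of such a caller (this is essentially what
migrate_device_page() in the next patch does; locking kept, failure
handling trimmed, and it relies on the relaxed checks this patch adds):

  static int migrate_known_page(struct page *page)
  {
          unsigned long src_pfn, dst_pfn = 0;
          struct migrate_vma args = { 0 };

          lock_page(page);
          src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE;
          args.src = &src_pfn;
          args.dst = &dst_pfn;
          args.cpages = 1;
          args.npages = 1;
          args.vma = NULL;        /* pfn already known, no page-table walk */

          return migrate_vma_setup(&args);
  }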

Signed-off-by: Alistair Popple 
Acked-by: Felix Kuehling 
Signed-off-by: Christoph Hellwig 
---
 mm/migrate_device.c | 34 +-
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 18bc6483f63a..cf9668376c5a 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -486,24 +486,24 @@ int migrate_vma_setup(struct migrate_vma *args)
 
args->start &= PAGE_MASK;
args->end &= PAGE_MASK;
-   if (!args->vma || is_vm_hugetlb_page(args->vma) ||
-   (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
-   return -EINVAL;
-   if (nr_pages <= 0)
-   return -EINVAL;
-   if (args->start < args->vma->vm_start ||
-   args->start >= args->vma->vm_end)
-   return -EINVAL;
-   if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
-   return -EINVAL;
if (!args->src || !args->dst)
return -EINVAL;
-
-   memset(args->src, 0, sizeof(*args->src) * nr_pages);
-   args->cpages = 0;
-   args->npages = 0;
-
-   migrate_vma_collect(args);
+   if (args->vma) {
+   if (is_vm_hugetlb_page(args->vma) ||
+   (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
+   return -EINVAL;
+   if (args->start < args->vma->vm_start ||
+   args->start >= args->vma->vm_end)
+   return -EINVAL;
+   if (args->end <= args->vma->vm_start ||
+   args->end > args->vma->vm_end)
+   return -EINVAL;
+   memset(args->src, 0, sizeof(*args->src) * nr_pages);
+   args->cpages = 0;
+   args->npages = 0;
+
+   migrate_vma_collect(args);
+   }
 
if (args->cpages)
migrate_vma_unmap(args);
@@ -685,7 +685,7 @@ void migrate_vma_pages(struct migrate_vma *migrate)
continue;
}
 
-   if (!page) {
+   if (!page && migrate->vma) {
if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
continue;
if (!notified) {
-- 
2.32.0



[PATCH v6 05/14] mm/gup: migrate device coherent pages when pinning instead of failing

2022-06-27 Thread Alex Sierra
From: Alistair Popple 

Currently any attempts to pin a device coherent page will fail. This is
because device coherent pages need to be managed by a device driver, and
pinning them would prevent a driver from migrating them off the device.

However this is no reason to fail pinning of these pages. These are
coherent and accessible from the CPU so can be migrated just like
pinning ZONE_MOVABLE pages. So instead of failing all attempts to pin
them, first try migrating them out of ZONE_DEVICE.

Signed-off-by: Alistair Popple 
Acked-by: Felix Kuehling 
[hch: rebased to the split device memory checks,
  moved migrate_device_page to migrate_device.c]
Signed-off-by: Christoph Hellwig 
---
 mm/gup.c| 50 +-
 mm/internal.h   |  1 +
 mm/migrate_device.c | 53 +
 3 files changed, 98 insertions(+), 6 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index 48b45bcc8501..c29a7b5fbbfd 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -133,7 +133,8 @@ struct folio *try_grab_folio(struct page *page, int refs, 
unsigned int flags)
 * path.
 */
if (unlikely((flags & FOLL_LONGTERM) &&
-!is_pinnable_page(page)))
+(!is_pinnable_page(page) ||
+		     is_device_coherent_page(page))))
return NULL;
 
/*
@@ -1895,9 +1896,43 @@ static long check_and_migrate_movable_pages(unsigned 
long nr_pages,
continue;
prev_folio = folio;
 
-   if (folio_is_pinnable(folio))
+   /*
+* Device private pages will get faulted in during gup so it
+* shouldn't be possible to see one here.
+*/
+   if (WARN_ON_ONCE(folio_is_device_private(folio))) {
+   ret = -EFAULT;
+   goto unpin_pages;
+   }
+
+   /*
+* Device coherent pages are managed by a driver and should not
+* be pinned indefinitely as it prevents the driver moving the
+* page. So when trying to pin with FOLL_LONGTERM instead try
+* to migrate the page out of device memory.
+*/
+   if (folio_is_device_coherent(folio)) {
+			WARN_ON_ONCE(PageCompound(&folio->page));
+
+   /*
+* Migration will fail if the page is pinned, so convert
+* the pin on the source page to a normal reference.
+*/
+   if (gup_flags & FOLL_PIN) {
+				get_page(&folio->page);
+				unpin_user_page(&folio->page);
+   }
+
+			pages[i] = migrate_device_page(&folio->page, gup_flags);
+   if (!pages[i]) {
+   ret = -EBUSY;
+   goto unpin_pages;
+   }
continue;
+   }
 
+   if (folio_is_pinnable(folio))
+   continue;
/*
 * Try to move out any movable page before pinning the range.
 */
@@ -1933,10 +1968,13 @@ static long check_and_migrate_movable_pages(unsigned 
long nr_pages,
return nr_pages;
 
 unpin_pages:
-   if (gup_flags & FOLL_PIN) {
-   unpin_user_pages(pages, nr_pages);
-   } else {
-   for (i = 0; i < nr_pages; i++)
+   for (i = 0; i < nr_pages; i++) {
+   if (!pages[i])
+   continue;
+
+   if (gup_flags & FOLL_PIN)
+   unpin_user_page(pages[i]);
+   else
put_page(pages[i]);
}
 
diff --git a/mm/internal.h b/mm/internal.h
index c0f8fbe0445b..eeab4ee7a4a3 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -853,6 +853,7 @@ int numa_migrate_prep(struct page *page, struct 
vm_area_struct *vma,
  unsigned long addr, int page_nid, int *flags);
 
 void free_zone_device_page(struct page *page);
+struct page *migrate_device_page(struct page *page, unsigned int gup_flags);
 
 /*
  * mm/gup.c
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index cf9668376c5a..5decd26dd551 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -794,3 +794,56 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
}
 }
 EXPORT_SYMBOL(migrate_vma_finalize);
+
+/*
+ * Migrate a device coherent page back to normal memory.  The caller should 
have
+ * a reference on page which will be copied to the new page if migration is
+ * successful or dropped on failure.
+ */
+struct page *migrate_device_page(struct page *page, unsigned int gup_flags)
+{
+   unsigned long src_pfn, dst_pfn = 0;
+   struct migrate_vma args;
+   struct page 

[PATCH v6 02/14] mm: handling Non-LRU pages returned by vm_normal_pages

2022-06-27 Thread Alex Sierra
With DEVICE_COHERENT, we'll soon have vm_normal_pages() return
device-managed anonymous pages that are not LRU pages. Although they
behave like normal pages for purposes of mapping in CPU page tables
and for COW, they do not support LRU lists, NUMA migration or THP.

We also introduced a FOLL_LRU flag that adds the same behaviour to
follow_page and related APIs, to allow callers to specify that they
expect to put pages on an LRU list.
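
A sketch of such a caller: with FOLL_LRU set, follow_page() returns
ERR_PTR(-EEXIST) for device pages (see the follow_page_pte() hunk
below), so the caller simply skips them. The function name here is
illustrative:

  static void scan_one_page(struct vm_area_struct *vma, unsigned long addr)
  {
          struct page *page;

          page = follow_page(vma, addr, FOLL_GET | FOLL_LRU);
          if (IS_ERR_OR_NULL(page))
                  return;         /* device page, or nothing mapped */
          /* ... safe to treat as an LRU page here ... */
          put_page(page);
  }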

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
---
 fs/proc/task_mmu.c | 2 +-
 include/linux/mm.h | 3 ++-
 mm/gup.c   | 6 +-
 mm/huge_memory.c   | 2 +-
 mm/khugepaged.c| 9 ++---
 mm/ksm.c   | 6 +++---
 mm/madvise.c   | 4 ++--
 mm/memory.c| 9 -
 mm/mempolicy.c | 2 +-
 mm/migrate.c   | 4 ++--
 mm/mlock.c | 2 +-
 mm/mprotect.c  | 2 +-
 12 files changed, 33 insertions(+), 18 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 2d04e3470d4c..2dd8c8a66924 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1792,7 +1792,7 @@ static struct page *can_gather_numa_stats(pte_t pte, 
struct vm_area_struct *vma,
return NULL;
 
page = vm_normal_page(vma, addr, pte);
-   if (!page)
+   if (!page || is_zone_device_page(page))
return NULL;
 
if (PageReserved(page))
diff --git a/include/linux/mm.h b/include/linux/mm.h
index cf3d0d673f6b..f6f5d48c1934 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -601,7 +601,7 @@ struct vm_operations_struct {
 #endif
/*
 * Called by vm_normal_page() for special PTEs to find the
-* page for @addr.  This is useful if the default behavior
+* page for @addr. This is useful if the default behavior
 * (using pte_page()) would not find the correct page.
 */
struct page *(*find_special_page)(struct vm_area_struct *vma,
@@ -2934,6 +2934,7 @@ struct page *follow_page(struct vm_area_struct *vma, 
unsigned long address,
 #define FOLL_NUMA  0x200   /* force NUMA hinting page fault */
 #define FOLL_MIGRATION 0x400   /* wait for page to replace migration entry */
 #define FOLL_TRIED 0x800   /* a retry, previous pass started an IO */
+#define FOLL_LRU0x1000  /* return only LRU (anon or page cache) */
 #define FOLL_REMOTE0x2000  /* we are working on non-current tsk/mm */
 #define FOLL_COW   0x4000  /* internal GUP flag */
 #define FOLL_ANON  0x8000  /* don't do file mappings */
diff --git a/mm/gup.c b/mm/gup.c
index 551264407624..48b45bcc8501 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -532,7 +532,11 @@ static struct page *follow_page_pte(struct vm_area_struct 
*vma,
}
 
page = vm_normal_page(vma, address, pte);
-   if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
+   if ((flags & FOLL_LRU) && ((page && is_zone_device_page(page)) ||
+	    (!page && pte_devmap(pte)))) {
+   page = ERR_PTR(-EEXIST);
+   goto out;
+   } else if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) 
{
/*
 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
 * case since they are only valid while holding the pgmap
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 834f288b3769..d242184ab169 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2906,7 +2906,7 @@ static int split_huge_pages_pid(int pid, unsigned long 
vaddr_start,
}
 
/* FOLL_DUMP to ignore special (like zero) pages */
-   page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
+   page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
 
if (IS_ERR(page))
continue;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 16be62d493cd..671ac7800e53 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -618,7 +618,7 @@ static int __collapse_huge_page_isolate(struct 
vm_area_struct *vma,
goto out;
}
page = vm_normal_page(vma, address, pteval);
-   if (unlikely(!page)) {
+   if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out;
}
@@ -1267,7 +1267,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
writable = true;
 
page = vm_normal_page(vma, _address, pteval);
-   if (unlikely(!page)) {
+   if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out_unmap;
}
@@ -1479,7 +1479,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, 
unsigned long addr)
   

[PATCH v6 01/14] mm: add zone device coherent type memory support

2022-06-27 Thread Alex Sierra
Device memory that is cache coherent from device and CPU point of view.
This is used on platforms that have an advanced system bus (like CAPI
or CXL). Any page of a process can be migrated to such memory. However,
no one should be allowed to pin such memory so that it can always be
evicted.
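
A hedged sketch of how a driver would hotplug CPU-coherent device
memory with the new type (this mirrors the amdkfd svm_migrate_init()
change later in this series; my_pgmap_ops is a placeholder for the
driver's dev_pagemap_ops, which must provide page_free()):

  static void *hotplug_coherent_memory(struct device *dev,
                                       struct dev_pagemap *pgmap,
                                       u64 base, u64 size, void *owner)
  {
          pgmap->type = MEMORY_DEVICE_COHERENT;
          pgmap->range.start = base;          /* CPU-addressable aperture */
          pgmap->range.end = base + size - 1;
          pgmap->nr_range = 1;
          pgmap->ops = &my_pgmap_ops;         /* placeholder */
          pgmap->owner = owner;

          return devm_memremap_pages(dev, pgmap);
  }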

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
[hch: rebased ontop of the refcount changes,
  removed is_dev_private_or_coherent_page]
Signed-off-by: Christoph Hellwig 
---
 include/linux/memremap.h | 19 +++
 mm/memcontrol.c  |  7 ---
 mm/memory-failure.c  |  8 ++--
 mm/memremap.c| 10 ++
 mm/migrate_device.c  | 16 +++-
 mm/rmap.c|  5 +++--
 6 files changed, 49 insertions(+), 16 deletions(-)

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 8af304f6b504..9f752ebed613 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -41,6 +41,13 @@ struct vmem_altmap {
  * A more complete discussion of unaddressable memory may be found in
  * include/linux/hmm.h and Documentation/vm/hmm.rst.
  *
+ * MEMORY_DEVICE_COHERENT:
+ * Device memory that is cache coherent from device and CPU point of view. This
+ * is used on platforms that have an advanced system bus (like CAPI or CXL). A
+ * driver can hotplug the device memory using ZONE_DEVICE and with that memory
+ * type. Any page of a process can be migrated to such memory. However no one
+ * should be allowed to pin such memory so that it can always be evicted.
+ *
  * MEMORY_DEVICE_FS_DAX:
  * Host memory that has similar access semantics as System RAM i.e. DMA
  * coherent and supports page pinning. In support of coordinating page
@@ -61,6 +68,7 @@ struct vmem_altmap {
 enum memory_type {
/* 0 is reserved to catch uninitialized type fields */
MEMORY_DEVICE_PRIVATE = 1,
+   MEMORY_DEVICE_COHERENT,
MEMORY_DEVICE_FS_DAX,
MEMORY_DEVICE_GENERIC,
MEMORY_DEVICE_PCI_P2PDMA,
@@ -143,6 +151,17 @@ static inline bool folio_is_device_private(const struct 
folio *folio)
return is_device_private_page(&folio->page);
 }
 
+static inline bool is_device_coherent_page(const struct page *page)
+{
+   return is_zone_device_page(page) &&
+   page->pgmap->type == MEMORY_DEVICE_COHERENT;
+}
+
+static inline bool folio_is_device_coherent(const struct folio *folio)
+{
+   return is_device_coherent_page(&folio->page);
+}
+
 static inline bool is_pci_p2pdma_page(const struct page *page)
 {
return IS_ENABLED(CONFIG_PCI_P2PDMA) &&
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 618c366a2f07..5d37a85c67da 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5665,8 +5665,8 @@ static int mem_cgroup_move_account(struct page *page,
  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
  * target for charge migration. if @target is not NULL, the entry is stored
  * in target->ent.
- *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is 
MEMORY_DEVICE_PRIVATE
- * (so ZONE_DEVICE page and thus not on the lru).
+ *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is device memory and
+ *   thus not on the lru.
  * For now we such page is charge like a regular page would be as for all
  * intent and purposes it is just special memory taking the place of a
  * regular page.
@@ -5704,7 +5704,8 @@ static enum mc_target_type get_mctgt_type(struct 
vm_area_struct *vma,
 */
if (page_memcg(page) == mc.from) {
ret = MC_TARGET_PAGE;
-   if (is_device_private_page(page))
+   if (is_device_private_page(page) ||
+   is_device_coherent_page(page))
ret = MC_TARGET_DEVICE;
if (target)
target->page = page;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index da39ec8afca8..79f175eeb190 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1685,12 +1685,16 @@ static int memory_failure_dev_pagemap(unsigned long 
pfn, int flags,
goto unlock;
}
 
-   if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+   switch (pgmap->type) {
+   case MEMORY_DEVICE_PRIVATE:
+   case MEMORY_DEVICE_COHERENT:
/*
-* TODO: Handle HMM pages which may need coordination
+* TODO: Handle device pages which may need coordination
 * with device-side memory.
 */
goto unlock;
+   default:
+   break;
}
 
/*
diff --git a/mm/memremap.c b/mm/memremap.c
index b870a659eee6..0f8f08f8a991 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -315,6 +315,16 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
return ERR_PTR(-EINVAL);
 

[PATCH v6 00/14] Add MEMORY_DEVICE_COHERENT for coherent device memory mapping

2022-06-27 Thread Alex Sierra
This is our MEMORY_DEVICE_COHERENT patch series rebased and updated
for current 5.19.0-rc4

Changes since the last version:
- Fixed problems with migration during long-term pinning in
get_user_pages
- Open coded vm_normal_lru_pages as suggested in previous code review
- Update hmm_gup_test with more get_user_pages calls, include
hmm_cow_in_device in hmm-test.

This patch series introduces MEMORY_DEVICE_COHERENT, a type of memory
owned by a device that can be mapped into CPU page tables like
MEMORY_DEVICE_GENERIC and can also be migrated like
MEMORY_DEVICE_PRIVATE.

This patch series is mostly self-contained except for a few places where
it needs to update other subsystems to handle the new memory type.

System stability and performance are not affected according to our
ongoing testing, including xfstests.

How it works: The system BIOS advertises the GPU device memory
(aka VRAM) as SPM (special purpose memory) in the UEFI system address
map.

The amdgpu driver registers the memory with devmap as
MEMORY_DEVICE_COHERENT using devm_memremap_pages. The initial user for
this hardware page migration capability is the Frontier supercomputer
project. This functionality is not AMD-specific. We expect other GPU
vendors to find this functionality useful, and possibly other hardware
types in the future.
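
As a rough sketch of that registration step (condensed from
svm_migrate_init() in the amdkfd patch of this series; error handling
trimmed):

        struct dev_pagemap *pgmap = &kfddev->pgmap;
        void *r;

        pgmap->type = MEMORY_DEVICE_COHERENT;      /* the new memory type */
        pgmap->range.start = adev->gmc.aper_base;  /* CPU-visible VRAM */
        pgmap->range.end = adev->gmc.aper_base + adev->gmc.aper_size - 1;
        pgmap->nr_range = 1;
        pgmap->ops = &svm_migrate_pgmap_ops;       /* provides page_free() */
        pgmap->owner = SVM_ADEV_PGMAP_OWNER(adev);
        r = devm_memremap_pages(adev->dev, pgmap); /* ERR_PTR() on failure */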

Our test nodes in the lab are similar to the Frontier configuration,
with .5 TB of system memory plus 256 GB of device memory split across
4 GPUs, all in a single coherent address space. Page migration is
expected to improve application efficiency significantly. We will
report empirical results as they become available.

Coherent device type pages at gup are now migrated back to system
memory if they are being pinned long-term (FOLL_LONGTERM). The reason
is that long-term pinning would interfere with the device memory
manager owning the device-coherent pages (e.g. evictions in TTM).
This series incorporates Alistair Popple's patches to do this
migration from pin_user_pages() calls. hmm_gup_test has been added to
hmm-test to test different get_user_pages calls.
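
In kernel terms, a long-term pin on such a page now behaves roughly as
in the sketch below (not taken verbatim from the series; error handling
omitted):

        struct page *page;
        int ret;

        /*
         * If addr maps a device-coherent page, gup migrates it to
         * system memory and pins the migrated copy, instead of
         * failing the pin as it would for other ZONE_DEVICE pages.
         */
        ret = pin_user_pages_fast(addr, 1, FOLL_WRITE | FOLL_LONGTERM, &page);
        if (ret == 1)
                unpin_user_page(page);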

This series includes handling of device-managed anonymous pages
returned by vm_normal_pages. Although they behave like normal pages
for purposes of mapping in CPU page tables and for COW, they do not
support LRU lists, NUMA migration or THP.

We also introduced a FOLL_LRU flag that adds the same behaviour to
follow_page and related APIs, to allow callers to specify that they
expect to put pages on an LRU list.
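
A caller that can only deal with LRU pages then follows the same
pattern as the split_huge_pages/khugepaged hunks above (sketch):

        /* FOLL_LRU: device pages come back as ERR_PTR(-EEXIST) */
        page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
        if (IS_ERR(page))
                continue;       /* device page, skip it */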

v2:
- Rebase to latest 5.18-rc7.
- Drop patch "mm: add device coherent checker to remove migration pte"
and modify try_to_migrate_one, to let DEVICE_COHERENT pages fall
through to normal page path. Based on Alistair Popple's comment.
- Fix comment formatting.
- Reword comment in vm_normal_page about pte_devmap().
- Merge "drm/amdkfd: coherent type as sys mem on migration to ram" to
"drm/amdkfd: add SPM support for SVM".

v3:
- Rebase to latest 5.18.0.
- Patch "mm: handling Non-LRU pages returned by vm_normal_pages"
reordered.
- Add WARN_ON_ONCE for thp device coherent case.

v4:
- Rebase to latest 5.18.0
- Fix consistency between pages with FOLL_LRU flag set and pte_devmap
at follow_page_pte.

v5:
- Remove unused zone_device_type from lib/test_hmm and
selftest/vm/hmm-test.c.

v6:
- Rebase to 5.19.0-rc4
- Rename is_pinnable_page to is_longterm_pinnable_page and add a
coherent device checker.
- Add a new gup test to hmm-test to cover fast pinnable case with
FOLL_LONGTERM

Alex Sierra (12):
  mm: add zone device coherent type memory support
  mm: handling Non-LRU pages returned by vm_normal_pages
  mm: add device coherent vma selection for memory migration
  mm: add device coherent checker to is_pinnable_page
  drm/amdkfd: add SPM support for SVM
  lib: test_hmm add ioctl to get zone device type
  lib: test_hmm add module param for zone device type
  lib: add support for device coherent type in test_hmm
  tools: update hmm-test to support device coherent type
  tools: update test_hmm script to support SP config
  tools: add hmm gup tests for device coherent type
  tools: add selftests to hmm for COW in device memory

Alistair Popple (2):
  mm: remove the vma check in migrate_vma_setup()
  mm/gup: migrate device coherent pages when pinning instead of failing

 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |  34 ++-
 fs/proc/task_mmu.c   |   2 +-
 include/linux/memremap.h |  44 +++
 include/linux/migrate.h  |   1 +
 include/linux/mm.h   |  27 +-
 lib/test_hmm.c   | 337 +--
 lib/test_hmm_uapi.h  |  19 +-
 mm/gup.c |  55 +++-
 mm/gup_test.c|   4 +-
 mm/huge_memory.c |   2 +-
 mm/hugetlb.c |   2 +-
 mm/internal.h|   1 +
 mm/khugepaged.c

[PATCH 2/2] drm/amdkfd: track unified memory reservation with xnack off

2022-06-07 Thread Alex Sierra
[WHY]
Unified memory with xnack off should be tracked, as userptr mappings
and legacy allocations are, to avoid oversubscribing system memory
when xnack is off.
[HOW]
Expose the functions reserve_mem_limit and unreserve_mem_limit to the
SVM API and call them on every prange creation and free.
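
The kfd_svm.c call sites (not included in the quoted diff below) pair
up roughly as in this sketch; treating the range as a userptr-style
system memory allocation is an assumption of the sketch:

        /* on prange creation: reserve against the system memory limit */
        r = amdgpu_amdkfd_reserve_mem_limit(NULL, size,
                                            KFD_IOC_ALLOC_MEM_FLAGS_USERPTR);
        if (r)
                return r;

        /* on prange free: release the same accounting */
        amdgpu_amdkfd_unreserve_mem_limit(NULL, size,
                                          KFD_IOC_ALLOC_MEM_FLAGS_USERPTR);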

Signed-off-by: Alex Sierra 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  4 ++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 27 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c  | 47 +--
 3 files changed, 51 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index f8b9f27adcf5..f55f34af6480 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -301,6 +301,10 @@ bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device 
*adev, struct kgd_mem *
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
 int amdgpu_amdkfd_criu_resume(void *p);
 bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev);
+int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
+   uint64_t size, u32 alloc_flag);
+void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
+   uint64_t size, u32 alloc_flag);
 
 #if IS_ENABLED(CONFIG_HSA_AMD)
 void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 50730d2132a6..f13977ae4579 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -122,7 +122,7 @@ void amdgpu_amdkfd_reserve_system_mem(uint64_t size)
  *
  * Return: returns -ENOMEM in case of error, ZERO otherwise
  */
-static int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
+int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag)
 {
uint64_t reserved_for_pt =
@@ -157,8 +157,8 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
 kfd_mem_limit.max_system_mem_limit && !no_system_mem_limit) ||
(kfd_mem_limit.ttm_mem_used + ttm_mem_needed >
 kfd_mem_limit.max_ttm_mem_limit) ||
-   (adev->kfd.vram_used + vram_needed >
-adev->gmc.real_vram_size - reserved_for_pt)) {
+   (adev && (adev->kfd.vram_used + vram_needed >
+adev->gmc.real_vram_size - reserved_for_pt))) {
ret = -ENOMEM;
goto release;
}
@@ -166,7 +166,10 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
/* Update memory accounting by decreasing available system
 * memory, TTM memory and GPU memory as computed above
 */
-   adev->kfd.vram_used += vram_needed;
+   WARN_ONCE(vram_needed && !adev,
+ "adev reference can't be null when vram is used");
+   if (adev)
+   adev->kfd.vram_used += vram_needed;
kfd_mem_limit.system_mem_used += system_mem_needed;
kfd_mem_limit.ttm_mem_used += ttm_mem_needed;
 
@@ -175,7 +178,7 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
return ret;
 }
 
-static void unreserve_mem_limit(struct amdgpu_device *adev,
+void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag)
 {
spin_lock(&kfd_mem_limit.mem_limit_lock);
@@ -184,7 +187,10 @@ static void unreserve_mem_limit(struct amdgpu_device *adev,
kfd_mem_limit.system_mem_used -= size;
kfd_mem_limit.ttm_mem_used -= size;
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
-   adev->kfd.vram_used -= size;
+   WARN_ONCE(!adev,
+ "adev reference can't be null when alloc mem flags 
vram is set");
+   if (adev)
+   adev->kfd.vram_used -= size;
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
kfd_mem_limit.system_mem_used -= size;
} else if (!(alloc_flag &
@@ -193,11 +199,8 @@ static void unreserve_mem_limit(struct amdgpu_device *adev,
pr_err("%s: Invalid BO type %#x\n", __func__, alloc_flag);
goto release;
}
-
-   WARN_ONCE(adev->kfd.vram_used < 0,
+   WARN_ONCE(adev && adev->kfd.vram_used < 0,
  "KFD VRAM memory accounting unbalanced");
-   WARN_ONCE(kfd_mem_limit.ttm_mem_used < 0,
- "KFD TTM memory accounting unbalanced");
WARN_ONCE(kfd_mem_limit.system_mem_used < 0,
  "KFD system memory accounting unbalanced");
 
@@ -211,7 +214,7 @@ void amdgpu_amdkfd_release_notify(struct amd

[PATCH 1/2] drm/amdgpu: remove acc_size from reserve/unreserve mem

2022-06-07 Thread Alex Sierra
TTM used to track the "acc_size" of all BOs internally. We needed to
keep track of it in our memory reservation to avoid TTM running out
of memory in its own accounting. However, that "acc_size" accounting
has since been removed from TTM. Therefore we don't really need to
track it any more.

Signed-off-by: Alex Sierra 
Reviewed-by: Philip Yang 
Reviewed-by: Felix Kuehling 
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 59 ++-
 1 file changed, 17 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 054e4a76ae2e..50730d2132a6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -108,21 +108,12 @@ void amdgpu_amdkfd_reserve_system_mem(uint64_t size)
  * compromise that should work in most cases without reserving too
  * much memory for page tables unnecessarily (factor 16K, >> 14).
  */
-#define ESTIMATE_PT_SIZE(mem_size) ((mem_size) >> 14)
-
-static size_t amdgpu_amdkfd_acc_size(uint64_t size)
-{
-   size >>= PAGE_SHIFT;
-   size *= sizeof(dma_addr_t) + sizeof(void *);
 
-   return __roundup_pow_of_two(sizeof(struct amdgpu_bo)) +
-   __roundup_pow_of_two(sizeof(struct ttm_tt)) +
-   PAGE_ALIGN(size);
-}
+#define ESTIMATE_PT_SIZE(mem_size) ((mem_size) >> 14)
 
 /**
  * amdgpu_amdkfd_reserve_mem_limit() - Decrease available memory by size
- * of buffer including any reserved for control structures
+ * of buffer.
  *
  * @adev: Device to which allocated BO belongs to
 * @size: Size of buffer, in bytes, encapsulated by BO. This should be
@@ -136,28 +127,22 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
 {
uint64_t reserved_for_pt =
ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
-   size_t acc_size, system_mem_needed, ttm_mem_needed, vram_needed;
+   size_t system_mem_needed, ttm_mem_needed, vram_needed;
int ret = 0;
 
-   acc_size = amdgpu_amdkfd_acc_size(size);
-
+   system_mem_needed = 0;
+   ttm_mem_needed = 0;
vram_needed = 0;
if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
-   system_mem_needed = acc_size + size;
-   ttm_mem_needed = acc_size + size;
+   system_mem_needed = size;
+   ttm_mem_needed = size;
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
-   system_mem_needed = acc_size;
-   ttm_mem_needed = acc_size;
vram_needed = size;
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
-   system_mem_needed = acc_size + size;
-   ttm_mem_needed = acc_size;
-   } else if (alloc_flag &
-  (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
-   KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) {
-   system_mem_needed = acc_size;
-   ttm_mem_needed = acc_size;
-   } else {
+   system_mem_needed = size;
+   } else if (!(alloc_flag &
+   (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
+KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))) {
pr_err("%s: Invalid BO type %#x\n", __func__, alloc_flag);
return -ENOMEM;
}
@@ -193,28 +178,18 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
 static void unreserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag)
 {
-   size_t acc_size;
-
-   acc_size = amdgpu_amdkfd_acc_size(size);
-
spin_lock(&kfd_mem_limit.mem_limit_lock);
 
if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
-   kfd_mem_limit.system_mem_used -= (acc_size + size);
-   kfd_mem_limit.ttm_mem_used -= (acc_size + size);
+   kfd_mem_limit.system_mem_used -= size;
+   kfd_mem_limit.ttm_mem_used -= size;
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
-   kfd_mem_limit.system_mem_used -= acc_size;
-   kfd_mem_limit.ttm_mem_used -= acc_size;
adev->kfd.vram_used -= size;
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
-   kfd_mem_limit.system_mem_used -= (acc_size + size);
-   kfd_mem_limit.ttm_mem_used -= acc_size;
-   } else if (alloc_flag &
-  (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
-   KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) {
-   kfd_mem_limit.system_mem_used -= acc_size;
-   kfd_mem_limit.ttm_mem_used -= acc_size;
-   } else {
+   kfd_mem_limit.system_mem_used -= size;
+   } else if (!(alloc_flag &
+   (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
+KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))) 

[PATCH v5 12/13] tools: add hmm gup tests for device coherent type

2022-05-31 Thread Alex Sierra
The intention is to test the hmm device coherent type under different
get_user_pages paths. Also, test gup with the FOLL_LONGTERM flag set
on device coherent pages. These pages should get migrated back to
system memory.

Signed-off-by: Alex Sierra 
Reviewed-by: Alistair Popple 
---
 tools/testing/selftests/vm/hmm-tests.c | 105 +
 1 file changed, 105 insertions(+)

diff --git a/tools/testing/selftests/vm/hmm-tests.c 
b/tools/testing/selftests/vm/hmm-tests.c
index 4b547188ec40..3295c8bf6c63 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -36,6 +36,7 @@
  * in the usual include/uapi/... directory.
  */
 #include "../../../../lib/test_hmm_uapi.h"
+#include "../../../../mm/gup_test.h"
 
 struct hmm_buffer {
void*ptr;
@@ -59,6 +60,8 @@ enum {
 #define NTIMES 10
 
 #define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1)))
+/* Just the flags we need, copied from mm.h: */
+#define FOLL_WRITE 0x01 /* check pte is writable */
 
 FIXTURE(hmm)
 {
@@ -1764,4 +1767,106 @@ TEST_F(hmm, exclusive_cow)
hmm_buffer_free(buffer);
 }
 
+static int gup_test_exec(int gup_fd, unsigned long addr,
+int cmd, int npages, int size)
+{
+   struct gup_test gup = {
+   .nr_pages_per_call  = npages,
+   .addr   = addr,
+   .gup_flags  = FOLL_WRITE,
+   .size   = size,
+   };
+
+   if (ioctl(gup_fd, cmd, &gup)) {
+   perror("ioctl on error\n");
+   return errno;
+   }
+
+   return 0;
+}
+
+/*
+ * Test get user device pages through gup_test. Setting PIN_LONGTERM flag.
+ * This should trigger a migration back to system memory for both, private
+ * and coherent type pages.
+ * This test makes use of gup_test module. Make sure GUP_TEST_CONFIG is added
+ * to your configuration before you run it.
+ */
+TEST_F(hmm, hmm_gup_test)
+{
+   struct hmm_buffer *buffer;
+   int gup_fd;
+   unsigned long npages;
+   unsigned long size;
+   unsigned long i;
+   int *ptr;
+   int ret;
+   unsigned char *m;
+
+   gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
+   if (gup_fd == -1)
+   SKIP(return, "Skipping test, could not find gup_test driver");
+
+   npages = 3;
+   size = npages << self->page_shift;
+
+   buffer = malloc(sizeof(*buffer));
+   ASSERT_NE(buffer, NULL);
+
+   buffer->fd = -1;
+   buffer->size = size;
+   buffer->mirror = malloc(size);
+   ASSERT_NE(buffer->mirror, NULL);
+
+   buffer->ptr = mmap(NULL, size,
+  PROT_READ | PROT_WRITE,
+  MAP_PRIVATE | MAP_ANONYMOUS,
+  buffer->fd, 0);
+   ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+   /* Initialize buffer in system memory. */
+   for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+   ptr[i] = i;
+
+   /* Migrate memory to device. */
+   ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+   ASSERT_EQ(ret, 0);
+   ASSERT_EQ(buffer->cpages, npages);
+   /* Check what the device read. */
+   for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+   ASSERT_EQ(ptr[i], i);
+
+   ASSERT_EQ(gup_test_exec(gup_fd,
+   (unsigned long)buffer->ptr,
+   GUP_BASIC_TEST, 1, self->page_size), 0);
+   ASSERT_EQ(gup_test_exec(gup_fd,
+   (unsigned long)buffer->ptr + 1 * 
self->page_size,
+   GUP_FAST_BENCHMARK, 1, self->page_size), 0);
+   ASSERT_EQ(gup_test_exec(gup_fd,
+   (unsigned long)buffer->ptr + 2 * 
self->page_size,
+   PIN_LONGTERM_BENCHMARK, 1, self->page_size), 0);
+
+   /* Take snapshot to CPU pagetables */
+   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+   ASSERT_EQ(ret, 0);
+   ASSERT_EQ(buffer->cpages, npages);
+   m = buffer->mirror;
+   if (hmm_is_coherent_type(variant->device_number)) {
+   ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | 
HMM_DMIRROR_PROT_WRITE, m[0]);
+   ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | 
HMM_DMIRROR_PROT_WRITE, m[1]);
+   } else {
+   ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[0]);
+   ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[1]);
+   }
+   ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[2]);
+   /*
+* Check again the content on the pages. Make sure there's no
+* corrupted data.
+*/
+   for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+   ASSERT_EQ(ptr[i], i);
+
+   close(gup_fd);
+   hmm_buffer_free(buffer);
+}
 TEST_HARNESS_MAIN
-- 
2.32.0



[PATCH v5 13/13] tools: add selftests to hmm for COW in device memory

2022-05-31 Thread Alex Sierra
The objective is to test the device migration mechanism in pages marked
as COW, for private and coherent device types. In case of writing to
COW private page(s), a page fault will migrate the pages back to system
memory first. Then, these pages will be duplicated. In case of the COW
device coherent type, pages are duplicated directly from device
memory.

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
---
 tools/testing/selftests/vm/hmm-tests.c | 80 ++
 1 file changed, 80 insertions(+)

diff --git a/tools/testing/selftests/vm/hmm-tests.c 
b/tools/testing/selftests/vm/hmm-tests.c
index 3295c8bf6c63..2da9d5baf339 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -1869,4 +1869,84 @@ TEST_F(hmm, hmm_gup_test)
close(gup_fd);
hmm_buffer_free(buffer);
 }
+
+/*
+ * Test copy-on-write in device pages.
+ * In case of writing to COW private page(s), a page fault will migrate pages
+ * back to system memory first. Then, these pages will be duplicated. In case
+ * of COW device coherent type, pages are duplicated directly from device
+ * memory.
+ */
+TEST_F(hmm, hmm_cow_in_device)
+{
+   struct hmm_buffer *buffer;
+   unsigned long npages;
+   unsigned long size;
+   unsigned long i;
+   int *ptr;
+   int ret;
+   unsigned char *m;
+   pid_t pid;
+   int status;
+
+   npages = 4;
+   size = npages << self->page_shift;
+
+   buffer = malloc(sizeof(*buffer));
+   ASSERT_NE(buffer, NULL);
+
+   buffer->fd = -1;
+   buffer->size = size;
+   buffer->mirror = malloc(size);
+   ASSERT_NE(buffer->mirror, NULL);
+
+   buffer->ptr = mmap(NULL, size,
+  PROT_READ | PROT_WRITE,
+  MAP_PRIVATE | MAP_ANONYMOUS,
+  buffer->fd, 0);
+   ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+   /* Initialize buffer in system memory. */
+   for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+   ptr[i] = i;
+
+   /* Migrate memory to device. */
+
+   ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+   ASSERT_EQ(ret, 0);
+   ASSERT_EQ(buffer->cpages, npages);
+
+   pid = fork();
+   if (pid == -1)
+   ASSERT_EQ(pid, 0);
+   if (!pid) {
+   /* Child process waits for SIGTERM from the parent. */
+   while (1) {
+   }
+   perror("Should not reach this\n");
+   exit(0);
+   }
+   /* Parent process writes to COW page(s) and gets a
+* new copy in system. In case of device private pages,
+* this write causes a migration to system mem first.
+*/
+   for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+   ptr[i] = i;
+
+   /* Terminate child and wait */
+   EXPECT_EQ(0, kill(pid, SIGTERM));
+   EXPECT_EQ(pid, waitpid(pid, &status, 0));
+   EXPECT_NE(0, WIFSIGNALED(status));
+   EXPECT_EQ(SIGTERM, WTERMSIG(status));
+
+   /* Take snapshot to CPU pagetables */
+   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+   ASSERT_EQ(ret, 0);
+   ASSERT_EQ(buffer->cpages, npages);
+   m = buffer->mirror;
+   for (i = 0; i < npages; i++)
+   ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[i]);
+
+   hmm_buffer_free(buffer);
+}
 TEST_HARNESS_MAIN
-- 
2.32.0



[PATCH v5 11/13] tools: update test_hmm script to support SP config

2022-05-31 Thread Alex Sierra
Add two more parameters to set the spm_addr_dev0 & spm_addr_dev1
addresses. These two parameters configure the start SP
addresses for each device in the test_hmm driver.
Consequently, this configures the zone device type as coherent.

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
---
 tools/testing/selftests/vm/test_hmm.sh | 24 +---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/vm/test_hmm.sh 
b/tools/testing/selftests/vm/test_hmm.sh
index 0647b525a625..539c9371e592 100755
--- a/tools/testing/selftests/vm/test_hmm.sh
+++ b/tools/testing/selftests/vm/test_hmm.sh
@@ -40,11 +40,26 @@ check_test_requirements()
 
 load_driver()
 {
-   modprobe $DRIVER > /dev/null 2>&1
+   if [ $# -eq 0 ]; then
+   modprobe $DRIVER > /dev/null 2>&1
+   else
+   if [ $# -eq 2 ]; then
+   modprobe $DRIVER spm_addr_dev0=$1 spm_addr_dev1=$2
+   > /dev/null 2>&1
+   else
+   echo "Missing module parameters. Make sure pass"\
+   "spm_addr_dev0 and spm_addr_dev1"
+   usage
+   fi
+   fi
if [ $? == 0 ]; then
major=$(awk "\$2==\"HMM_DMIRROR\" {print \$1}" /proc/devices)
mknod /dev/hmm_dmirror0 c $major 0
mknod /dev/hmm_dmirror1 c $major 1
+   if [ $# -eq 2 ]; then
+   mknod /dev/hmm_dmirror2 c $major 2
+   mknod /dev/hmm_dmirror3 c $major 3
+   fi
fi
 }
 
@@ -58,7 +73,7 @@ run_smoke()
 {
echo "Running smoke test. Note, this test provides basic coverage."
 
-   load_driver
+   load_driver $1 $2
$(dirname "${BASH_SOURCE[0]}")/hmm-tests
unload_driver
 }
@@ -75,6 +90,9 @@ usage()
echo "# Smoke testing"
echo "./${TEST_NAME}.sh smoke"
echo
+   echo "# Smoke testing with SPM enabled"
+   echo "./${TEST_NAME}.sh smoke  "
+   echo
exit 0
 }
 
@@ -84,7 +102,7 @@ function run_test()
usage
else
if [ "$1" = "smoke" ]; then
-   run_smoke
+   run_smoke $2 $3
else
usage
fi
-- 
2.32.0



[PATCH v5 10/13] tools: update hmm-test to support device coherent type

2022-05-31 Thread Alex Sierra
Test cases such as migrate_fault and migrate_multiple were modified to
migrate explicitly from device to system memory, without the need for
page faults, when using the device coherent type.

The snapshot test case was updated to read the memory device type first
and, based on that, check the proper returned results. The
migrate_ping_pong test case was added to test explicit migration from
device to system memory for both private and coherent zone types.

Helpers to migrate from device to system memory and vice versa
were also added.

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
---
 tools/testing/selftests/vm/hmm-tests.c | 121 -
 1 file changed, 100 insertions(+), 21 deletions(-)

diff --git a/tools/testing/selftests/vm/hmm-tests.c 
b/tools/testing/selftests/vm/hmm-tests.c
index 203323967b50..4b547188ec40 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -46,6 +46,13 @@ struct hmm_buffer {
uint64_tfaults;
 };
 
+enum {
+   HMM_PRIVATE_DEVICE_ONE,
+   HMM_PRIVATE_DEVICE_TWO,
+   HMM_COHERENCE_DEVICE_ONE,
+   HMM_COHERENCE_DEVICE_TWO,
+};
+
 #define TWOMEG (1 << 21)
 #define HMM_BUFFER_SIZE (1024 << 12)
 #define HMM_PATH_MAX64
@@ -60,6 +67,21 @@ FIXTURE(hmm)
unsigned intpage_shift;
 };
 
+FIXTURE_VARIANT(hmm)
+{
+   int device_number;
+};
+
+FIXTURE_VARIANT_ADD(hmm, hmm_device_private)
+{
+   .device_number = HMM_PRIVATE_DEVICE_ONE,
+};
+
+FIXTURE_VARIANT_ADD(hmm, hmm_device_coherent)
+{
+   .device_number = HMM_COHERENCE_DEVICE_ONE,
+};
+
 FIXTURE(hmm2)
 {
int fd0;
@@ -68,6 +90,24 @@ FIXTURE(hmm2)
unsigned intpage_shift;
 };
 
+FIXTURE_VARIANT(hmm2)
+{
+   int device_number0;
+   int device_number1;
+};
+
+FIXTURE_VARIANT_ADD(hmm2, hmm2_device_private)
+{
+   .device_number0 = HMM_PRIVATE_DEVICE_ONE,
+   .device_number1 = HMM_PRIVATE_DEVICE_TWO,
+};
+
+FIXTURE_VARIANT_ADD(hmm2, hmm2_device_coherent)
+{
+   .device_number0 = HMM_COHERENCE_DEVICE_ONE,
+   .device_number1 = HMM_COHERENCE_DEVICE_TWO,
+};
+
 static int hmm_open(int unit)
 {
char pathname[HMM_PATH_MAX];
@@ -81,12 +121,19 @@ static int hmm_open(int unit)
return fd;
 }
 
+static bool hmm_is_coherent_type(int dev_num)
+{
+   return (dev_num >= HMM_COHERENCE_DEVICE_ONE);
+}
+
 FIXTURE_SETUP(hmm)
 {
self->page_size = sysconf(_SC_PAGE_SIZE);
self->page_shift = ffs(self->page_size) - 1;
 
-   self->fd = hmm_open(0);
+   self->fd = hmm_open(variant->device_number);
+   if (self->fd < 0 && hmm_is_coherent_type(variant->device_number))
+   SKIP(exit(0), "DEVICE_COHERENT not available");
ASSERT_GE(self->fd, 0);
 }
 
@@ -95,9 +142,11 @@ FIXTURE_SETUP(hmm2)
self->page_size = sysconf(_SC_PAGE_SIZE);
self->page_shift = ffs(self->page_size) - 1;
 
-   self->fd0 = hmm_open(0);
+   self->fd0 = hmm_open(variant->device_number0);
+   if (self->fd0 < 0 && hmm_is_coherent_type(variant->device_number0))
+   SKIP(exit(0), "DEVICE_COHERENT not available");
ASSERT_GE(self->fd0, 0);
-   self->fd1 = hmm_open(1);
+   self->fd1 = hmm_open(variant->device_number1);
ASSERT_GE(self->fd1, 0);
 }
 
@@ -211,6 +260,20 @@ static void hmm_nanosleep(unsigned int n)
nanosleep(&t, NULL);
 }
 
+static int hmm_migrate_sys_to_dev(int fd,
+  struct hmm_buffer *buffer,
+  unsigned long npages)
+{
+   return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_DEV, buffer, npages);
+}
+
+static int hmm_migrate_dev_to_sys(int fd,
+  struct hmm_buffer *buffer,
+  unsigned long npages)
+{
+   return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_SYS, buffer, npages);
+}
+
 /*
  * Simple NULL test of device open/close.
  */
@@ -875,7 +938,7 @@ TEST_F(hmm, migrate)
ptr[i] = i;
 
/* Migrate memory to device. */
-   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);
+   ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
ASSERT_EQ(ret, 0);
ASSERT_EQ(buffer->cpages, npages);
 
@@ -923,7 +986,7 @@ TEST_F(hmm, migrate_fault)
ptr[i] = i;
 
/* Migrate memory to device. */
-   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);
+   ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
ASSERT_EQ(ret, 0);
ASSERT_EQ(buffer->cpages, npages);
 
@@ -936,7 +999,7 @@ TEST_F(hmm, migrate_fault)
ASSERT_EQ(ptr[i], i);
 
/* Migrate memory to the device again. */
-   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRA

[PATCH v5 09/13] lib: add support for device coherent type in test_hmm

2022-05-31 Thread Alex Sierra
Device Coherent type uses device memory that is coherently accessible by
the CPU. This could be shown as SP (special purpose) memory range
at the BIOS-e820 memory enumeration. If no SP memory is supported in
the system, this could be faked by setting CONFIG_EFI_FAKE_MEMMAP.

Currently, test_hmm only supports two different SP ranges of at least
256MB size. This could be specified in the kernel parameter variable
efi_fake_mem. Ex. Two SP ranges of 1GB starting at 0x100000000 &
0x140000000 physical address. Ex.
efi_fake_mem=1G@0x100000000:0x40000,1G@0x140000000:0x40000

Private and coherent device mirror instances can be created in the same
probe. This is done by passing the module parameters spm_addr_dev0 &
spm_addr_dev1. In this case, it will create four instances of
device_mirror. The first two correspond to private device type, the
last two to coherent type. Then, they can be easily accessed from user
space through /dev/hmm_mirror<num_device>. Usually num_device 0 and 1
are for private, and 2 and 3 for coherent types. If no module
parameters are passed, only two instances of private type device_mirror
will be created.
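
From user space, selecting the coherent instance is then just a matter
of opening the matching minor (sketch; compare hmm_open() in the
hmm-tests patch of this series):

        /* minors 0/1: device private; minors 2/3: device coherent */
        int fd = open("/dev/hmm_dmirror2", O_RDWR);

        if (fd < 0)
                return -1;      /* DEVICE_COHERENT not available */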

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
---
 lib/test_hmm.c  | 253 +---
 lib/test_hmm_uapi.h |   4 +
 2 files changed, 196 insertions(+), 61 deletions(-)

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index afb30af9f3ff..7930853e7fc5 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -32,11 +32,22 @@
 
 #include "test_hmm_uapi.h"
 
-#define DMIRROR_NDEVICES   2
+#define DMIRROR_NDEVICES   4
 #define DMIRROR_RANGE_FAULT_TIMEOUT1000
 #define DEVMEM_CHUNK_SIZE  (256 * 1024 * 1024U)
 #define DEVMEM_CHUNKS_RESERVE  16
 
+/*
+ * For device_private pages, dpage is just a dummy struct page
+ * representing a piece of device memory. dmirror_devmem_alloc_page
+ * allocates a real system memory page as backing storage to fake a
+ * real device. zone_device_data points to that backing page. But
+ * for device_coherent memory, the struct page represents real
+ * physical CPU-accessible memory that we can use directly.
+ */
+#define BACKING_PAGE(page) (is_device_private_page((page)) ? \
+  (page)->zone_device_data : (page))
+
 static unsigned long spm_addr_dev0;
 module_param(spm_addr_dev0, long, 0644);
 MODULE_PARM_DESC(spm_addr_dev0,
@@ -125,6 +136,21 @@ static int dmirror_bounce_init(struct dmirror_bounce 
*bounce,
return 0;
 }
 
+static bool dmirror_is_private_zone(struct dmirror_device *mdevice)
+{
+   return (mdevice->zone_device_type ==
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? true : false;
+}
+
+static enum migrate_vma_direction
+dmirror_select_device(struct dmirror *dmirror)
+{
+   return (dmirror->mdevice->zone_device_type ==
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ?
+   MIGRATE_VMA_SELECT_DEVICE_PRIVATE :
+   MIGRATE_VMA_SELECT_DEVICE_COHERENT;
+}
+
 static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
 {
vfree(bounce->ptr);
@@ -575,16 +601,19 @@ static int dmirror_allocate_chunk(struct dmirror_device 
*mdevice,
 static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
 {
struct page *dpage = NULL;
-   struct page *rpage;
+   struct page *rpage = NULL;
 
/*
-* This is a fake device so we alloc real system memory to store
-* our device memory.
+* For ZONE_DEVICE private type, this is a fake device so we allocate
+* real system memory to store our device memory.
+* For ZONE_DEVICE coherent type we use the actual dpage to store the
+* data and ignore rpage.
 */
-   rpage = alloc_page(GFP_HIGHUSER);
-   if (!rpage)
-   return NULL;
-
+   if (dmirror_is_private_zone(mdevice)) {
+   rpage = alloc_page(GFP_HIGHUSER);
+   if (!rpage)
+   return NULL;
+   }
spin_lock(>lock);
 
if (mdevice->free_pages) {
@@ -603,7 +632,8 @@ static struct page *dmirror_devmem_alloc_page(struct 
dmirror_device *mdevice)
return dpage;
 
 error:
-   __free_page(rpage);
+   if (rpage)
+   __free_page(rpage);
return NULL;
 }
 
@@ -629,12 +659,16 @@ static void dmirror_migrate_alloc_and_copy(struct 
migrate_vma *args,
 * unallocated pte_none() or read-only zero page.
 */
spage = migrate_pfn_to_page(*src);
+   if (WARN(spage && is_zone_device_page(spage),
+"page already in device spage pfn: 0x%lx\n",
+page_to_pfn(spage)))
+   continue;
 
dpage = dmirror_devmem_alloc_page(mdevice);
if (!dpage)
continue;
 
-   rpage = dpage->

[PATCH v5 03/13] mm: add device coherent vma selection for memory migration

2022-05-31 Thread Alex Sierra
This case is used to migrate pages from device memory back to system
memory. Device coherent type memory is cache coherent from the device
and CPU point of view.
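
Driver-side, selecting device-coherent pages for migration back to
system memory then looks roughly like the sketch below (compare
svm_migrate_vma_to_ram() in the amdkfd patch of this series):

        migrate.vma = vma;
        migrate.start = start;
        migrate.end = end;
        migrate.pgmap_owner = pgmap_owner;      /* only own pages match */
        migrate.flags = MIGRATE_VMA_SELECT_DEVICE_COHERENT;
        ret = migrate_vma_setup(&migrate);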

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
---
 include/linux/migrate.h |  1 +
 mm/migrate_device.c | 12 +---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 069a89e847f3..b84908debe5c 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -148,6 +148,7 @@ static inline unsigned long migrate_pfn(unsigned long pfn)
 enum migrate_vma_direction {
MIGRATE_VMA_SELECT_SYSTEM = 1 << 0,
MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1,
+   MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2,
 };
 
 struct migrate_vma {
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index a4847ad65da3..18bc6483f63a 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -148,15 +148,21 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
if (is_writable_device_private_entry(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {
-   if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
-   goto next;
pfn = pte_pfn(pte);
-   if (is_zero_pfn(pfn)) {
+   if (is_zero_pfn(pfn) &&
+   (migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
mpfn = MIGRATE_PFN_MIGRATE;
migrate->cpages++;
goto next;
}
page = vm_normal_page(migrate->vma, addr, pte);
+   if (page && !is_zone_device_page(page) &&
+   !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
+   goto next;
+   else if (page && is_device_coherent_page(page) &&
+   (!(migrate->flags & 
MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
+page->pgmap->owner != migrate->pgmap_owner))
+   goto next;
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
-- 
2.32.0



[PATCH v5 06/13] drm/amdkfd: add SPM support for SVM

2022-05-31 Thread Alex Sierra
When the CPU is connected through XGMI, it has coherent
access to the VRAM resource. In this case that resource
is taken from a table in the device gmc aperture base.
This resource is used along with the device type, which could
be DEVICE_PRIVATE or DEVICE_COHERENT, to create the device
page map region.
Also, the MIGRATE_VMA_SELECT_DEVICE_COHERENT flag is selected for
the coherent type case during migration to device.

Signed-off-by: Alex Sierra 
Reviewed-by: Felix Kuehling 
Signed-off-by: Christoph Hellwig 
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 34 +++-
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 997650d597ec..39b8c4710caf 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -671,13 +671,15 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct 
svm_range *prange,
migrate.vma = vma;
migrate.start = start;
migrate.end = end;
-   migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
migrate.pgmap_owner = SVM_ADEV_PGMAP_OWNER(adev);
+   if (adev->gmc.xgmi.connected_to_cpu)
+   migrate.flags = MIGRATE_VMA_SELECT_DEVICE_COHERENT;
+   else
+   migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
 
buf = kvcalloc(npages,
   2 * sizeof(*migrate.src) + sizeof(uint64_t) + 
sizeof(dma_addr_t),
   GFP_KERNEL);
-
if (!buf)
goto out;
 
@@ -947,7 +949,7 @@ int svm_migrate_init(struct amdgpu_device *adev)
 {
struct kfd_dev *kfddev = adev->kfd.dev;
struct dev_pagemap *pgmap;
-   struct resource *res;
+   struct resource *res = NULL;
unsigned long size;
void *r;
 
@@ -962,28 +964,34 @@ int svm_migrate_init(struct amdgpu_device *adev)
 * should remove reserved size
 */
size = ALIGN(adev->gmc.real_vram_size, 2ULL << 20);
-   res = devm_request_free_mem_region(adev->dev, &iomem_resource, size);
-   if (IS_ERR(res))
-   return -ENOMEM;
+   if (adev->gmc.xgmi.connected_to_cpu) {
+   pgmap->range.start = adev->gmc.aper_base;
+   pgmap->range.end = adev->gmc.aper_base + adev->gmc.aper_size - 
1;
+   pgmap->type = MEMORY_DEVICE_COHERENT;
+   } else {
+   res = devm_request_free_mem_region(adev->dev, &iomem_resource, 
size);
+   if (IS_ERR(res))
+   return -ENOMEM;
+   pgmap->range.start = res->start;
+   pgmap->range.end = res->end;
+   pgmap->type = MEMORY_DEVICE_PRIVATE;
+   }
 
-   pgmap->type = MEMORY_DEVICE_PRIVATE;
pgmap->nr_range = 1;
-   pgmap->range.start = res->start;
-   pgmap->range.end = res->end;
pgmap->ops = &svm_migrate_pgmap_ops;
pgmap->owner = SVM_ADEV_PGMAP_OWNER(adev);
-   pgmap->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
-
+   pgmap->flags = 0;
/* Device manager releases device-specific resources, memory region and
 * pgmap when driver disconnects from device.
 */
r = devm_memremap_pages(adev->dev, pgmap);
if (IS_ERR(r)) {
pr_err("failed to register HMM device memory\n");
-
/* Disable SVM support capability */
pgmap->type = 0;
-   devm_release_mem_region(adev->dev, res->start, 
resource_size(res));
+   if (pgmap->type == MEMORY_DEVICE_PRIVATE)
+   devm_release_mem_region(adev->dev, res->start,
+   res->end - res->start + 1);
return PTR_ERR(r);
}
 
-- 
2.32.0



[PATCH v5 07/13] lib: test_hmm add ioctl to get zone device type

2022-05-31 Thread Alex Sierra
A new ioctl cmd is added to query the zone device type. This will be
used once test_hmm adds the zone device coherent type.

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
---
 lib/test_hmm.c  | 11 +--
 lib/test_hmm_uapi.h | 14 ++
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index cfe632047839..915ef6b5b0d4 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -87,6 +87,7 @@ struct dmirror_chunk {
 struct dmirror_device {
struct cdev cdevice;
struct hmm_devmem   *devmem;
+   unsigned intzone_device_type;
 
unsigned intdevmem_capacity;
unsigned intdevmem_count;
@@ -1260,14 +1261,20 @@ static void dmirror_device_remove(struct dmirror_device 
*mdevice)
 static int __init hmm_dmirror_init(void)
 {
int ret;
-   int id;
+   int id = 0;
+   int ndevices = 0;
 
ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES,
  "HMM_DMIRROR");
if (ret)
goto err_unreg;
 
-   for (id = 0; id < DMIRROR_NDEVICES; id++) {
+   memset(dmirror_devices, 0, DMIRROR_NDEVICES * 
sizeof(dmirror_devices[0]));
+   dmirror_devices[ndevices++].zone_device_type =
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
+   dmirror_devices[ndevices++].zone_device_type =
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
+   for (id = 0; id < ndevices; id++) {
ret = dmirror_device_init(dmirror_devices + id, id);
if (ret)
goto err_chrdev;
diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h
index f14dea5dcd06..0511af7464ee 100644
--- a/lib/test_hmm_uapi.h
+++ b/lib/test_hmm_uapi.h
@@ -31,10 +31,11 @@ struct hmm_dmirror_cmd {
 /* Expose the address space of the calling process through hmm device file */
 #define HMM_DMIRROR_READ   _IOWR('H', 0x00, struct hmm_dmirror_cmd)
 #define HMM_DMIRROR_WRITE  _IOWR('H', 0x01, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_MIGRATE_IOWR('H', 0x02, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_SNAPSHOT   _IOWR('H', 0x03, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_EXCLUSIVE  _IOWR('H', 0x04, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_CHECK_EXCLUSIVE_IOWR('H', 0x05, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_MIGRATE_TO_DEV _IOWR('H', 0x02, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_MIGRATE_TO_SYS _IOWR('H', 0x03, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_SNAPSHOT   _IOWR('H', 0x04, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_EXCLUSIVE  _IOWR('H', 0x05, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_CHECK_EXCLUSIVE_IOWR('H', 0x06, struct hmm_dmirror_cmd)
 
 /*
  * Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT.
@@ -62,4 +63,9 @@ enum {
HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE = 0x30,
 };
 
+enum {
+   /* 0 is reserved to catch uninitialized type fields */
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE = 1,
+};
+
 #endif /* _LIB_TEST_HMM_UAPI_H */
-- 
2.32.0



[PATCH v5 04/13] mm: remove the vma check in migrate_vma_setup()

2022-05-31 Thread Alex Sierra
From: Alistair Popple 

migrate_vma_setup() checks that a valid vma is passed so that the page
tables can be walked to find the pfns associated with a given address
range. However, in some cases the pfns are already known, such as when
migrating device coherent pages during pin_user_pages(), meaning a valid
vma isn't required.
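
This lets a caller that already holds the device page skip the walk
entirely, roughly as sketched below (the gup pinning patch later in
this series uses this shape to migrate a single device-coherent page):

        unsigned long src_pfn, dst_pfn = 0;
        struct migrate_vma args = {};

        src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE;
        args.src = &src_pfn;
        args.dst = &dst_pfn;
        args.cpages = 1;        /* caller fills what the walk would have */
        args.npages = 1;
        args.vma = NULL;        /* pfn already known, no page-table walk */
        ret = migrate_vma_setup(&args);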

Signed-off-by: Alistair Popple 
Acked-by: Felix Kuehling 
Signed-off-by: Christoph Hellwig 
---
 mm/migrate_device.c | 34 +-
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 18bc6483f63a..cf9668376c5a 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -486,24 +486,24 @@ int migrate_vma_setup(struct migrate_vma *args)
 
args->start &= PAGE_MASK;
args->end &= PAGE_MASK;
-   if (!args->vma || is_vm_hugetlb_page(args->vma) ||
-   (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
-   return -EINVAL;
-   if (nr_pages <= 0)
-   return -EINVAL;
-   if (args->start < args->vma->vm_start ||
-   args->start >= args->vma->vm_end)
-   return -EINVAL;
-   if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
-   return -EINVAL;
if (!args->src || !args->dst)
return -EINVAL;
-
-   memset(args->src, 0, sizeof(*args->src) * nr_pages);
-   args->cpages = 0;
-   args->npages = 0;
-
-   migrate_vma_collect(args);
+   if (args->vma) {
+   if (is_vm_hugetlb_page(args->vma) ||
+   (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
+   return -EINVAL;
+   if (args->start < args->vma->vm_start ||
+   args->start >= args->vma->vm_end)
+   return -EINVAL;
+   if (args->end <= args->vma->vm_start ||
+   args->end > args->vma->vm_end)
+   return -EINVAL;
+   memset(args->src, 0, sizeof(*args->src) * nr_pages);
+   args->cpages = 0;
+   args->npages = 0;
+
+   migrate_vma_collect(args);
+   }
 
if (args->cpages)
migrate_vma_unmap(args);
@@ -685,7 +685,7 @@ void migrate_vma_pages(struct migrate_vma *migrate)
continue;
}
 
-   if (!page) {
+   if (!page && migrate->vma) {
if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
continue;
if (!notified) {
-- 
2.32.0



[PATCH v5 08/13] lib: test_hmm add module param for zone device type

2022-05-31 Thread Alex Sierra
In order to configure device coherent in test_hmm, two module
parameters should be passed, which correspond to the SP start address
of each of the two devices: spm_addr_dev0 & spm_addr_dev1. If no
parameters are passed, the private device type is configured.

Signed-off-by: Alex Sierra 
Acked-by: Felix Kuehling 
Reviewed-by: Alistair Popple 
Signed-off-by: Christoph Hellwig 
---
 lib/test_hmm.c  | 73 -
 lib/test_hmm_uapi.h |  1 +
 2 files changed, 53 insertions(+), 21 deletions(-)

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 915ef6b5b0d4..afb30af9f3ff 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -37,6 +37,16 @@
 #define DEVMEM_CHUNK_SIZE  (256 * 1024 * 1024U)
 #define DEVMEM_CHUNKS_RESERVE  16
 
+static unsigned long spm_addr_dev0;
+module_param(spm_addr_dev0, long, 0644);
+MODULE_PARM_DESC(spm_addr_dev0,
+   "Specify start address for SPM (special purpose memory) used 
for device 0. By setting this Coherent device type will be used. Make sure 
spm_addr_dev1 is set too. Minimum SPM size should be DEVMEM_CHUNK_SIZE.");
+
+static unsigned long spm_addr_dev1;
+module_param(spm_addr_dev1, long, 0644);
+MODULE_PARM_DESC(spm_addr_dev1,
+   "Specify start address for SPM (special purpose memory) used 
for device 1. By setting this Coherent device type will be used. Make sure 
spm_addr_dev0 is set too. Minimum SPM size should be DEVMEM_CHUNK_SIZE.");
+
 static const struct dev_pagemap_ops dmirror_devmem_ops;
 static const struct mmu_interval_notifier_ops dmirror_min_ops;
 static dev_t dmirror_dev;
@@ -455,28 +465,44 @@ static int dmirror_write(struct dmirror *dmirror, struct 
hmm_dmirror_cmd *cmd)
return ret;
 }
 
-static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
+static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
   struct page **ppage)
 {
struct dmirror_chunk *devmem;
-   struct resource *res;
+   struct resource *res = NULL;
unsigned long pfn;
unsigned long pfn_first;
unsigned long pfn_last;
void *ptr;
+   int ret = -ENOMEM;
 
devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
if (!devmem)
-   return false;
+   return ret;
 
-   res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
- "hmm_dmirror");
-   if (IS_ERR(res))
+   switch (mdevice->zone_device_type) {
+   case HMM_DMIRROR_MEMORY_DEVICE_PRIVATE:
+   res = request_free_mem_region(&iomem_resource, 
DEVMEM_CHUNK_SIZE,
+ "hmm_dmirror");
+   if (IS_ERR_OR_NULL(res))
+   goto err_devmem;
+   devmem->pagemap.range.start = res->start;
+   devmem->pagemap.range.end = res->end;
+   devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
+   break;
+   case HMM_DMIRROR_MEMORY_DEVICE_COHERENT:
+   devmem->pagemap.range.start = (MINOR(mdevice->cdevice.dev) - 2) 
?
+   spm_addr_dev0 :
+   spm_addr_dev1;
+   devmem->pagemap.range.end = devmem->pagemap.range.start +
+   DEVMEM_CHUNK_SIZE - 1;
+   devmem->pagemap.type = MEMORY_DEVICE_COHERENT;
+   break;
+   default:
+   ret = -EINVAL;
goto err_devmem;
+   }
 
-   devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
-   devmem->pagemap.range.start = res->start;
-   devmem->pagemap.range.end = res->end;
devmem->pagemap.nr_range = 1;
devmem->pagemap.ops = &dmirror_devmem_ops;
devmem->pagemap.owner = mdevice;
@@ -497,10 +523,14 @@ static bool dmirror_allocate_chunk(struct dmirror_device 
*mdevice,
mdevice->devmem_capacity = new_capacity;
mdevice->devmem_chunks = new_chunks;
}
-
ptr = memremap_pages(&devmem->pagemap, numa_node_id());
-   if (IS_ERR(ptr))
+   if (IS_ERR_OR_NULL(ptr)) {
+   if (ptr)
+   ret = PTR_ERR(ptr);
+   else
+   ret = -EFAULT;
goto err_release;
+   }
 
devmem->mdevice = mdevice;
pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT;
@@ -529,15 +559,17 @@ static bool dmirror_allocate_chunk(struct dmirror_device 
*mdevice,
}
spin_unlock(>lock);
 
-   return true;
+   return 0;
 
 err_release:
mutex_unlock(&mdevice->devmem_lock);
-   release_mem_region(devmem->pagemap.range.start, 
range_len(&devmem->pagemap.range));
+   if (res && devmem->pagemap.type == MEMORY_DEVICE_PRIVATE)
+   release_mem_region(devme
