[PATCH] drm/amdgpu: GC_9.4.3 requires at least 280MB TMR

2023-07-30 Thread Amber Lin
On GC_9.4.3, if atombios reports a TMR size of less than 280MB, the firmware
area will be overwritten by driver or user application use. Remove the
!adev->bios condition: since reserve_size is initialized as 0, it will fall
into the else if (!reserve_size) condition.

Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 7c6dd3de1867..fa5721b3139c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1718,7 +1718,7 @@ static int amdgpu_ttm_reserve_tmr(struct amdgpu_device 
*adev)
reserve_size =
amdgpu_atomfirmware_get_fw_reserved_fb_size(adev);
 
-   if (!adev->bios && adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3))
+   if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3))
reserve_size = max(reserve_size, (uint32_t)280 << 20);
else if (!reserve_size)
reserve_size = DISCOVERY_TMR_OFFSET;
-- 
2.25.1



[PATCH] drm/amdkfd: Remove unused entries in table

2021-11-18 Thread Amber Lin
Remove unused entries in kfd_device_info table: num_xgmi_sdma_engines
and num_sdma_queues_per_engine. They are calculated in
kfd_get_num_sdma_engines and kfd_get_num_xgmi_sdma_engines instead.

Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c | 58 -
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h   |  2 -
 2 files changed, 60 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 3fea47e37c17..e1294fba0c26 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -68,8 +68,6 @@ static const struct kfd_device_info kaveri_device_info = {
.supports_cwsr = false,
.needs_iommu_device = true,
.needs_pci_atomics = false,
-   .num_sdma_engines = 2,
-   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 
@@ -87,8 +85,6 @@ static const struct kfd_device_info carrizo_device_info = {
.supports_cwsr = true,
.needs_iommu_device = true,
.needs_pci_atomics = false,
-   .num_sdma_engines = 2,
-   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 
@@ -105,8 +101,6 @@ static const struct kfd_device_info raven_device_info = {
.supports_cwsr = true,
.needs_iommu_device = true,
.needs_pci_atomics = true,
-   .num_sdma_engines = 1,
-   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 #endif
@@ -126,8 +120,6 @@ static const struct kfd_device_info hawaii_device_info = {
.supports_cwsr = false,
.needs_iommu_device = false,
.needs_pci_atomics = false,
-   .num_sdma_engines = 2,
-   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 #endif
@@ -145,8 +137,6 @@ static const struct kfd_device_info tonga_device_info = {
.supports_cwsr = false,
.needs_iommu_device = false,
.needs_pci_atomics = true,
-   .num_sdma_engines = 2,
-   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 
@@ -163,8 +153,6 @@ static const struct kfd_device_info fiji_device_info = {
.supports_cwsr = true,
.needs_iommu_device = false,
.needs_pci_atomics = true,
-   .num_sdma_engines = 2,
-   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 
@@ -181,8 +169,6 @@ static const struct kfd_device_info fiji_vf_device_info = {
.supports_cwsr = true,
.needs_iommu_device = false,
.needs_pci_atomics = false,
-   .num_sdma_engines = 2,
-   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 
@@ -200,8 +186,6 @@ static const struct kfd_device_info polaris10_device_info = 
{
.supports_cwsr = true,
.needs_iommu_device = false,
.needs_pci_atomics = true,
-   .num_sdma_engines = 2,
-   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 
@@ -218,8 +202,6 @@ static const struct kfd_device_info 
polaris10_vf_device_info = {
.supports_cwsr = true,
.needs_iommu_device = false,
.needs_pci_atomics = false,
-   .num_sdma_engines = 2,
-   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 
@@ -236,8 +218,6 @@ static const struct kfd_device_info polaris11_device_info = 
{
.supports_cwsr = true,
.needs_iommu_device = false,
.needs_pci_atomics = true,
-   .num_sdma_engines = 2,
-   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 
@@ -254,8 +234,6 @@ static const struct kfd_device_info polaris12_device_info = 
{
.supports_cwsr = true,
.needs_iommu_device = false,
.needs_pci_atomics = true,
-   .num_sdma_engines = 2,
-   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 
@@ -272,8 +250,6 @@ static const struct kfd_device_info vegam_device_info = {
.supports_cwsr = true,
.needs_iommu_device = false,
.needs_pci_atomics = true,
-   .num_sdma_engines = 2,
-   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 
@@ -290,8 +266,6 @@ static const struct kfd_device_info vega10_device_info = {
.supports_cwsr = true,
.needs_iommu_device = false,
.needs_pci_atomics = false,
-   .num_sdma_engines = 2,
-   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 
@@ -308,8 +282,6 @@ static const struct kfd_device_info vega10_vf_device_info = 
{
.supports_cwsr = true,
.needs_iommu_device = false,
.needs_pci_atomics = false,
-   .num_sdma_engines = 2,
-   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 
@@ -326,8 +298,6 @@ static const struct kfd_device_info vega12_device_info = {
.supports_cwsr = true,
.needs_iommu_device = false,
.needs_pci_atomics = false,
-   .num_sdma_engines = 2

[PATCH v2] drm/amdkfd: Retrieve SDMA numbers from amdgpu

2021-11-17 Thread Amber Lin
Instead of hard coding the number of sdma engines and the number of
sdma_xgmi engines in the device_info table, get the number of total SDMA
instances from amdgpu. The first two engines are sdma engines and the
rest are sdma-xgmi engines unless the ASIC doesn't support XGMI.

v2: add kfd_ prefix to non static function names

Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c   | 20 
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 32 +++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c |  4 +--
 4 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index ce9f4e562bac..3fea47e37c17 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -1516,6 +1516,26 @@ void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, 
uint64_t throttle_bitmask)
kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask);
 }
 
+/* kfd_get_num_sdma_engines returns the number of PCIe optimized SDMA and
+ * kfd_get_num_xgmi_sdma_engines returns the number of XGMI SDMA.
+ * When the device has more than two engines, we reserve two for PCIe to enable
+ * full-duplex and the rest are used as XGMI.
+ */
+unsigned int kfd_get_num_sdma_engines(struct kfd_dev *kdev)
+{
+   /* If XGMI is not supported, all SDMA engines are PCIe */
+   if (!kdev->adev->gmc.xgmi.supported)
+   return kdev->adev->sdma.num_instances;
+
+   return min(kdev->adev->sdma.num_instances, 2);
+}
+
+unsigned int kfd_get_num_xgmi_sdma_engines(struct kfd_dev *kdev)
+{
+   /* After reserved for PCIe, the rest of engines are XGMI */
+   return kdev->adev->sdma.num_instances - kfd_get_num_sdma_engines(kdev);
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 /* This function will send a package to HIQ to hang the HWS
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 62fe28244a80..2af2b3268171 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -99,31 +99,22 @@ unsigned int get_pipes_per_mec(struct device_queue_manager 
*dqm)
return dqm->dev->shared_resources.num_pipe_per_mec;
 }
 
-static unsigned int get_num_sdma_engines(struct device_queue_manager *dqm)
-{
-   return dqm->dev->device_info->num_sdma_engines;
-}
-
-static unsigned int get_num_xgmi_sdma_engines(struct device_queue_manager *dqm)
-{
-   return dqm->dev->device_info->num_xgmi_sdma_engines;
-}
-
 static unsigned int get_num_all_sdma_engines(struct device_queue_manager *dqm)
 {
-   return get_num_sdma_engines(dqm) + get_num_xgmi_sdma_engines(dqm);
+   return kfd_get_num_sdma_engines(dqm->dev) +
+   kfd_get_num_xgmi_sdma_engines(dqm->dev);
 }
 
 unsigned int get_num_sdma_queues(struct device_queue_manager *dqm)
 {
-   return dqm->dev->device_info->num_sdma_engines
-   * dqm->dev->device_info->num_sdma_queues_per_engine;
+   return kfd_get_num_sdma_engines(dqm->dev) *
+   dqm->dev->device_info->num_sdma_queues_per_engine;
 }
 
 unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm)
 {
-   return dqm->dev->device_info->num_xgmi_sdma_engines
-   * dqm->dev->device_info->num_sdma_queues_per_engine;
+   return kfd_get_num_xgmi_sdma_engines(dqm->dev) *
+   dqm->dev->device_info->num_sdma_queues_per_engine;
 }
 
 void program_sh_mem_settings(struct device_queue_manager *dqm,
@@ -1054,9 +1045,9 @@ static int allocate_sdma_queue(struct 
device_queue_manager *dqm,
dqm->sdma_bitmap &= ~(1ULL << bit);
q->sdma_id = bit;
q->properties.sdma_engine_id = q->sdma_id %
-   get_num_sdma_engines(dqm);
+   kfd_get_num_sdma_engines(dqm->dev);
q->properties.sdma_queue_id = q->sdma_id /
-   get_num_sdma_engines(dqm);
+   kfd_get_num_sdma_engines(dqm->dev);
} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
if (dqm->xgmi_sdma_bitmap == 0) {
pr_err("No more XGMI SDMA queue to allocate\n");
@@ -1071,10 +1062,11 @@ static int allocate_sdma_queue(struct 
device_queue_manager *dqm,
 * assumes the first N engines are always
 * PCIe-optimized ones
 */
-   q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
-   q->sdma_id % get_num_xgmi_sdma_engines(dqm);
+   q->properties.

[PATCH] drm/amdkfd: Retrieve SDMA numbers from amdgpu

2021-11-17 Thread Amber Lin
Instead of hard coding the number of sdma engines and the number of
sdma_xgmi engines in the device_info table, get the number of total SDMA
instances from amdgpu. The first two engines are sdma engines and the
rest are sdma-xgmi engines unless the ASIC doesn't support XGMI.

v2: Move get_num_*_sdma_engines to global and shared by queues manager
and topology.
v3: Use gmc.xgmi.supported to justify the SDMA PCIe/XGMI assignment

Signed-off-by: Amber Lin 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c   | 20 
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 31 +++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c |  5 ++-
 4 files changed, 36 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index ce9f4e562bac..ec1f6bacb61e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -1516,6 +1516,26 @@ void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, 
uint64_t throttle_bitmask)
kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask);
 }
 
+/* get_num_sdma_engines returns the number of PCIe optimized SDMA and
+ * get_num_xgmi_sdma_engines returns the number of XGMI SDMA.
+ * When the device has more than two engines, we reserve two for PCIe to enable
+ * full-duplex and the rest are used as XGMI.
+ */
+unsigned int get_num_sdma_engines(struct kfd_dev *kdev)
+{
+   /* If XGMI is not supported, all SDMA engines are PCIe */
+   if (!kdev->adev->gmc.xgmi.supported)
+   return kdev->adev->sdma.num_instances;
+
+   return min(kdev->adev->sdma.num_instances, 2);
+}
+
+unsigned int get_num_xgmi_sdma_engines(struct kfd_dev *kdev)
+{
+   /* After reserved for PCIe, the rest of engines are XGMI */
+   return kdev->adev->sdma.num_instances - get_num_sdma_engines(kdev);
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 /* This function will send a package to HIQ to hang the HWS
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 62fe28244a80..5f2886cf4d7e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -99,31 +99,22 @@ unsigned int get_pipes_per_mec(struct device_queue_manager 
*dqm)
return dqm->dev->shared_resources.num_pipe_per_mec;
 }
 
-static unsigned int get_num_sdma_engines(struct device_queue_manager *dqm)
-{
-   return dqm->dev->device_info->num_sdma_engines;
-}
-
-static unsigned int get_num_xgmi_sdma_engines(struct device_queue_manager *dqm)
-{
-   return dqm->dev->device_info->num_xgmi_sdma_engines;
-}
-
 static unsigned int get_num_all_sdma_engines(struct device_queue_manager *dqm)
 {
-   return get_num_sdma_engines(dqm) + get_num_xgmi_sdma_engines(dqm);
+   return get_num_sdma_engines(dqm->dev) +
+   get_num_xgmi_sdma_engines(dqm->dev);
 }
 
 unsigned int get_num_sdma_queues(struct device_queue_manager *dqm)
 {
-   return dqm->dev->device_info->num_sdma_engines
-   * dqm->dev->device_info->num_sdma_queues_per_engine;
+   return get_num_sdma_engines(dqm->dev) *
+   dqm->dev->device_info->num_sdma_queues_per_engine;
 }
 
 unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm)
 {
-   return dqm->dev->device_info->num_xgmi_sdma_engines
-   * dqm->dev->device_info->num_sdma_queues_per_engine;
+   return get_num_xgmi_sdma_engines(dqm->dev) *
+   dqm->dev->device_info->num_sdma_queues_per_engine;
 }
 
 void program_sh_mem_settings(struct device_queue_manager *dqm,
@@ -1054,9 +1045,9 @@ static int allocate_sdma_queue(struct 
device_queue_manager *dqm,
dqm->sdma_bitmap &= ~(1ULL << bit);
q->sdma_id = bit;
q->properties.sdma_engine_id = q->sdma_id %
-   get_num_sdma_engines(dqm);
+   get_num_sdma_engines(dqm->dev);
q->properties.sdma_queue_id = q->sdma_id /
-   get_num_sdma_engines(dqm);
+   get_num_sdma_engines(dqm->dev);
} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
if (dqm->xgmi_sdma_bitmap == 0) {
pr_err("No more XGMI SDMA queue to allocate\n");
@@ -1071,10 +1062,10 @@ static int allocate_sdma_queue(struct 
device_queue_manager *dqm,
 * assumes the first N engines are always
 * PCIe-optimized ones
 */
-   q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
-   

Re: [PATCH] drm/amdkfd: CWSR with sw scheduler on Aldebaran and Arcturus

2021-08-20 Thread Amber Lin

Reviewed-by: Amber Lin 


On 8/20/21 3:11 PM, Mukul Joshi wrote:

Program trap handler settings to enable CWSR with software scheduler
on Aldebaran and Arcturus.

Signed-off-by: Mukul Joshi 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c | 1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c  | 3 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c| 2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h| 2 ++
  4 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index a5434b713856..46cd4ee6bafb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -44,4 +44,5 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.get_atc_vmid_pasid_mapping_info =
kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
.set_vm_context_page_table_base = 
kgd_gfx_v9_set_vm_context_page_table_base,
+   .program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings
  };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index 6409d6b1b2df..5a7f680bcb3f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -305,5 +305,6 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
.set_vm_context_page_table_base =
kgd_gfx_v9_set_vm_context_page_table_base,
-   .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy
+   .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
+   .program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings
  };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index 154244916727..bcc1cbeb8799 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -882,7 +882,7 @@ void kgd_gfx_v9_get_cu_occupancy(struct kgd_dev *kgd, int 
pasid,
adev->gfx.cu_info.max_waves_per_simd;
  }
  
-static void kgd_gfx_v9_program_trap_handler_settings(struct kgd_dev *kgd,

+void kgd_gfx_v9_program_trap_handler_settings(struct kgd_dev *kgd,
  uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr)
  {
struct amdgpu_device *adev = get_amdgpu_device(kgd);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
index e64deba8900f..c63591106879 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
@@ -65,3 +65,5 @@ void kgd_gfx_v9_set_vm_context_page_table_base(struct kgd_dev 
*kgd,
uint32_t vmid, uint64_t page_table_base);
  void kgd_gfx_v9_get_cu_occupancy(struct kgd_dev *kgd, int pasid,
int *pasid_wave_cnt, int *max_waves_per_cu);
+void kgd_gfx_v9_program_trap_handler_settings(struct kgd_dev *kgd,
+   uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr);


[PATCH] drm/amdkfd: Fix circular lock in nocpsch path

2021-06-15 Thread Amber Lin
Calling free_mqd inside of destroy_queue_nocpsch_locked can cause a
circular lock. destroy_queue_nocpsch_locked is called under a DQM lock,
which is taken in MMU notifiers, potentially in FS reclaim context.
Taking another lock, which is BO reservation lock from free_mqd, while
causing an FS reclaim inside the DQM lock creates a problematic circular
lock dependency. Therefore move free_mqd out of
destroy_queue_nocpsch_locked and call it after unlocking DQM.

Signed-off-by: Amber Lin 
Reviewed-by: Felix Kuehling 
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c  | 18 +-
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 72bea5278add..c069fa259b30 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -486,9 +486,6 @@ static int destroy_queue_nocpsch_locked(struct 
device_queue_manager *dqm,
if (retval == -ETIME)
qpd->reset_wavefronts = true;
 
-
-   mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
-
list_del(&q->list);
if (list_empty(&qpd->queues_list)) {
if (qpd->reset_wavefronts) {
@@ -523,6 +520,8 @@ static int destroy_queue_nocpsch(struct 
device_queue_manager *dqm,
int retval;
uint64_t sdma_val = 0;
struct kfd_process_device *pdd = qpd_to_pdd(qpd);
+   struct mqd_manager *mqd_mgr =
+   dqm->mqd_mgrs[get_mqd_type_from_queue_type(q->properties.type)];
 
/* Get the SDMA queue stats */
if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
@@ -540,6 +539,8 @@ static int destroy_queue_nocpsch(struct 
device_queue_manager *dqm,
pdd->sdma_past_activity_counter += sdma_val;
dqm_unlock(dqm);
 
+   mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
+
return retval;
 }
 
@@ -1629,7 +1630,7 @@ static bool set_cache_memory_policy(struct 
device_queue_manager *dqm,
 static int process_termination_nocpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
 {
-   struct queue *q, *next;
+   struct queue *q;
struct device_process_node *cur, *next_dpn;
int retval = 0;
bool found = false;
@@ -1637,12 +1638,19 @@ static int process_termination_nocpsch(struct 
device_queue_manager *dqm,
dqm_lock(dqm);
 
/* Clear all user mode queues */
-   list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
+   while (!list_empty(&qpd->queues_list)) {
+   struct mqd_manager *mqd_mgr;
int ret;
 
+   q = list_first_entry(&qpd->queues_list, struct queue, list);
+   mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+   q->properties.type)];
ret = destroy_queue_nocpsch_locked(dqm, qpd, q);
if (ret)
retval = ret;
+   dqm_unlock(dqm);
+   mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
+   dqm_lock(dqm);
}
 
/* Unregister process */
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdkfd: Avoid null pointer in SMI event

2021-03-30 Thread Amber Lin
Power Management IP is initialized/enabled before KFD init. When thermal
throttling happens before kfd_smi_init is done, calling the KFD SMI update
function causes a stack dump by dereferencing a NULL pointer (the
smi_clients list). Check if kfd_init is completed before calling the
function.

Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 72c893fff61a..3cd46d7190b3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -1297,7 +1297,7 @@ void kfd_dec_compute_active(struct kfd_dev *kfd)
 
 void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask)
 {
-   if (kfd)
+   if (kfd && kfd->init_complete)
kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask);
 }
 
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH v2] drm/amdkfd: Add thermal throttling SMI event

2020-07-23 Thread Amber Lin




On 2020-07-23 5:41 p.m., Joshi, Mukul wrote:

[AMD Official Use Only - Internal Distribution Only]



-Original Message-
From: Lin, Amber 
Sent: Thursday, July 23, 2020 5:27 PM
To: Joshi, Mukul ; amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix 
Subject: Re: [PATCH v2] drm/amdkfd: Add thermal throttling SMI event



On 2020-07-22 12:08 p.m., Mukul Joshi wrote:

Add support for reporting thermal throttling events through SMI.
Also, add a counter to count the number of throttling interrupts
observed and report the count in the SMI event message.

Signed-off-by: Mukul Joshi 
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|  4 ++
   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  1 +
   drivers/gpu/drm/amd/amdkfd/kfd_device.c   |  7 ++
   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c   | 68 ++-
   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h   |  2 +
   drivers/gpu/drm/amd/powerplay/amdgpu_smu.c|  1 +
   drivers/gpu/drm/amd/powerplay/arcturus_ppt.c  |  1 +
   .../gpu/drm/amd/powerplay/inc/amdgpu_smu.h|  1 +
   drivers/gpu/drm/amd/powerplay/smu_v11_0.c |  5 ++
   include/uapi/linux/kfd_ioctl.h|  3 +-
   10 files changed, 75 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 1b865fed74ca..19e4658756d8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -755,4 +755,8 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void 
*ih_ring_entry)
   void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
   {
   }
+
+void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t
+throttle_bitmask) { }
   #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 3f2b695cf19e..e8b0258aae24 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -269,5 +269,6 @@ int kgd2kfd_resume_mm(struct mm_struct *mm);
   int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
   struct dma_fence *fence);
   void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
+void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t
+throttle_bitmask);
   
   #endif /* AMDGPU_AMDKFD_H_INCLUDED */ diff --git

a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 4bfedaab183f..d5e790f046b4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -29,6 +29,7 @@
   #include "cwsr_trap_handler.h"
   #include "kfd_iommu.h"
   #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
   
   #define MQD_SIZE_ALIGNED 768
   
@@ -1245,6 +1246,12 @@ void kfd_dec_compute_active(struct kfd_dev *kfd)

WARN_ONCE(count < 0, "Compute profile ref. count error");
   }
   
+void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t

+throttle_bitmask) {
+   if (kfd)
+   kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask); 
}
+
   #if defined(CONFIG_DEBUG_FS)
   
   /* This function will send a package to HIQ to hang the HWS diff

--git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index 7b348bf9df21..00c90b47155b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -24,6 +24,7 @@
   #include 
   #include 
   #include 
+#include "amdgpu.h"
   #include "amdgpu_vm.h"
   #include "kfd_priv.h"
   #include "kfd_smi_events.h"
@@ -148,6 +149,55 @@ static int kfd_smi_ev_release(struct inode *inode, struct 
file *filep)
return 0;
   }
   
+static void add_event_to_kfifo(struct kfd_dev *dev, unsigned long long smi_event,

+ char *event_msg, int len)
+{
+   struct kfd_smi_client *client;
+
+   rcu_read_lock();
+
+   list_for_each_entry_rcu(client, &dev->smi_clients, list) {
+   if (!(READ_ONCE(client->events) & smi_event))
+   continue;
+   spin_lock(&client->lock);
+   if (kfifo_avail(&client->fifo) >= len) {
+   kfifo_in(&client->fifo, event_msg, len);
+   wake_up_all(&client->wait_queue);
+   } else {
+   pr_debug("smi_event(EventID: %llu): no space left\n",
+   smi_event);
+   }
+   spin_unlock(&client->lock);
+   }
+
+   rcu_read_unlock();
+}
+
+void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
+uint32_t throttle_bitmask)
+{
+   struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
+   /*
+* ThermalThrottle msg = gpu_id(4):throttle_bitmask(4):

gpu_id is not needed. The user calls the ioctl with the GPU specified and KFD
returns an anonymous fd. Reading from this anon_fd already identifies the GPU.

I agree with you. But I 

Re: [PATCH v2] drm/amdkfd: Add thermal throttling SMI event

2020-07-23 Thread Amber Lin




On 2020-07-22 12:08 p.m., Mukul Joshi wrote:

Add support for reporting thermal throttling events through SMI.
Also, add a counter to count the number of throttling interrupts
observed and report the count in the SMI event message.

Signed-off-by: Mukul Joshi 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|  4 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  1 +
  drivers/gpu/drm/amd/amdkfd/kfd_device.c   |  7 ++
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c   | 68 ++-
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h   |  2 +
  drivers/gpu/drm/amd/powerplay/amdgpu_smu.c|  1 +
  drivers/gpu/drm/amd/powerplay/arcturus_ppt.c  |  1 +
  .../gpu/drm/amd/powerplay/inc/amdgpu_smu.h|  1 +
  drivers/gpu/drm/amd/powerplay/smu_v11_0.c |  5 ++
  include/uapi/linux/kfd_ioctl.h|  3 +-
  10 files changed, 75 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 1b865fed74ca..19e4658756d8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -755,4 +755,8 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void 
*ih_ring_entry)
  void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
  {
  }
+
+void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask)
+{
+}
  #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 3f2b695cf19e..e8b0258aae24 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -269,5 +269,6 @@ int kgd2kfd_resume_mm(struct mm_struct *mm);
  int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
   struct dma_fence *fence);
  void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
+void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t 
throttle_bitmask);
  
  #endif /* AMDGPU_AMDKFD_H_INCLUDED */

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 4bfedaab183f..d5e790f046b4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -29,6 +29,7 @@
  #include "cwsr_trap_handler.h"
  #include "kfd_iommu.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
  
  #define MQD_SIZE_ALIGNED 768
  
@@ -1245,6 +1246,12 @@ void kfd_dec_compute_active(struct kfd_dev *kfd)

WARN_ONCE(count < 0, "Compute profile ref. count error");
  }
  
+void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask)

+{
+   if (kfd)
+   kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask);
+}
+
  #if defined(CONFIG_DEBUG_FS)
  
  /* This function will send a package to HIQ to hang the HWS

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index 7b348bf9df21..00c90b47155b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -24,6 +24,7 @@
  #include 
  #include 
  #include 
+#include "amdgpu.h"
  #include "amdgpu_vm.h"
  #include "kfd_priv.h"
  #include "kfd_smi_events.h"
@@ -148,6 +149,55 @@ static int kfd_smi_ev_release(struct inode *inode, struct 
file *filep)
return 0;
  }
  
+static void add_event_to_kfifo(struct kfd_dev *dev, unsigned long long smi_event,

+ char *event_msg, int len)
+{
+   struct kfd_smi_client *client;
+
+   rcu_read_lock();
+
+   list_for_each_entry_rcu(client, &dev->smi_clients, list) {
+   if (!(READ_ONCE(client->events) & smi_event))
+   continue;
+   spin_lock(&client->lock);
+   if (kfifo_avail(&client->fifo) >= len) {
+   kfifo_in(&client->fifo, event_msg, len);
+   wake_up_all(&client->wait_queue);
+   } else {
+   pr_debug("smi_event(EventID: %llu): no space left\n",
+   smi_event);
+   }
+   spin_unlock(&client->lock);
+   }
+
+   rcu_read_unlock();
+}
+
+void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
+uint32_t throttle_bitmask)
+{
+   struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
+   /*
+* ThermalThrottle msg = gpu_id(4):throttle_bitmask(4):
gpu_id is not needed. The user calls the ioctl with the GPU specified and KFD
returns an anonymous fd. Reading from this anon_fd already identifies the GPU.

+*   thermal_interrupt_count(8):
+* 16 bytes event + 1 byte space + 4 bytes gpu_id + 1 byte : +
+* 4 byte throttle_bitmask + 1 byte : +
+* 8 byte thermal_interupt_counter + 1 byte \n = 36
+*/
+   char fifo_in[36];
+   int len;
+
+   if (list_empty(&dev->smi_clients))
+   return;
+
+   len = 

[PATCH] drm/amdkfd: Provide SMI events watch

2020-05-13 Thread Amber Lin
When the compute is malfunctioning or performance drops, the system admin
will use the SMI (System Management Interface) tool to monitor/diagnose what
went wrong. This patch provides an event watch interface for user space
to register devices and subscribe to the events they are interested in. After
registering, the user can use the anonymous file descriptor's poll function
with a wait time specified and wait for events to happen. Once an event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

v2: - remove UNREGISTER and add event ENABLE/DISABLE
- correct kfifo usage
- move event message API to kfd_ioctl.h
v3: send the event msg in text than in binary
v4: support multiple clients
v5: move events enablement from ioctl to fd write

Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdkfd/Makefile  |   1 +
 drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  18 ++
 drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   7 +
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|   4 +
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 214 +++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  29 +++
 include/uapi/linux/kfd_ioctl.h   |  16 +-
 9 files changed, 292 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..e1e4115 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,6 +53,7 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_dbgdev.o \
$(AMDKFD_PATH)/kfd_dbgmgr.o \
+   $(AMDKFD_PATH)/kfd_smi_events.o \
$(AMDKFD_PATH)/kfd_crat.o
 
 ifneq ($(CONFIG_AMD_IOMMU_V2),)
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
 #include "kfd_events.h"
 #include "cik_int.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static bool cik_event_interrupt_isr(struct kfd_dev *dev,
const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
struct kfd_vm_fault_info info;
 
+   kfd_smi_event_update_vmfault(dev, pasid);
kfd_process_vm_fault(dev->dqm, pasid);
 
memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index cf0017f..e9b96ad 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
 #include "kfd_device_queue_manager.h"
 #include "kfd_dbgmgr.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -1740,6 +1741,20 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
return r;
 }
 
+/* Handle requests for watching SMI events */
+static int kfd_ioctl_smi_events(struct file *filep,
+   struct kfd_process *p, void *data)
+{
+   struct kfd_ioctl_smi_events_args *args = data;
+   struct kfd_dev *dev;
+
+   dev = kfd_device_by_id(args->gpuid);
+   if (!dev)
+   return -EINVAL;
+
+   return kfd_smi_event_open(dev, &args->anon_fd);
+}
+
 #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
.cmd_drv = 0, .name = #ioctl}
@@ -1835,6 +1850,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
kfd_ioctl_alloc_queue_gws, 0),
+
+   AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
+   kfd_ioctl_smi_events, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 0491ab2..2c030c2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -586,6 +586,11 @@ static int kfd_gws_init(struct kfd_dev *kfd)
return ret;
 }
 
+static void kfd_smi_init(struct kfd_dev *dev) {
+   INIT_LIST_HEAD(>smi_

[PATCH v7 1/2] drm/amdkfd: Provide SMI events watch

2020-04-18 Thread Amber Lin
When the compute is malfunctioning or performance drops, the system admin
will use the SMI (System Management Interface) tool to monitor/diagnose what
went wrong. This patch provides an event watch interface for user space
to register devices and subscribe to the events they are interested in.
Once registered, the user can call poll() on the anonymous file descriptor
with a wait time specified and wait for events to happen. Once an event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

v2: - remove UNREGISTER and add event ENABLE/DISABLE
- correct kfifo usage
- move event message API to kfd_ioctl.h
v3: send the event msg in text rather than in binary
v4: support multiple clients
v5: move events enablement from ioctl to fd write

Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdkfd/Makefile  |   1 +
 drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  18 ++
 drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   7 +
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|   4 +
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 210 +++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  29 
 include/uapi/linux/kfd_ioctl.h   |  16 +-
 9 files changed, 288 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..e1e4115 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,6 +53,7 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_dbgdev.o \
$(AMDKFD_PATH)/kfd_dbgmgr.o \
+   $(AMDKFD_PATH)/kfd_smi_events.o \
$(AMDKFD_PATH)/kfd_crat.o
 
 ifneq ($(CONFIG_AMD_IOMMU_V2),)
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
 #include "kfd_events.h"
 #include "cik_int.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static bool cik_event_interrupt_isr(struct kfd_dev *dev,
const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
struct kfd_vm_fault_info info;
 
+   kfd_smi_event_update_vmfault(dev, pasid);
kfd_process_vm_fault(dev->dqm, pasid);
 
memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f8fa03a..2baaaec 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
 #include "kfd_device_queue_manager.h"
 #include "kfd_dbgmgr.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -1732,6 +1733,20 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
return r;
 }
 
+/* Handle requests for watching SMI events */
+static int kfd_ioctl_smi_events(struct file *filep,
+   struct kfd_process *p, void *data)
+{
+   struct kfd_ioctl_smi_events_args *args = data;
+   struct kfd_dev *dev;
+
+   dev = kfd_device_by_id(args->gpuid);
+   if (!dev)
+   return -EINVAL;
+
+   return kfd_smi_event_open(dev, &args->anon_fd);
+}
+
 #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
.cmd_drv = 0, .name = #ioctl}
@@ -1827,6 +1842,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
kfd_ioctl_alloc_queue_gws, 0),
+
+   AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
+   kfd_ioctl_smi_events, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 0491ab2..2c030c2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -586,6 +586,11 @@ static int kfd_gws_init(struct kfd_dev *kfd)
return ret;
 }
 
+static void kfd_smi_init(struct kfd_dev *dev) {
+   INIT_LIST_HEAD(>smi_

[PATCH v7 2/2] include/uapi/linux: Update KFD ioctl version

2020-04-18 Thread Amber Lin
Bump KFD ioctl after adding SMI events support

Signed-off-by: Amber Lin 
---
 include/uapi/linux/kfd_ioctl.h | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index ad33c18..46adbcc 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -26,8 +26,12 @@
 #include 
 #include 
 
+/*
+ * - 1.1 - initial version
+ * - 1.3 - Add SMI events support
+ */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 1
+#define KFD_IOCTL_MINOR_VERSION 3
 
 struct kfd_ioctl_get_version_args {
__u32 major_version;/* from KFD */
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH v6] drm/amdkfd: Provide SMI events watch

2020-04-17 Thread Amber Lin



On 2020-04-17 6:31 p.m., Felix Kuehling wrote:

Am 2020-04-17 um 4:07 p.m. schrieb Amber Lin:

When the compute is malfunctioning or performance drops, the system admin
will use the SMI (System Management Interface) tool to monitor/diagnose what
went wrong. This patch provides an event watch interface for user space
to register devices and subscribe to the events they are interested in.
Once registered, the user can call poll() on the anonymous file descriptor
with a wait time specified and wait for events to happen. Once an event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

v2: - remove UNREGISTER and add event ENABLE/DISABLE
 - correct kfifo usage
 - move event message API to kfd_ioctl.h
v3: send the event msg in text rather than in binary
v4: support multiple clients
v5: move events enablement from ioctl to fd write

Signed-off-by: Amber Lin 
---
  drivers/gpu/drm/amd/amdkfd/Makefile  |   1 +
  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  18 ++
  drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   7 +
  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h|   4 +
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 215 +++
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  29 +++
  include/uapi/linux/kfd_ioctl.h   |  16 +-
  9 files changed, 293 insertions(+), 1 deletion(-)
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..e1e4115 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,6 +53,7 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_dbgdev.o \
$(AMDKFD_PATH)/kfd_dbgmgr.o \
+   $(AMDKFD_PATH)/kfd_smi_events.o \
$(AMDKFD_PATH)/kfd_crat.o
  
  ifneq ($(CONFIG_AMD_IOMMU_V2),)

diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
  #include "kfd_events.h"
  #include "cik_int.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
  
  static bool cik_event_interrupt_isr(struct kfd_dev *dev,

const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
struct kfd_vm_fault_info info;
  
+		kfd_smi_event_update_vmfault(dev, pasid);

kfd_process_vm_fault(dev->dqm, pasid);
  
  		memset(&info, 0, sizeof(info));

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f8fa03a..2baaaec 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
  #include "kfd_device_queue_manager.h"
  #include "kfd_dbgmgr.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
  
  static long kfd_ioctl(struct file *, unsigned int, unsigned long);

  static int kfd_open(struct inode *, struct file *);
@@ -1732,6 +1733,20 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
return r;
  }
  
+/* Handle requests for watching SMI events */

+static int kfd_ioctl_smi_events(struct file *filep,
+   struct kfd_process *p, void *data)
+{
+   struct kfd_ioctl_smi_events_args *args = data;
+   struct kfd_dev *dev;
+
+   dev = kfd_device_by_id(args->gpuid);
+   if (!dev)
+   return -EINVAL;
+
+   return kfd_smi_event_open(dev, &args->anon_fd);
+}
+
  #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
.cmd_drv = 0, .name = #ioctl}
@@ -1827,6 +1842,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
  
  	AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,

kfd_ioctl_alloc_queue_gws, 0),
+
+   AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
+   kfd_ioctl_smi_events, 0),
  };
  
  #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 0491ab2..2c030c2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -586,6 +586,11 @@ static int kfd_gws_init(struct kfd_dev *kfd)
return ret;

[PATCH v6] drm/amdkfd: Provide SMI events watch

2020-04-17 Thread Amber Lin
When the compute is malfunctioning or performance drops, the system admin
will use the SMI (System Management Interface) tool to monitor/diagnose what
went wrong. This patch provides an event watch interface for user space
to register devices and subscribe to the events they are interested in.
Once registered, the user can call poll() on the anonymous file descriptor
with a wait time specified and wait for events to happen. Once an event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

v2: - remove UNREGISTER and add event ENABLE/DISABLE
- correct kfifo usage
- move event message API to kfd_ioctl.h
v3: send the event msg in text rather than in binary
v4: support multiple clients
v5: move events enablement from ioctl to fd write

Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdkfd/Makefile  |   1 +
 drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  18 ++
 drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   7 +
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|   4 +
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 215 +++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  29 +++
 include/uapi/linux/kfd_ioctl.h   |  16 +-
 9 files changed, 293 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..e1e4115 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,6 +53,7 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_dbgdev.o \
$(AMDKFD_PATH)/kfd_dbgmgr.o \
+   $(AMDKFD_PATH)/kfd_smi_events.o \
$(AMDKFD_PATH)/kfd_crat.o
 
 ifneq ($(CONFIG_AMD_IOMMU_V2),)
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
 #include "kfd_events.h"
 #include "cik_int.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static bool cik_event_interrupt_isr(struct kfd_dev *dev,
const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
struct kfd_vm_fault_info info;
 
+   kfd_smi_event_update_vmfault(dev, pasid);
kfd_process_vm_fault(dev->dqm, pasid);
 
memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f8fa03a..2baaaec 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
 #include "kfd_device_queue_manager.h"
 #include "kfd_dbgmgr.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -1732,6 +1733,20 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
return r;
 }
 
+/* Handle requests for watching SMI events */
+static int kfd_ioctl_smi_events(struct file *filep,
+   struct kfd_process *p, void *data)
+{
+   struct kfd_ioctl_smi_events_args *args = data;
+   struct kfd_dev *dev;
+
+   dev = kfd_device_by_id(args->gpuid);
+   if (!dev)
+   return -EINVAL;
+
+   return kfd_smi_event_open(dev, &args->anon_fd);
+}
+
 #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
.cmd_drv = 0, .name = #ioctl}
@@ -1827,6 +1842,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
kfd_ioctl_alloc_queue_gws, 0),
+
+   AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
+   kfd_ioctl_smi_events, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 0491ab2..2c030c2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -586,6 +586,11 @@ static int kfd_gws_init(struct kfd_dev *kfd)
return ret;
 }
 
+static void kfd_smi_init(struct kfd_dev *dev) {
+   INIT_LIST_HEAD(>smi_

[PATCH v5] drm/amdkfd: Provide SMI events watch

2020-04-15 Thread Amber Lin
When the compute is malfunctioning or performance drops, the system admin
will use the SMI (System Management Interface) tool to monitor/diagnose what
went wrong. This patch provides an event watch interface for user space
to register devices and subscribe to the events they are interested in.
Once registered, the user can call poll() on the anonymous file descriptor
with a wait time specified and wait for events to happen. Once an event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

v2: - remove UNREGISTER and add event ENABLE/DISABLE
- correct kfifo usage
- move event message API to kfd_ioctl.h
v3: send the event msg in text rather than in binary
v4: support multiple clients
v5: move events enablement from ioctl to fd write

Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdkfd/Makefile  |   1 +
 drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  18 +++
 drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   7 +
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|   4 +
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 191 +++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  29 
 include/uapi/linux/kfd_ioctl.h   |  16 +-
 9 files changed, 269 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..e1e4115 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,6 +53,7 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_dbgdev.o \
$(AMDKFD_PATH)/kfd_dbgmgr.o \
+   $(AMDKFD_PATH)/kfd_smi_events.o \
$(AMDKFD_PATH)/kfd_crat.o
 
 ifneq ($(CONFIG_AMD_IOMMU_V2),)
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
 #include "kfd_events.h"
 #include "cik_int.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static bool cik_event_interrupt_isr(struct kfd_dev *dev,
const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
struct kfd_vm_fault_info info;
 
+   kfd_smi_event_update_vmfault(dev, pasid);
kfd_process_vm_fault(dev->dqm, pasid);
 
memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f8fa03a..2baaaec 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
 #include "kfd_device_queue_manager.h"
 #include "kfd_dbgmgr.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -1732,6 +1733,20 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
return r;
 }
 
+/* Handle requests for watching SMI events */
+static int kfd_ioctl_smi_events(struct file *filep,
+   struct kfd_process *p, void *data)
+{
+   struct kfd_ioctl_smi_events_args *args = data;
+   struct kfd_dev *dev;
+
+   dev = kfd_device_by_id(args->gpuid);
+   if (!dev)
+   return -EINVAL;
+
+   return kfd_smi_event_open(dev, &args->anon_fd);
+}
+
 #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
.cmd_drv = 0, .name = #ioctl}
@@ -1827,6 +1842,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
kfd_ioctl_alloc_queue_gws, 0),
+
+   AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
+   kfd_ioctl_smi_events, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 0491ab2..2c030c2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -586,6 +586,11 @@ static int kfd_gws_init(struct kfd_dev *kfd)
return ret;
 }
 
+static void kfd_smi_init(struct kfd_dev *dev) {
+   INIT_LIST_HEAD(>smi_

Re: [PATCH v4] drm/amdkfd: Provide SMI events watch

2020-04-15 Thread Amber Lin
Thank you Felix. Now I understand that the problem with a global client ID 
is that it opens a hole for potential attackers. I didn't take that into 
consideration. I'll change that following your advice below.


Hi Alex,

Thank you for the link. It's helpful. I have a question regarding the 
versioning. One topic in the article talks about how the userspace can 
figure out if the new ioctl is supported in a given kernel. Is it 
correct that with dkms driver, we use the driver version coming from 
AMDGPU_VERSION in amdgpu_drv.c, and in upstream kernel we use the kernel 
version?


Thanks.

Amber

On 2020-04-14 11:03 p.m., Deucher, Alexander wrote:


[AMD Public Use]


Some good advice on getting ioctls right:
https://www.kernel.org/doc/html/v5.4-preprc-cpu/ioctl/botching-up-ioctls.html

Alex


*From:* amd-gfx  on behalf of 
Felix Kuehling 

*Sent:* Tuesday, April 14, 2020 10:40 PM
*To:* Lin, Amber ; amd-gfx@lists.freedesktop.org 


*Subject:* Re: [PATCH v4] drm/amdkfd: Provide SMI events watch

Hi Amber,

I understand that different processes can get the same FD. My 
statement about FD being unique is relative to one process.


The main problem with the global client ID is, that it allows process 
A to change the event mask of process B just by specifying process B's 
client ID. That can lead to denial of service attacks where process A 
can cause events not to be delivered to B or can flood process B with 
frequent events that it's not prepared to handle.


Therefore you must make the lookup of the client from the client ID 
not from a global list, but from a per-process list. That way process 
A can only change event masks of process A clients, and not those of 
any other process.


But if the client list is process-specific, you can use the FD as a 
unique identifier of the client within the process, so you don't need 
a separate client ID.


Regards,
  Felix

Am 2020-04-14 um 8:09 p.m. schrieb Lin, Amber:


[AMD Official Use Only - Internal Distribution Only]

Hi Felix,

That was my assumption too, that each registration would get a different 
file descriptor, but it turns out that is not the case. When I started two 
processes and both registered gpu0 and gpu1, they both got fd=15. If I have 
process A register gpu0+gpu1, and process B only register gpu0, process A 
gets fd=15 and process B gets fd=9. That’s why I added the client ID.


By multiple clients, I mean multiple processes. The ask is that users want 
to have multiple tools, and those different tools can use the rsmi lib to 
watch events at the same time. For the reason above, that two 
processes can actually get the same fd and I need to add a client ID to 
distinguish the registrations, I don’t see the point of limiting it to one 
registration per process unless I use the pid to distinguish the client 
instead, which I also considered when I was writing the 
code. But on second thought, why add this restriction when the client 
ID can allow a tool to watch different events on different devices 
if it wants to? Maybe client ID is a bad term and it misled you. I 
should call it a register ID.


Regards,

Amber

*From:* Kuehling, Felix  
<mailto:felix.kuehl...@amd.com>

*Sent:* Tuesday, April 14, 2020 7:04 PM
*To:* Lin, Amber  <mailto:amber@amd.com>; 
amd-gfx@lists.freedesktop.org <mailto:amd-gfx@lists.freedesktop.org>

*Subject:* Re: [PATCH v4] drm/amdkfd: Provide SMI events watch

Hi Amber,

Some general remarks about the multi-client support. You added a 
global client id that's separate from the file descriptor. That's 
problematic for two reasons:


 1. A process could change a different process' event mask
 2. The FD should already be unique per process, no need to invent
another ID

If we want to allow one process to register for events multiple times 
(multiple FDs per process), then the list of clients should be per 
process. Each process should only be allowed to change the event 
masks of its own clients. The client could be identified by its FD. 
No need for another client ID.


But you could also simplify it further by allowing only one event 
client per process. Then you don't need the client ID lookup at all. 
Just have a single event client in the kfd_process.


Another approach would be to make enable/disable functions of the 
event FD, rather than the KFD FD ioctl. It could be an ioctl of the 
event FD, or even simpler, you could use the write file-operation to 
write an event mask (of arbitrary length if you want to enable growth 
in the future). That way everything would be neatly encapsulated in 
the event FD private data.


Two more comments inline ...

Am 2020-04-14 um 5:30 p.m. schrieb Amber Lin:

When the compute is malfunctioning or performance drops, the system admin

will use the SMI (System Management Interface) tool to monitor/diagnose what

went wrong. This patch provides an event watch interface for the user

space to register devices an

[PATCH v4] drm/amdkfd: Provide SMI events watch

2020-04-14 Thread Amber Lin
When the compute is malfunctioning or performance drops, the system admin
will use the SMI (System Management Interface) tool to monitor/diagnose what
went wrong. This patch provides an event watch interface for user space
to register devices and subscribe to the events they are interested in.
Once registered, the user can call poll() on the anonymous file descriptor
with a wait time specified and wait for events to happen. Once an event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

v2: - remove UNREGISTER and add event ENABLE/DISABLE
- correct kfifo usage
- move event message API to kfd_ioctl.h
v3: send the event msg in text rather than in binary
v4: support multiple clients

Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdkfd/Makefile  |   1 +
 drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  43 +
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_module.c  |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|   3 +
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 235 +++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  33 
 include/uapi/linux/kfd_ioctl.h   |  35 +++-
 9 files changed, 354 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..e1e4115 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,6 +53,7 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_dbgdev.o \
$(AMDKFD_PATH)/kfd_dbgmgr.o \
+   $(AMDKFD_PATH)/kfd_smi_events.o \
$(AMDKFD_PATH)/kfd_crat.o
 
 ifneq ($(CONFIG_AMD_IOMMU_V2),)
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
 #include "kfd_events.h"
 #include "cik_int.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static bool cik_event_interrupt_isr(struct kfd_dev *dev,
const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
struct kfd_vm_fault_info info;
 
+   kfd_smi_event_update_vmfault(dev, pasid);
kfd_process_vm_fault(dev->dqm, pasid);
 
memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f8fa03a..f13fde59 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
 #include "kfd_device_queue_manager.h"
 #include "kfd_dbgmgr.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -1732,6 +1733,45 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
return r;
 }
 
+/* Handle requests for watching SMI events */
+static int kfd_ioctl_smi_events(struct file *filep,
+   struct kfd_process *p, void *data)
+{
+   struct kfd_ioctl_smi_events_args *args = data;
+   uint32_t *ids_array;
+   int ret = 0;
+
+   switch (args->op) {
+   case KFD_SMI_EVENTS_REGISTER:
+   ids_array = kmalloc_array(args->num_gpuids, sizeof(uint32_t),
+ GFP_KERNEL);
+   if (!ids_array)
+   return -ENOMEM;
+   if (copy_from_user(ids_array,
+ (void __user *)args->gpuids_array_ptr,
+ args->num_gpuids * sizeof(uint32_t))) {
+   kfree(ids_array);
+   return -EFAULT;
+   }
+
+   ret = kfd_smi_event_register(args->num_gpuids, ids_array,
+   &args->anon_fd, &args->client_id);
+   if (ret)
+   kfree(ids_array);
+
+   return ret;
+
+   case KFD_SMI_EVENTS_ENABLE:
+   /* subscribe events */
+   return kfd_smi_event_enable(args->client_id, args->events);
+   case KFD_SMI_EVENTS_DISABLE:
+   /* unsubscribe events */
+   return kfd_smi_event_disable(args->client_id, args-&

[PATCH v4] drm/amdkfd: Provide SMI events watch

2020-04-12 Thread Amber Lin
When the compute is malfunctioning or performance drops, the system admin
will use the SMI (System Management Interface) tool to monitor/diagnose what
went wrong. This patch provides an event watch interface for user space
to register devices and subscribe to the events they are interested in.
Once registered, the user can call poll() on the anonymous file descriptor
with a wait time specified and wait for events to happen. Once an event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

v2: - remove UNREGISTER and add event ENABLE/DISABLE
- correct kfifo usage
- move event message API to kfd_ioctl.h
v3: send the event msg in text rather than in binary
v4: support multiple clients

Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdkfd/Makefile  |   1 +
 drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  43 +
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_module.c  |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|   3 +
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 235 +++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  33 
 include/uapi/linux/kfd_ioctl.h   |  35 +++-
 9 files changed, 354 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..e1e4115 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,6 +53,7 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_dbgdev.o \
$(AMDKFD_PATH)/kfd_dbgmgr.o \
+   $(AMDKFD_PATH)/kfd_smi_events.o \
$(AMDKFD_PATH)/kfd_crat.o
 
 ifneq ($(CONFIG_AMD_IOMMU_V2),)
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
 #include "kfd_events.h"
 #include "cik_int.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static bool cik_event_interrupt_isr(struct kfd_dev *dev,
const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
struct kfd_vm_fault_info info;
 
+   kfd_smi_event_update_vmfault(dev, pasid);
kfd_process_vm_fault(dev->dqm, pasid);
 
memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f8fa03a..f13fde59 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
 #include "kfd_device_queue_manager.h"
 #include "kfd_dbgmgr.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -1732,6 +1733,45 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
return r;
 }
 
+/* Handle requests for watching SMI events */
+static int kfd_ioctl_smi_events(struct file *filep,
+   struct kfd_process *p, void *data)
+{
+   struct kfd_ioctl_smi_events_args *args = data;
+   uint32_t *ids_array;
+   int ret = 0;
+
+   switch (args->op) {
+   case KFD_SMI_EVENTS_REGISTER:
+   ids_array = kmalloc_array(args->num_gpuids, sizeof(uint32_t),
+ GFP_KERNEL);
+   if (!ids_array)
+   return -ENOMEM;
+   if (copy_from_user(ids_array,
+ (void __user *)args->gpuids_array_ptr,
+ args->num_gpuids * sizeof(uint32_t))) {
+   kfree(ids_array);
+   return -EFAULT;
+   }
+
+   ret = kfd_smi_event_register(args->num_gpuids, ids_array,
+                       &args->anon_fd, &args->client_id);
+   if (ret)
+   kfree(ids_array);
+
+   return ret;
+
+   case KFD_SMI_EVENTS_ENABLE:
+   /* subscribe events */
+   return kfd_smi_event_enable(args->client_id, args->events);
+   case KFD_SMI_EVENTS_DISABLE:
+   /* unsubscribe events */
+   return kfd_smi_event_disable(args->client_id, args-&

[PATCH v3] drm/amdkfd: Provide SMI events watch

2020-04-06 Thread Amber Lin
When compute is malfunctioning or performance drops, the system admin
will use the SMI (System Management Interface) tool to monitor/diagnose what
went wrong. This patch provides an event watch interface for user
space to register events they are interested in. After the event is
registered, the user can use the anonymous file descriptor's poll function
with a wait time specified to wait for the event to happen. Once the event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

v2: - remove UNREGISTER and add event ENABLE/DISABLE
- correct kfifo usage
- move event message API to kfd_ioctl.h
v3: send the event msg in text rather than in binary

Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdkfd/Makefile  |   1 +
 drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  30 
 drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  13 ++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 174 +++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  42 ++
 include/uapi/linux/kfd_ioctl.h   |  32 -
 9 files changed, 296 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..e1e4115 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,6 +53,7 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_dbgdev.o \
$(AMDKFD_PATH)/kfd_dbgmgr.o \
+   $(AMDKFD_PATH)/kfd_smi_events.o \
$(AMDKFD_PATH)/kfd_crat.o
 
 ifneq ($(CONFIG_AMD_IOMMU_V2),)
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
 #include "kfd_events.h"
 #include "cik_int.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static bool cik_event_interrupt_isr(struct kfd_dev *dev,
const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
struct kfd_vm_fault_info info;
 
+   kfd_smi_event_update_vmfault(dev, pasid);
kfd_process_vm_fault(dev->dqm, pasid);
 
memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f8fa03a..591ac28 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
 #include "kfd_device_queue_manager.h"
 #include "kfd_dbgmgr.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -1243,6 +1244,32 @@ static int kfd_ioctl_acquire_vm(struct file *filep, 
struct kfd_process *p,
return ret;
 }
 
+/* Handle requests for watching SMI events */
+static int kfd_ioctl_smi_events(struct file *filep,
+   struct kfd_process *p, void *data)
+{
+   struct kfd_ioctl_smi_events_args *args = data;
+   struct kfd_dev *dev;
+
+   dev = kfd_device_by_id(args->gpu_id);
+   if (!dev)
+   return -EINVAL;
+
+   switch (args->op) {
+   case KFD_SMI_EVENTS_REGISTER:
+   /* register the device */
+   return kfd_smi_event_register(dev, &args->data);
+   case KFD_SMI_EVENTS_ENABLE:
+   /* subscribe events to the device */
+   return kfd_smi_event_enable(dev, args->events);
+   case KFD_SMI_EVENTS_DISABLE:
+   /* unsubscribe events */
+   return kfd_smi_event_disable(dev, args->events);
+   }
+
+   return -EINVAL;
+}
+
 bool kfd_dev_is_large_bar(struct kfd_dev *dev)
 {
struct kfd_local_mem_info mem_info;
@@ -1827,6 +1854,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
kfd_ioctl_alloc_queue_gws, 0),
+
+   AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
+   kfd_ioctl_smi_events, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_devic

Re: [PATCH v2] drm/amdkfd: Provide SMI events watch

2020-04-03 Thread Amber Lin
On further thought, I'll use struct kfd_smi_msg_header instead 
of struct kfd_smi_msg_vmfault; it is a description of the event. 
This way we make it generic to all events.


On 2020-04-03 9:38 a.m., Amber Lin wrote:
Thanks Felix. I'll make changes accordingly but please pay attention 
to my last reply inline.


On 2020-04-02 7:51 p.m., Felix Kuehling wrote:

On 2020-04-02 4:46 p.m., Amber Lin wrote:
When the compute is malfunctioning or performance drops, the system 
admin
will use SMI (System Management Interface) tool to 
monitor/diagnostic what

went wrong. This patch provides an event watch interface for the user
space to register events they are interested. After the event is
registered, the user can use annoymous file descriptor's poll function
with wait-time specified to wait for the event to happen. Once the 
event

happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

v2: - remove UNREGISTER and add event ENABLE/DISABLE
 - correct kfifo usage
 - move event message API to kfd_ioctl.h

Signed-off-by: Amber Lin 
---
  drivers/gpu/drm/amd/amdkfd/Makefile  |   3 +-
  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  30 
  drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   1 +
  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  12 ++
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 177 
+++

  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  31 
  include/uapi/linux/kfd_ioctl.h   |  30 +++-
  9 files changed, 286 insertions(+), 2 deletions(-)
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile

index 6147462..cc98b4a 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,7 +53,8 @@ AMDKFD_FILES    := $(AMDKFD_PATH)/kfd_module.o \
  $(AMDKFD_PATH)/kfd_int_process_v9.o \
  $(AMDKFD_PATH)/kfd_dbgdev.o \
  $(AMDKFD_PATH)/kfd_dbgmgr.o \
-    $(AMDKFD_PATH)/kfd_crat.o
+    $(AMDKFD_PATH)/kfd_crat.o \
+    $(AMDKFD_PATH)/kfd_smi_events.o
    ifneq ($(CONFIG_AMD_IOMMU_V2),)
  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c

index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
  #include "kfd_events.h"
  #include "cik_int.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
    static bool cik_event_interrupt_isr(struct kfd_dev *dev,
  const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct 
kfd_dev *dev,

  ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
  struct kfd_vm_fault_info info;
  +    kfd_smi_event_update_vmfault(dev, pasid);
  kfd_process_vm_fault(dev->dqm, pasid);
    memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c

index f8fa03a..591ac28 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
  #include "kfd_device_queue_manager.h"
  #include "kfd_dbgmgr.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
    static long kfd_ioctl(struct file *, unsigned int, unsigned long);
  static int kfd_open(struct inode *, struct file *);
@@ -1243,6 +1244,32 @@ static int kfd_ioctl_acquire_vm(struct file 
*filep, struct kfd_process *p,

  return ret;
  }
  +/* Handle requests for watching SMI events */
+static int kfd_ioctl_smi_events(struct file *filep,
+    struct kfd_process *p, void *data)
+{
+    struct kfd_ioctl_smi_events_args *args = data;
+    struct kfd_dev *dev;
+
+    dev = kfd_device_by_id(args->gpu_id);
+    if (!dev)
+    return -EINVAL;
+
+    switch (args->op) {
+    case KFD_SMI_EVENTS_REGISTER:
+    /* register the device */
+    return kfd_smi_event_register(dev, &args->data);
+    case KFD_SMI_EVENTS_ENABLE:
+    /* subscribe events to the device */
+    return kfd_smi_event_enable(dev, args->events);
+    case KFD_SMI_EVENTS_DISABLE:
+    /* unsubscribe events */
+    return kfd_smi_event_disable(dev, args->events);
+    }
+
+    return -EINVAL;
+}
+
  bool kfd_dev_is_large_bar(struct kfd_dev *dev)
  {
  struct kfd_local_mem_info mem_info;
@@ -1827,6 +1854,9 @@ static const struct amdkfd_ioctl_desc 
amdkfd_ioctls[] = {

    AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_

Re: [PATCH v2] drm/amdkfd: Provide SMI events watch

2020-04-03 Thread Amber Lin
Thanks Felix. I'll make changes accordingly but please pay attention to 
my last reply inline.


On 2020-04-02 7:51 p.m., Felix Kuehling wrote:

On 2020-04-02 4:46 p.m., Amber Lin wrote:
When the compute is malfunctioning or performance drops, the system 
admin
will use SMI (System Management Interface) tool to monitor/diagnostic 
what

went wrong. This patch provides an event watch interface for the user
space to register events they are interested. After the event is
registered, the user can use annoymous file descriptor's poll function
with wait-time specified to wait for the event to happen. Once the event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

v2: - remove UNREGISTER and add event ENABLE/DISABLE
 - correct kfifo usage
 - move event message API to kfd_ioctl.h

Signed-off-by: Amber Lin 
---
  drivers/gpu/drm/amd/amdkfd/Makefile  |   3 +-
  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  30 
  drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   1 +
  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  12 ++
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 177 
+++

  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  31 
  include/uapi/linux/kfd_ioctl.h   |  30 +++-
  9 files changed, 286 insertions(+), 2 deletions(-)
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile

index 6147462..cc98b4a 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,7 +53,8 @@ AMDKFD_FILES    := $(AMDKFD_PATH)/kfd_module.o \
  $(AMDKFD_PATH)/kfd_int_process_v9.o \
  $(AMDKFD_PATH)/kfd_dbgdev.o \
  $(AMDKFD_PATH)/kfd_dbgmgr.o \
-    $(AMDKFD_PATH)/kfd_crat.o
+    $(AMDKFD_PATH)/kfd_crat.o \
+    $(AMDKFD_PATH)/kfd_smi_events.o
    ifneq ($(CONFIG_AMD_IOMMU_V2),)
  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c

index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
  #include "kfd_events.h"
  #include "cik_int.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
    static bool cik_event_interrupt_isr(struct kfd_dev *dev,
  const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev 
*dev,

  ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
  struct kfd_vm_fault_info info;
  +    kfd_smi_event_update_vmfault(dev, pasid);
  kfd_process_vm_fault(dev->dqm, pasid);
    memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c

index f8fa03a..591ac28 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
  #include "kfd_device_queue_manager.h"
  #include "kfd_dbgmgr.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
    static long kfd_ioctl(struct file *, unsigned int, unsigned long);
  static int kfd_open(struct inode *, struct file *);
@@ -1243,6 +1244,32 @@ static int kfd_ioctl_acquire_vm(struct file 
*filep, struct kfd_process *p,

  return ret;
  }
  +/* Handle requests for watching SMI events */
+static int kfd_ioctl_smi_events(struct file *filep,
+    struct kfd_process *p, void *data)
+{
+    struct kfd_ioctl_smi_events_args *args = data;
+    struct kfd_dev *dev;
+
+    dev = kfd_device_by_id(args->gpu_id);
+    if (!dev)
+    return -EINVAL;
+
+    switch (args->op) {
+    case KFD_SMI_EVENTS_REGISTER:
+    /* register the device */
+    return kfd_smi_event_register(dev, &args->data);
+    case KFD_SMI_EVENTS_ENABLE:
+    /* subscribe events to the device */
+    return kfd_smi_event_enable(dev, args->events);
+    case KFD_SMI_EVENTS_DISABLE:
+    /* unsubscribe events */
+    return kfd_smi_event_disable(dev, args->events);
+    }
+
+    return -EINVAL;
+}
+
  bool kfd_dev_is_large_bar(struct kfd_dev *dev)
  {
  struct kfd_local_mem_info mem_info;
@@ -1827,6 +1854,9 @@ static const struct amdkfd_ioctl_desc 
amdkfd_ioctls[] = {

    AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
  kfd_ioctl_alloc_queue_gws, 0),
+
+    AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
+    kfd_ioctl_smi_events, 0),
  };
    #define AMDKFD_CORE_IOCTL_COUNT    ARRAY_SIZE(amdkfd_ioctls)
diff --git a/drivers/gpu/

[PATCH v2] drm/amdkfd: Provide SMI events watch

2020-04-02 Thread Amber Lin
When compute is malfunctioning or performance drops, the system admin
will use the SMI (System Management Interface) tool to monitor/diagnose what
went wrong. This patch provides an event watch interface for user
space to register events they are interested in. After the event is
registered, the user can use the anonymous file descriptor's poll function
with a wait time specified to wait for the event to happen. Once the event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

v2: - remove UNREGISTER and add event ENABLE/DISABLE
- correct kfifo usage
- move event message API to kfd_ioctl.h

Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdkfd/Makefile  |   3 +-
 drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  30 
 drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  12 ++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 177 +++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  31 
 include/uapi/linux/kfd_ioctl.h   |  30 +++-
 9 files changed, 286 insertions(+), 2 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..cc98b4a 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,7 +53,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_dbgdev.o \
$(AMDKFD_PATH)/kfd_dbgmgr.o \
-   $(AMDKFD_PATH)/kfd_crat.o
+   $(AMDKFD_PATH)/kfd_crat.o \
+   $(AMDKFD_PATH)/kfd_smi_events.o
 
 ifneq ($(CONFIG_AMD_IOMMU_V2),)
 AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
 #include "kfd_events.h"
 #include "cik_int.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static bool cik_event_interrupt_isr(struct kfd_dev *dev,
const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
struct kfd_vm_fault_info info;
 
+   kfd_smi_event_update_vmfault(dev, pasid);
kfd_process_vm_fault(dev->dqm, pasid);
 
memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f8fa03a..591ac28 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
 #include "kfd_device_queue_manager.h"
 #include "kfd_dbgmgr.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -1243,6 +1244,32 @@ static int kfd_ioctl_acquire_vm(struct file *filep, 
struct kfd_process *p,
return ret;
 }
 
+/* Handle requests for watching SMI events */
+static int kfd_ioctl_smi_events(struct file *filep,
+   struct kfd_process *p, void *data)
+{
+   struct kfd_ioctl_smi_events_args *args = data;
+   struct kfd_dev *dev;
+
+   dev = kfd_device_by_id(args->gpu_id);
+   if (!dev)
+   return -EINVAL;
+
+   switch (args->op) {
+   case KFD_SMI_EVENTS_REGISTER:
+   /* register the device */
+   return kfd_smi_event_register(dev, &args->data);
+   case KFD_SMI_EVENTS_ENABLE:
+   /* subscribe events to the device */
+   return kfd_smi_event_enable(dev, args->events);
+   case KFD_SMI_EVENTS_DISABLE:
+   /* unsubscribe events */
+   return kfd_smi_event_disable(dev, args->events);
+   }
+
+   return -EINVAL;
+}
+
 bool kfd_dev_is_large_bar(struct kfd_dev *dev)
 {
struct kfd_local_mem_info mem_info;
@@ -1827,6 +1854,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
kfd_ioctl_alloc_queue_gws, 0),
+
+   AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
+   kfd_ioctl_smi_events, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device

Re: [PATCH] drm/amdkfd: Provide SMI events watch

2020-04-01 Thread Amber Lin
Thanks Felix for the review. I have a better understanding of how kfifo 
works now and have changed my code quite a bit. Couple of questions 
below inline regarding the gpu_id and data arguments.


Thanks.

Amber

On 2020-03-26 4:53 p.m., Felix Kuehling wrote:


Hi Amber,

I see that this is based on the debugger event code. Jon and I are 
just working through some issues with that code. The lessons from that 
will need to be applied to this as well. But I think we can define 
your API to simplify this a bit.


The basic problem is, that we have one Fifo in the kfd_device, but 
potentially multiple file descriptors referring to it. For the event 
interface I think we can enforce only a single file descriptor per 
device. If there is already one, your register call can fail. See more 
comments inline.


On 2020-03-17 13:57, Amber Lin wrote:

When the compute is malfunctioning or performance drops, the system admin
will use SMI (System Management Interface) tool to monitor/diagnostic what
went wrong. This patch provides an event watch interface for the user
space to register events they are interested. After the event is
registered, the user can use annoymous file descriptor's pull function


pull -> poll


Thank you for spotting the typo. I’ll change that.


with wait-time specified to wait for the event to happen. Once the event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

Signed-off-by: Amber Lin
---
  drivers/gpu/drm/amd/amdkfd/Makefile  |   3 +-
  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  38 ++
  drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   1 +
  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  10 ++
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 143 +++
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  41 +++
  include/uapi/linux/kfd_ioctl.h   |  27 -
  9 files changed, 265 insertions(+), 2 deletions(-)
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..cc98b4a 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,7 +53,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_dbgdev.o \
$(AMDKFD_PATH)/kfd_dbgmgr.o \
-   $(AMDKFD_PATH)/kfd_crat.o
+   $(AMDKFD_PATH)/kfd_crat.o \
+   $(AMDKFD_PATH)/kfd_smi_events.o
  
  ifneq ($(CONFIG_AMD_IOMMU_V2),)

  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
  #include "kfd_events.h"
  #include "cik_int.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
  
  static bool cik_event_interrupt_isr(struct kfd_dev *dev,

const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
struct kfd_vm_fault_info info;
  
+		kfd_smi_event_update_vmfault(dev, pasid);

kfd_process_vm_fault(dev->dqm, pasid);
  
  		memset(&info, 0, sizeof(info));

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f8fa03a..8e92956 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
  #include "kfd_device_queue_manager.h"
  #include "kfd_dbgmgr.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
  
  static long kfd_ioctl(struct file *, unsigned int, unsigned long);

  static int kfd_open(struct inode *, struct file *);
@@ -1243,6 +1244,40 @@ static int kfd_ioctl_acquire_vm(struct file *filep, 
struct kfd_process *p,
return ret;
  }
  
+/* Handle requests for watching SMI events */

+static int kfd_ioctl_smi_events(struct file *filep,
+   struct kfd_process *p, void *data)
+{
+   struct kfd_ioctl_smi_events_args *args = data;
+   struct kfd_dev *dev;
+   int ret = 0;
+
+   dev = kfd_device_by_id(args->gpu_id);
+   if (!dev)
+   return -EINVAL;
+
+   switch (args->op) {
+   case KFD_SMI_EVENTS_REGISTER:
+   ret = kfd_smi_event_register(dev, args->events);
+   if (ret >= 0) {
+ 

Re: [PATCH] drm/amdkfd: Provide SMI events watch

2020-03-24 Thread Amber Lin
Sorry for the messed-up link. This is the link (rocm-smi-lib) which 
makes use of the interface

https://github.com/RadeonOpenCompute/rocm_smi_lib

On 2020-03-23 2:19 p.m., Amber Lin wrote:

Somehow my reply didn't seem to reach the mailing list...

Hi Alex,

https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2FRadeonOpenCompute%2Frocm_smi_libdata=02%7C01%7Camber.lin%40amd.com%7C37d1a82d9e734d9fec6d08d7cf56ce36%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637205844045641423sdata=I%2BVkN3VKYFUiZ0xGW0Yst70rcqrMRXUTcd995RgfRa4%3Dreserved=0 
will use this interface. Those functions will be added to this library:


/* Get a handler for watching events */
rsmi_status_t rsmi_event_init(rsmi_event_handle_t *handle);
/* Register events for the device using the handler from init */ 
rsmi_status_t rsmi_event_register(uint32_t dv_ind, uint32_t events,

    rsmi_event_handle_t *handle);
/* Wait for events. If one of the events happens, a success is 
returned with

 * with details in data.
 */
rsmi_status_t rsmi_event_wait(rsmi_event_handle_t handle, uint32_t 
timeout_ms,

    rsmi_event_data_t *data);
/* Stop watching events */
rsmi_status_t rsmi_event_free(rsmi_event_handle_t handle);

I add the ioctl to /dev/kfd with a debate if it should be in 
/dev/dri/card* or /dev/dri/renderD* instead. The first event to report 
is VM fault in this patch. Other events like RAS errors, PCIe errors, 
GPU reset… etc will be added for the system admin to diagnose the 
system health. I see this as a system feature so I use /dev/kfd. I’ll 
like to hear if people think differently. Thanks.


Thanks.

Amber

On 2020-03-17 3:03 p.m., Alex Deucher wrote:

On Tue, Mar 17, 2020 at 1:57 PM Amber Lin  wrote:
When the compute is malfunctioning or performance drops, the system 
admin
will use SMI (System Management Interface) tool to 
monitor/diagnostic what

went wrong. This patch provides an event watch interface for the user
space to register events they are interested. After the event is
registered, the user can use annoymous file descriptor's pull function
with wait-time specified to wait for the event to happen. Once the 
event

happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

Signed-off-by: Amber Lin 
Can you provide a link to the userspace tools that make use of this 
interface?


Thanks,

Alex


---
  drivers/gpu/drm/amd/amdkfd/Makefile  |   3 +-
  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  38 ++
  drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   1 +
  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  10 ++
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 143 
+++

  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  41 +++
  include/uapi/linux/kfd_ioctl.h   |  27 -
  9 files changed, 265 insertions(+), 2 deletions(-)
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile

index 6147462..cc98b4a 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,7 +53,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
 $(AMDKFD_PATH)/kfd_int_process_v9.o \
 $(AMDKFD_PATH)/kfd_dbgdev.o \
 $(AMDKFD_PATH)/kfd_dbgmgr.o \
-   $(AMDKFD_PATH)/kfd_crat.o
+   $(AMDKFD_PATH)/kfd_crat.o \
+   $(AMDKFD_PATH)/kfd_smi_events.o

  ifneq ($(CONFIG_AMD_IOMMU_V2),)
  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c

index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
  #include "kfd_events.h"
  #include "cik_int.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"

  static bool cik_event_interrupt_isr(struct kfd_dev *dev,
 const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct 
kfd_dev *dev,

 ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
 struct kfd_vm_fault_info info;

+   kfd_smi_event_update_vmfault(dev, pasid);
 kfd_process_vm_fault(dev->dqm, pasid);

 memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c

index f8fa03a..8e92956 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +

Re: [PATCH] drm/amdkfd: Provide SMI events watch

2020-03-23 Thread Amber Lin

Somehow my reply didn't seem to reach the mailing list...

Hi Alex,

https://github.com/RadeonOpenCompute/rocm_smi_lib will use this 
interface. Those functions will be added to this library:


/* Get a handler for watching events */
rsmi_status_t rsmi_event_init(rsmi_event_handle_t *handle);
/* Register events for the device using the handler from init */ 
rsmi_status_t rsmi_event_register(uint32_t dv_ind, uint32_t events,

    rsmi_event_handle_t *handle);
/* Wait for events. If one of the events happens, a success is returned
 * with details in data.
 */
rsmi_status_t rsmi_event_wait(rsmi_event_handle_t handle, uint32_t 
timeout_ms,

    rsmi_event_data_t *data);
/* Stop watching events */
rsmi_status_t rsmi_event_free(rsmi_event_handle_t handle);

I added the ioctl to /dev/kfd, though it is debatable whether it should be in 
/dev/dri/card* or /dev/dri/renderD* instead. The first event reported 
in this patch is the VM fault. Other events such as RAS errors, PCIe errors, 
GPU resets, etc. will be added so the system admin can diagnose system 
health. I see this as a system feature, so I use /dev/kfd. I’d like to 
hear if people think differently. Thanks.


Thanks.

Amber

On 2020-03-17 3:03 p.m., Alex Deucher wrote:

On Tue, Mar 17, 2020 at 1:57 PM Amber Lin  wrote:

When the compute is malfunctioning or performance drops, the system admin
will use SMI (System Management Interface) tool to monitor/diagnostic what
went wrong. This patch provides an event watch interface for the user
space to register events they are interested. After the event is
registered, the user can use annoymous file descriptor's pull function
with wait-time specified to wait for the event to happen. Once the event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

Signed-off-by: Amber Lin 

Can you provide a link to the userspace tools that make use of this interface?

Thanks,

Alex


---
  drivers/gpu/drm/amd/amdkfd/Makefile  |   3 +-
  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  38 ++
  drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   1 +
  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  10 ++
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 143 +++
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  41 +++
  include/uapi/linux/kfd_ioctl.h   |  27 -
  9 files changed, 265 insertions(+), 2 deletions(-)
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..cc98b4a 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,7 +53,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
 $(AMDKFD_PATH)/kfd_int_process_v9.o \
 $(AMDKFD_PATH)/kfd_dbgdev.o \
 $(AMDKFD_PATH)/kfd_dbgmgr.o \
-   $(AMDKFD_PATH)/kfd_crat.o
+   $(AMDKFD_PATH)/kfd_crat.o \
+   $(AMDKFD_PATH)/kfd_smi_events.o

  ifneq ($(CONFIG_AMD_IOMMU_V2),)
  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
  #include "kfd_events.h"
  #include "cik_int.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"

  static bool cik_event_interrupt_isr(struct kfd_dev *dev,
 const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
 ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
 struct kfd_vm_fault_info info;

+   kfd_smi_event_update_vmfault(dev, pasid);
 kfd_process_vm_fault(dev->dqm, pasid);

 memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f8fa03a..8e92956 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
  #include "kfd_device_queue_manager.h"
  #include "kfd_dbgmgr.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"

  static long kfd_ioctl(struct file *, unsigned int, unsigned long);
  static int kfd_open(struct inode *, struct file *);
@@ -1243,6 +1244,40 @@ static int kfd_ioctl_acquire_vm(struct file *filep, 
struct kfd_process *p,
 return ret;
  }

+/* Handle r

[PATCH] drm/amdkfd: Provide SMI events watch

2020-03-17 Thread Amber Lin
When compute is malfunctioning or performance drops, the system admin
will use the SMI (System Management Interface) tool to monitor/diagnose what
went wrong. This patch provides an event watch interface for user
space to register the events it is interested in. After the event is
registered, the user can use the anonymous file descriptor's poll function
with a wait-time specified to wait for the event to happen. Once the event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdkfd/Makefile  |   3 +-
 drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  38 ++
 drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  10 ++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 143 +++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  41 +++
 include/uapi/linux/kfd_ioctl.h   |  27 -
 9 files changed, 265 insertions(+), 2 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..cc98b4a 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,7 +53,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_dbgdev.o \
$(AMDKFD_PATH)/kfd_dbgmgr.o \
-   $(AMDKFD_PATH)/kfd_crat.o
+   $(AMDKFD_PATH)/kfd_crat.o \
+   $(AMDKFD_PATH)/kfd_smi_events.o
 
 ifneq ($(CONFIG_AMD_IOMMU_V2),)
 AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
 #include "kfd_events.h"
 #include "cik_int.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static bool cik_event_interrupt_isr(struct kfd_dev *dev,
const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
struct kfd_vm_fault_info info;
 
+   kfd_smi_event_update_vmfault(dev, pasid);
kfd_process_vm_fault(dev->dqm, pasid);
 
memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f8fa03a..8e92956 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
 #include "kfd_device_queue_manager.h"
 #include "kfd_dbgmgr.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -1243,6 +1244,40 @@ static int kfd_ioctl_acquire_vm(struct file *filep, 
struct kfd_process *p,
return ret;
 }
 
+/* Handle requests for watching SMI events */
+static int kfd_ioctl_smi_events(struct file *filep,
+   struct kfd_process *p, void *data)
+{
+   struct kfd_ioctl_smi_events_args *args = data;
+   struct kfd_dev *dev;
+   int ret = 0;
+
+   dev = kfd_device_by_id(args->gpu_id);
+   if (!dev)
+   return -EINVAL;
+
+   switch (args->op) {
+   case KFD_SMI_EVENTS_REGISTER:
+   ret = kfd_smi_event_register(dev, args->events);
+   if (ret >= 0) {
+   /* When the registration is successful, it returns the
+* annoymous inode. Pass it to the user in data1
+*/
+   args->data1 = ret;
+   ret = 0;
+   }
+   break;
+   case KFD_SMI_EVENTS_UNREGISTER:
+   kfd_smi_event_unregister(dev, args->events);
+   break;
+   default:
+   ret = -EINVAL;
+   break;
+   }
+
+   return ret;
+}
+
 bool kfd_dev_is_large_bar(struct kfd_dev *dev)
 {
struct kfd_local_mem_info mem_info;
@@ -1827,6 +1862,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
kfd_ioctl_alloc_queue_gws, 0),
+
+   AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
+   kfd_ioctl_smi_events, 0),
 };
 
 #define AMDKFD_CORE_IOCT

[PATCH v3] drm/amdkfd: Add queue information to sysfs

2020-02-03 Thread Amber Lin
Provide compute queues information in sysfs under /sys/class/kfd/kfd/proc.
The format is /sys/class/kfd/kfd/proc/<pid>/queues/<queue id>/XX where
XX are size, type, and gpuid three files to represent queue size, queue
type, and the GPU this queue uses. <queue id> folder and files underneath
are generated when a queue is created. They are removed when the queue is
destroyed.

Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h  |  7 ++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c   | 90 ++
 .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c |  2 +
 3 files changed, 99 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index c0b0def..f805f55 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -41,6 +41,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "amd_shared.h"
 
@@ -503,6 +504,9 @@ struct queue {
struct kfd_process  *process;
struct kfd_dev  *device;
void *gws;
+
+   /* procfs */
+   struct kobject kobj;
 };
 
 /*
@@ -730,6 +734,7 @@ struct kfd_process {
 
/* Kobj for our procfs */
struct kobject *kobj;
+   struct kobject *kobj_queues;
struct attribute attr_pasid;
 };
 
@@ -836,6 +841,8 @@ extern struct device *kfd_device;
 /* KFD's procfs */
 void kfd_procfs_init(void);
 void kfd_procfs_shutdown(void);
+int kfd_procfs_add_queue(struct queue *q);
+void kfd_procfs_del_queue(struct queue *q);
 
 /* Topology */
 int kfd_topology_init(void);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 25b90f7..98dcbb9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -132,6 +132,88 @@ void kfd_procfs_shutdown(void)
}
 }
 
+static ssize_t kfd_procfs_queue_show(struct kobject *kobj,
+struct attribute *attr, char *buffer)
+{
+   struct queue *q = container_of(kobj, struct queue, kobj);
+
+   if (!strcmp(attr->name, "size"))
+   return snprintf(buffer, PAGE_SIZE, "%llu",
+   q->properties.queue_size);
+   else if (!strcmp(attr->name, "type"))
+   return snprintf(buffer, PAGE_SIZE, "%d", q->properties.type);
+   else if (!strcmp(attr->name, "gpuid"))
+   return snprintf(buffer, PAGE_SIZE, "%u", q->device->id);
+   else
+   pr_err("Invalid attribute");
+
+   return 0;
+}
+
+static struct attribute attr_queue_size = {
+   .name = "size",
+   .mode = KFD_SYSFS_FILE_MODE
+};
+
+static struct attribute attr_queue_type = {
+   .name = "type",
+   .mode = KFD_SYSFS_FILE_MODE
+};
+
+static struct attribute attr_queue_gpuid = {
+   .name = "gpuid",
+   .mode = KFD_SYSFS_FILE_MODE
+};
+
+static struct attribute *procfs_queue_attrs[] = {
+   &attr_queue_size,
+   &attr_queue_type,
+   &attr_queue_gpuid,
+   NULL
+};
+
+static const struct sysfs_ops procfs_queue_ops = {
+   .show = kfd_procfs_queue_show,
+};
+
+static struct kobj_type procfs_queue_type = {
+   .sysfs_ops = &procfs_queue_ops,
+   .default_attrs = procfs_queue_attrs,
+};
+
+int kfd_procfs_add_queue(struct queue *q)
+{
+   struct kfd_process *proc;
+   int ret;
+
+   if (!q || !q->process)
+   return -EINVAL;
+   proc = q->process;
+
+   /* Create proc/<pid>/queues/<queue id> folder */
+   if (!proc->kobj_queues)
+   return -EFAULT;
+   ret = kobject_init_and_add(&q->kobj, &procfs_queue_type,
+   proc->kobj_queues, "%u", q->properties.queue_id);
+   if (ret < 0) {
+   pr_warn("Creating proc/<pid>/queues/%u failed",
+   q->properties.queue_id);
+   kobject_put(&q->kobj);
+   return ret;
+   }
+
+   return 0;
+}
+
+void kfd_procfs_del_queue(struct queue *q)
+{
+   if (!q)
+   return;
+
+   kobject_del(&q->kobj);
+   kobject_put(&q->kobj);
+}
+
 int kfd_process_create_wq(void)
 {
if (!kfd_process_wq)
@@ -323,6 +405,11 @@ struct kfd_process *kfd_create_process(struct file *filep)
if (ret)
pr_warn("Creating pasid for pid %d failed",
(int)process->lead_thread->pid);
+
+   process->kobj_queues = kobject_create_and_add("queues",
+   process->kobj);
+   if (!process->kobj_queues)
+   pr_warn("Creating KFD proc/queues folder failed");
}
 out:
if (!IS_ERR(process))
@@ -457,6 +544,9 @@ static void kfd_process_wq_release(struct work_struct *wo

[PATCH v2] drm/amdkfd: Add queue information to sysfs

2020-01-31 Thread Amber Lin
Provide compute queues information in sysfs under /sys/class/kfd/kfd/proc.
The format is /sys/class/kfd/kfd/proc/<pid>/queues/<queue id>/XX where
XX are size, type, and gpuid three files to represent queue size, queue
type, and the GPU this queue uses. <queue id> folder and files underneath
are generated when a queue is created. They are removed when the queue is
destroyed.

Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h  |  9 ++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c   | 96 ++
 .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c |  2 +
 3 files changed, 107 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index c0b0def..cb2d2d7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -503,6 +503,12 @@ struct queue {
struct kfd_process  *process;
struct kfd_dev  *device;
void *gws;
+
+   /* procfs */
+   struct kobject *kobj_qid;
+   struct attribute attr_size;
+   struct attribute attr_type;
+   struct attribute attr_gpuid;
 };
 
 /*
@@ -730,6 +736,7 @@ struct kfd_process {
 
/* Kobj for our procfs */
struct kobject *kobj;
+   struct kobject *kobj_queues;
struct attribute attr_pasid;
 };
 
@@ -836,6 +843,8 @@ extern struct device *kfd_device;
 /* KFD's procfs */
 void kfd_procfs_init(void);
 void kfd_procfs_shutdown(void);
+int kfd_procfs_add_queue(struct queue *q);
+void kfd_procfs_del_queue(struct queue *q);
 
 /* Topology */
 int kfd_topology_init(void);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 25b90f7..78ca037 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -132,6 +132,94 @@ void kfd_procfs_shutdown(void)
}
 }
 
+static int kfd_procfs_add_file(const char *name, struct kobject *kobj,
+  struct attribute *attr)
+{
+   int ret;
+
+   attr->name = name;
+   attr->mode = KFD_SYSFS_FILE_MODE;
+   sysfs_attr_init(attr);
+   ret = sysfs_create_file(kobj, attr);
+   if (ret)
+   pr_warn("Creating %s file failed", name);
+   return ret;
+}
+
+static ssize_t kfd_procfs_queue_show(struct kobject *kobj,
+struct attribute *attr, char *buffer)
+{
+   if (!strcmp(attr->name, "size")) {
+   struct queue *q = container_of(attr, struct queue, attr_size);
+   return snprintf(buffer, PAGE_SIZE, "%llu",
+   q->properties.queue_size);
+   } else if (!strcmp(attr->name, "type")) {
+   struct queue *q = container_of(attr, struct queue, attr_type);
+   return snprintf(buffer, PAGE_SIZE, "%d", q->properties.type);
+   } else if (!strcmp(attr->name, "gpuid")) {
+   struct queue *q = container_of(attr, struct queue, attr_gpuid);
+   return snprintf(buffer, PAGE_SIZE, "%u", q->device->id);
+   } else
+   pr_err("Invalid attribute");
+
+   return 0;
+}
+
+static const struct sysfs_ops procfs_queue_ops = {
+   .show = kfd_procfs_queue_show,
+};
+
+static struct kobj_type procfs_queue_type = {
+   .release = kfd_procfs_kobj_release,
+   .sysfs_ops = &procfs_queue_ops,
+};
+
+int kfd_procfs_add_queue(struct queue *q)
+{
+   struct kfd_process *proc;
+   int ret;
+
+   if (!q || !q->process)
+   return -EINVAL;
+   proc = q->process;
+
+   /* Create proc/<pid>/queues/<queue id> folder*/
+   if (!proc->kobj_queues)
+   return -EFAULT;
+   if (q->kobj_qid)
+   return -EEXIST;
+   q->kobj_qid = kfd_alloc_struct(q->kobj_qid);
+   if (!q->kobj_qid)
+   return -ENOMEM;
+   ret = kobject_init_and_add(q->kobj_qid, &procfs_queue_type,
+   proc->kobj_queues, "%u", q->properties.queue_id);
+   if (ret < 0) {
+   pr_warn("Creating proc/<pid>/queues/%u failed",
+   q->properties.queue_id);
+   return ret;
+   }
+
+   /* Create proc/<pid>/queues/<queue id>/XX files */
+   kfd_procfs_add_file("size", q->kobj_qid, &q->attr_size);
+   kfd_procfs_add_file("type", q->kobj_qid, &q->attr_type);
+   kfd_procfs_add_file("gpuid", q->kobj_qid, &q->attr_gpuid);
+
+   return 0;
+}
+
+void kfd_procfs_del_queue(struct queue *q)
+{
+   if (!q || !q->process)
+   return;
+
+   sysfs_remove_file(q->kobj_qid, &q->attr_size);
+   sysfs_remove_file(q->kobj_qid, &q->attr_type);
+   sysfs_remove_file(q->kobj_qid, &q->attr_gpuid);
+   kobject_del(q->kobj_qid);
+   kobject_put(q->kobj_qid);
+   q-&g

[PATCH] drm/amdkfd: Add queue information to sysfs

2020-01-29 Thread Amber Lin
Provide compute queues information in sysfs under /sys/class/kfd/kfd/proc.
The format is /sys/class/kfd/kfd/proc/<pid>/queues/<queue id>/XX where
XX are size, type, and gpuid three files to represent queue size, queue
type, and the GPU this queue uses. <queue id> folder and files underneath
are generated when a queue is created. They are removed when the queue is
destroyed.

Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h  |  9 ++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c   | 99 ++
 .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c |  2 +
 3 files changed, 110 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index c0b0def..cb2d2d7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -503,6 +503,12 @@ struct queue {
struct kfd_process  *process;
struct kfd_dev  *device;
void *gws;
+
+   /* procfs */
+   struct kobject *kobj_qid;
+   struct attribute attr_size;
+   struct attribute attr_type;
+   struct attribute attr_gpuid;
 };
 
 /*
@@ -730,6 +736,7 @@ struct kfd_process {
 
/* Kobj for our procfs */
struct kobject *kobj;
+   struct kobject *kobj_queues;
struct attribute attr_pasid;
 };
 
@@ -836,6 +843,8 @@ extern struct device *kfd_device;
 /* KFD's procfs */
 void kfd_procfs_init(void);
 void kfd_procfs_shutdown(void);
+int kfd_procfs_add_queue(struct queue *q);
+void kfd_procfs_del_queue(struct queue *q);
 
 /* Topology */
 int kfd_topology_init(void);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 25b90f7..0220651 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -132,6 +132,97 @@ void kfd_procfs_shutdown(void)
}
 }
 
+static int kfd_procfs_add_file(const char *name, struct kobject *kobj,
+  struct attribute *attr)
+{
+   int ret;
+
+   attr->name = name;
+   attr->mode = KFD_SYSFS_FILE_MODE;
+   sysfs_attr_init(attr);
+   ret = sysfs_create_file(kobj, attr);
+   if (ret)
+   pr_warn("Creating %s file failed", name);
+   return ret;
+}
+
+static ssize_t kfd_procfs_queue_show(struct kobject *kobj,
+struct attribute *attr, char *buffer)
+{
+   if (!strcmp(attr->name, "size")) {
+   struct queue *q = container_of(attr, struct queue, attr_size);
+   return snprintf(buffer, PAGE_SIZE, "%llu",
+   q->properties.queue_size);
+   } else if (!strcmp(attr->name, "type")) {
+   struct queue *q = container_of(attr, struct queue, attr_type);
+   return snprintf(buffer, PAGE_SIZE, "%d", q->properties.type);
+   } else if (!strcmp(attr->name, "gpuid")) {
+   struct queue *q = container_of(attr, struct queue, attr_gpuid);
+   return snprintf(buffer, PAGE_SIZE, "%u", q->device->id);
+   } else
+   pr_err("Invalid attribute");
+
+   return 0;
+}
+
+static const struct sysfs_ops procfs_queue_ops = {
+   .show = kfd_procfs_queue_show,
+};
+
+static struct kobj_type procfs_queue_type = {
+   .release = kfd_procfs_kobj_release,
+   .sysfs_ops = &procfs_queue_ops,
+};
+
+int kfd_procfs_add_queue(struct queue *q)
+{
+   struct kfd_process *proc;
+   int ret;
+
+   if (!q || !q->process)
+   return -EINVAL;
+   proc = q->process;
+
+   /* Create proc/<pid>/queues/<queue id> folder*/
+   if (!proc->kobj_queues)
+   return -EFAULT;
+   if (q->kobj_qid)
+   return -EEXIST;
+   q->kobj_qid = kfd_alloc_struct(q->kobj_qid);
+   if (!q->kobj_qid)
+   return -ENOMEM;
+   ret = kobject_init_and_add(q->kobj_qid, &procfs_queue_type,
+   proc->kobj_queues, "%u", q->properties.queue_id);
+   if (ret < 0) {
+   pr_warn("Creating proc/<pid>/queues/%u failed",
+   q->properties.queue_id);
+   return ret;
+   }
+
+   /* Create proc/<pid>/queues/<queue id>/XX files */
+   kfd_procfs_add_file("size", q->kobj_qid, &q->attr_size);
+   kfd_procfs_add_file("type", q->kobj_qid, &q->attr_type);
+   kfd_procfs_add_file("gpuid", q->kobj_qid, &q->attr_gpuid);
+
+   return 0;
+}
+
+void kfd_procfs_del_queue(struct queue *q)
+{
+   struct kfd_process *proc;
+
+   if (!q || !q->process)
+   return;
+   proc = q->process;
+
+   sysfs_remove_file(q->kobj_qid, &q->attr_size);
+   sysfs_remove_file(q->kobj_qid, &q->attr_type);
+   sysfs_remove_file(q->kobj_qid, &q->attr_gpuid);
+   k

[PATCH] drm/amdgpu: Relocate some definitions

2018-08-29 Thread Amber Lin
Move some KFD-related (but used in amdgpu_drv.c) definitions from
kfd_priv.h to kgd_kfd_interface.h so we don't need to include kfd_priv.h
in amdgpu_drv.c. This fixes a build failure when AMDGPU is enabled but
MMU_NOTIFIER is not.
This patch also disables KFD-related module options when HSA_AMD is not
enabled.

Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 20 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h   | 28 -
 drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 28 +
 3 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index dd6d8b1..e4d0d72 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -39,7 +39,6 @@
 #include "amdgpu_gem.h"
 
 #include "amdgpu_amdkfd.h"
-#include "kfd_priv.h"
 
 /*
  * KMS wrapper.
@@ -128,15 +127,6 @@ int amdgpu_compute_multipipe = -1;
 int amdgpu_gpu_recovery = -1; /* auto */
 int amdgpu_emu_mode = 0;
 uint amdgpu_smu_memory_pool_size = 0;
-/* KFD parameters */
-int sched_policy = KFD_SCHED_POLICY_HWS;
-int hws_max_conc_proc = 8;
-int cwsr_enable = 1;
-int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT;
-int send_sigterm;
-int debug_largebar;
-int ignore_crat;
-int vega10_noretry;
 
 /**
  * DOC: vramlimit (int)
@@ -542,12 +532,14 @@ MODULE_PARM_DESC(smu_memory_pool_size,
"0x1 = 256Mbyte, 0x2 = 512Mbyte, 0x4 = 1 Gbyte, 0x8 = 2GByte");
 module_param_named(smu_memory_pool_size, amdgpu_smu_memory_pool_size, uint, 
0444);
 
+#ifdef CONFIG_HSA_AMD
 /**
  * DOC: sched_policy (int)
  * Set scheduling policy. Default is HWS(hardware scheduling) with 
over-subscription.
  * Setting 1 disables over-subscription. Setting 2 disables HWS and statically
  * assigns queues to HQDs.
  */
+int sched_policy = KFD_SCHED_POLICY_HWS;
 module_param(sched_policy, int, 0444);
 MODULE_PARM_DESC(sched_policy,
"Scheduling policy (0 = HWS (Default), 1 = HWS without 
over-subscription, 2 = Non-HWS (Used for debugging only)");
@@ -557,6 +549,7 @@ MODULE_PARM_DESC(sched_policy,
  * Maximum number of processes that HWS can schedule concurrently. The maximum 
is the
  * number of VMIDs assigned to the HWS, which is also the default.
  */
+int hws_max_conc_proc = 8;
 module_param(hws_max_conc_proc, int, 0444);
 MODULE_PARM_DESC(hws_max_conc_proc,
"Max # processes HWS can execute concurrently when sched_policy=0 (0 = 
no concurrency, #VMIDs for KFD = Maximum(default))");
@@ -567,6 +560,7 @@ MODULE_PARM_DESC(hws_max_conc_proc,
  * the middle of a compute wave. Default is 1 to enable this feature. Setting 0
  * disables it.
  */
+int cwsr_enable = 1;
 module_param(cwsr_enable, int, 0444);
 MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))");
 
@@ -575,6 +569,7 @@ MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On 
(Default))");
  * Maximum number of queues per device. Valid setting is between 1 and 4096. 
Default
  * is 4096.
  */
+int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT;
 module_param(max_num_of_queues_per_device, int, 0444);
 MODULE_PARM_DESC(max_num_of_queues_per_device,
"Maximum number of supported queues per device (1 = Minimum, 4096 = 
default)");
@@ -584,6 +579,7 @@ MODULE_PARM_DESC(max_num_of_queues_per_device,
  * Send sigterm to HSA process on unhandled exceptions. Default is not to send 
sigterm
  * but just print errors on dmesg. Setting 1 enables sending sigterm.
  */
+int send_sigterm;
 module_param(send_sigterm, int, 0444);
 MODULE_PARM_DESC(send_sigterm,
"Send sigterm to HSA process on unhandled exception (0 = disable, 1 = 
enable)");
@@ -595,6 +591,7 @@ MODULE_PARM_DESC(send_sigterm,
  * size, usually 256MB.
  * Default value is 0, diabled.
  */
+int debug_largebar;
 module_param(debug_largebar, int, 0444);
 MODULE_PARM_DESC(debug_largebar,
"Debug large-bar flag used to simulate large-bar capability on 
non-large bar machine (0 = disable, 1 = enable)");
@@ -605,6 +602,7 @@ MODULE_PARM_DESC(debug_largebar,
  * table to get information about AMD APUs. This option can serve as a 
workaround on
  * systems with a broken CRAT table.
  */
+int ignore_crat;
 module_param(ignore_crat, int, 0444);
 MODULE_PARM_DESC(ignore_crat,
"Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 
= ignore CRAT)");
@@ -615,9 +613,11 @@ MODULE_PARM_DESC(ignore_crat,
  * Setting 1 disables retry.
  * Retry is needed for recoverable page faults.
  */
+int vega10_noretry;
 module_param_named(noretry, vega10_noretry, int, 0644);
 MODULE_PARM_DESC(noretry,
"Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled 
(default), 1 = retry disabled)");
+#endif
 
 static const st

[PATCH] drm/amdgpu: Relocate some definitions

2018-08-28 Thread Amber Lin
Move some KFD-related (but used in amdgpu_drv.c) definitions from
kfd_priv.h to kgd_kfd_interface.h so we don't need to include kfd_priv.h
in amdgpu_drv.c. This fixes a build failure when AMDGPU is enabled but
MMU_NOTIFIER is not.
This patch also disables KFD-related module options when HSA_AMD is not
enabled.

Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 20 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h   | 28 -
 drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 28 +
 3 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index dd6d8b1..e4d0d72 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -39,7 +39,6 @@
 #include "amdgpu_gem.h"
 
 #include "amdgpu_amdkfd.h"
-#include "kfd_priv.h"
 
 /*
  * KMS wrapper.
@@ -128,15 +127,6 @@ int amdgpu_compute_multipipe = -1;
 int amdgpu_gpu_recovery = -1; /* auto */
 int amdgpu_emu_mode = 0;
 uint amdgpu_smu_memory_pool_size = 0;
-/* KFD parameters */
-int sched_policy = KFD_SCHED_POLICY_HWS;
-int hws_max_conc_proc = 8;
-int cwsr_enable = 1;
-int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT;
-int send_sigterm;
-int debug_largebar;
-int ignore_crat;
-int vega10_noretry;
 
 /**
  * DOC: vramlimit (int)
@@ -542,12 +532,14 @@ MODULE_PARM_DESC(smu_memory_pool_size,
"0x1 = 256Mbyte, 0x2 = 512Mbyte, 0x4 = 1 Gbyte, 0x8 = 2GByte");
 module_param_named(smu_memory_pool_size, amdgpu_smu_memory_pool_size, uint, 
0444);
 
+#ifdef CONFIG_HSA_AMD
 /**
  * DOC: sched_policy (int)
  * Set scheduling policy. Default is HWS(hardware scheduling) with 
over-subscription.
  * Setting 1 disables over-subscription. Setting 2 disables HWS and statically
  * assigns queues to HQDs.
  */
+int sched_policy = KFD_SCHED_POLICY_HWS;
 module_param(sched_policy, int, 0444);
 MODULE_PARM_DESC(sched_policy,
"Scheduling policy (0 = HWS (Default), 1 = HWS without 
over-subscription, 2 = Non-HWS (Used for debugging only)");
@@ -557,6 +549,7 @@ MODULE_PARM_DESC(sched_policy,
  * Maximum number of processes that HWS can schedule concurrently. The maximum 
is the
  * number of VMIDs assigned to the HWS, which is also the default.
  */
+int hws_max_conc_proc = 8;
 module_param(hws_max_conc_proc, int, 0444);
 MODULE_PARM_DESC(hws_max_conc_proc,
"Max # processes HWS can execute concurrently when sched_policy=0 (0 = 
no concurrency, #VMIDs for KFD = Maximum(default))");
@@ -567,6 +560,7 @@ MODULE_PARM_DESC(hws_max_conc_proc,
  * the middle of a compute wave. Default is 1 to enable this feature. Setting 0
  * disables it.
  */
+int cwsr_enable = 1;
 module_param(cwsr_enable, int, 0444);
 MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))");
 
@@ -575,6 +569,7 @@ MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On 
(Default))");
  * Maximum number of queues per device. Valid setting is between 1 and 4096. 
Default
  * is 4096.
  */
+int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT;
 module_param(max_num_of_queues_per_device, int, 0444);
 MODULE_PARM_DESC(max_num_of_queues_per_device,
"Maximum number of supported queues per device (1 = Minimum, 4096 = 
default)");
@@ -584,6 +579,7 @@ MODULE_PARM_DESC(max_num_of_queues_per_device,
  * Send sigterm to HSA process on unhandled exceptions. Default is not to send 
sigterm
  * but just print errors on dmesg. Setting 1 enables sending sigterm.
  */
+int send_sigterm;
 module_param(send_sigterm, int, 0444);
 MODULE_PARM_DESC(send_sigterm,
"Send sigterm to HSA process on unhandled exception (0 = disable, 1 = 
enable)");
@@ -595,6 +591,7 @@ MODULE_PARM_DESC(send_sigterm,
  * size, usually 256MB.
  * Default value is 0, diabled.
  */
+int debug_largebar;
 module_param(debug_largebar, int, 0444);
 MODULE_PARM_DESC(debug_largebar,
"Debug large-bar flag used to simulate large-bar capability on 
non-large bar machine (0 = disable, 1 = enable)");
@@ -605,6 +602,7 @@ MODULE_PARM_DESC(debug_largebar,
  * table to get information about AMD APUs. This option can serve as a 
workaround on
  * systems with a broken CRAT table.
  */
+int ignore_crat;
 module_param(ignore_crat, int, 0444);
 MODULE_PARM_DESC(ignore_crat,
"Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 
= ignore CRAT)");
@@ -615,9 +613,11 @@ MODULE_PARM_DESC(ignore_crat,
  * Setting 1 disables retry.
  * Retry is needed for recoverable page faults.
  */
+int vega10_noretry;
 module_param_named(noretry, vega10_noretry, int, 0644);
 MODULE_PARM_DESC(noretry,
"Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled 
(default), 1 = retry disabled)");
+#endif
 
 static const st

[PATCH v2] drm/amdgpu: Move KFD parameters to amdgpu

2018-08-24 Thread Amber Lin
After merging KFD into amdgpu, move module parameters defined in KFD to
amdgpu_drv.c, where other module parameters are declared.

v2: add kernel-doc comments

Change-Id: I2de8d6c96bb49554c028bbc84bdb194f974c9278
Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 87 +
 drivers/gpu/drm/amd/amdkfd/kfd_module.c | 40 ---
 2 files changed, 87 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 2221f6b..dd6d8b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -39,6 +39,7 @@
 #include "amdgpu_gem.h"
 
 #include "amdgpu_amdkfd.h"
+#include "kfd_priv.h"
 
 /*
  * KMS wrapper.
@@ -127,6 +128,15 @@ int amdgpu_compute_multipipe = -1;
 int amdgpu_gpu_recovery = -1; /* auto */
 int amdgpu_emu_mode = 0;
 uint amdgpu_smu_memory_pool_size = 0;
+/* KFD parameters */
+int sched_policy = KFD_SCHED_POLICY_HWS;
+int hws_max_conc_proc = 8;
+int cwsr_enable = 1;
+int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT;
+int send_sigterm;
+int debug_largebar;
+int ignore_crat;
+int vega10_noretry;
 
 /**
  * DOC: vramlimit (int)
@@ -532,6 +542,83 @@ MODULE_PARM_DESC(smu_memory_pool_size,
"0x1 = 256Mbyte, 0x2 = 512Mbyte, 0x4 = 1 Gbyte, 0x8 = 2GByte");
 module_param_named(smu_memory_pool_size, amdgpu_smu_memory_pool_size, uint, 
0444);
 
+/**
+ * DOC: sched_policy (int)
+ * Set scheduling policy. Default is HWS(hardware scheduling) with 
over-subscription.
+ * Setting 1 disables over-subscription. Setting 2 disables HWS and statically
+ * assigns queues to HQDs.
+ */
+module_param(sched_policy, int, 0444);
+MODULE_PARM_DESC(sched_policy,
+   "Scheduling policy (0 = HWS (Default), 1 = HWS without 
over-subscription, 2 = Non-HWS (Used for debugging only)");
+
+/**
+ * DOC: hws_max_conc_proc (int)
+ * Maximum number of processes that HWS can schedule concurrently. The maximum 
is the
+ * number of VMIDs assigned to the HWS, which is also the default.
+ */
+module_param(hws_max_conc_proc, int, 0444);
+MODULE_PARM_DESC(hws_max_conc_proc,
+   "Max # processes HWS can execute concurrently when sched_policy=0 (0 = 
no concurrency, #VMIDs for KFD = Maximum(default))");
+
+/**
+ * DOC: cwsr_enable (int)
+ * CWSR(compute wave store and resume) allows the GPU to preempt shader 
execution in
+ * the middle of a compute wave. Default is 1 to enable this feature. Setting 0
+ * disables it.
+ */
+module_param(cwsr_enable, int, 0444);
+MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))");
+
+/**
+ * DOC: max_num_of_queues_per_device (int)
+ * Maximum number of queues per device. Valid setting is between 1 and 4096. 
Default
+ * is 4096.
+ */
+module_param(max_num_of_queues_per_device, int, 0444);
+MODULE_PARM_DESC(max_num_of_queues_per_device,
+   "Maximum number of supported queues per device (1 = Minimum, 4096 = 
default)");
+
+/**
+ * DOC: send_sigterm (int)
+ * Send sigterm to HSA process on unhandled exceptions. Default is not to send 
sigterm
+ * but just print errors on dmesg. Setting 1 enables sending sigterm.
+ */
+module_param(send_sigterm, int, 0444);
+MODULE_PARM_DESC(send_sigterm,
+   "Send sigterm to HSA process on unhandled exception (0 = disable, 1 = 
enable)");
+
+/**
+ * DOC: debug_largebar (int)
+ * Set debug_largebar as 1 to enable simulating large-bar capability on 
non-large bar
+ * system. This limits the VRAM size reported to ROCm applications to the 
visible
+ * size, usually 256MB.
+ * Default value is 0, diabled.
+ */
+module_param(debug_largebar, int, 0444);
+MODULE_PARM_DESC(debug_largebar,
+   "Debug large-bar flag used to simulate large-bar capability on 
non-large bar machine (0 = disable, 1 = enable)");
+
+/**
+ * DOC: ignore_crat (int)
+ * Ignore CRAT table during KFD initialization. By default, KFD uses the ACPI 
CRAT
+ * table to get information about AMD APUs. This option can serve as a 
workaround on
+ * systems with a broken CRAT table.
+ */
+module_param(ignore_crat, int, 0444);
+MODULE_PARM_DESC(ignore_crat,
+   "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 
= ignore CRAT)");
+
+/**
+ * DOC: noretry (int)
+ * This parameter sets sh_mem_config.retry_disable. Default value, 0, enables 
retry.
+ * Setting 1 disables retry.
+ * Retry is needed for recoverable page faults.
+ */
+module_param_named(noretry, vega10_noretry, int, 0644);
+MODULE_PARM_DESC(noretry,
+   "Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled 
(default), 1 = retry disabled)");
+
 static const struct pci_device_id pciidlist[] = {
 #ifdef  CONFIG_DRM_AMDGPU_SI
{0x1002, 0x6780, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TAHITI},
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c 
b/drivers/gpu/drm/amd/a

[PATCH v2] drm/amdgpu: Merge amdkfd into amdgpu

2018-08-24 Thread Amber Lin
Since KFD is only supported by single GPU driver, it makes sense to merge
amdgpu and amdkfd into one module. This patch is the initial step: merge
Kconfig and Makefile.

v2: also remove kfd from drm Kconfig

Change-Id: I21c996ba29d393c1bf8064bdb2f5d89541159649
Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/Kconfig |  2 -
 drivers/gpu/drm/amd/amdgpu/Kconfig  |  1 +
 drivers/gpu/drm/amd/amdgpu/Makefile |  6 ++-
 drivers/gpu/drm/amd/amdkfd/Kconfig  |  2 +-
 drivers/gpu/drm/amd/amdkfd/Makefile | 53 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_module.c | 76 ++---
 6 files changed, 63 insertions(+), 77 deletions(-)

diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
index 2a72d2f..5ea1ac3 100644
--- a/drivers/gpu/drm/Kconfig
+++ b/drivers/gpu/drm/Kconfig
@@ -264,8 +264,6 @@ source "drivers/gpu/drm/bridge/Kconfig"
 
 source "drivers/gpu/drm/sti/Kconfig"
 
-source "drivers/gpu/drm/amd/amdkfd/Kconfig"
-
 source "drivers/gpu/drm/imx/Kconfig"
 
 source "drivers/gpu/drm/v3d/Kconfig"
diff --git a/drivers/gpu/drm/amd/amdgpu/Kconfig 
b/drivers/gpu/drm/amd/amdgpu/Kconfig
index e8af1f5..9221e54 100644
--- a/drivers/gpu/drm/amd/amdgpu/Kconfig
+++ b/drivers/gpu/drm/amd/amdgpu/Kconfig
@@ -42,3 +42,4 @@ config DRM_AMDGPU_GART_DEBUGFS
 
 source "drivers/gpu/drm/amd/acp/Kconfig"
 source "drivers/gpu/drm/amd/display/Kconfig"
+source "drivers/gpu/drm/amd/amdkfd/Kconfig"
diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index d2bafab..847536b 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -35,7 +35,8 @@ ccflags-y := -I$(FULL_AMD_PATH)/include/asic_reg \
-I$(FULL_AMD_DISPLAY_PATH) \
-I$(FULL_AMD_DISPLAY_PATH)/include \
-I$(FULL_AMD_DISPLAY_PATH)/dc \
-   -I$(FULL_AMD_DISPLAY_PATH)/amdgpu_dm
+   -I$(FULL_AMD_DISPLAY_PATH)/amdgpu_dm \
+   -I$(FULL_AMD_PATH)/amdkfd
 
 amdgpu-y := amdgpu_drv.o
 
@@ -136,6 +137,9 @@ amdgpu-y += \
 amdgpu-y += amdgpu_amdkfd.o
 
 ifneq ($(CONFIG_HSA_AMD),)
+AMDKFD_PATH := ../amdkfd
+include $(FULL_AMD_PATH)/amdkfd/Makefile
+amdgpu-y += $(AMDKFD_FILES)
 amdgpu-y += \
 amdgpu_amdkfd_fence.o \
 amdgpu_amdkfd_gpuvm.o \
diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig 
b/drivers/gpu/drm/amd/amdkfd/Kconfig
index 3858820..fbf0ee5 100644
--- a/drivers/gpu/drm/amd/amdkfd/Kconfig
+++ b/drivers/gpu/drm/amd/amdkfd/Kconfig
@@ -3,7 +3,7 @@
 #
 
 config HSA_AMD
-   tristate "HSA kernel driver for AMD GPU devices"
+   bool "HSA kernel driver for AMD GPU devices"
depends on DRM_AMDGPU && X86_64
imply AMD_IOMMU_V2
select MMU_NOTIFIER
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index ffd096f..69ec969 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -23,26 +23,41 @@
 # Makefile for Heterogenous System Architecture support for AMD GPU devices
 #
 
-ccflags-y := -Idrivers/gpu/drm/amd/include/  \
-   -Idrivers/gpu/drm/amd/include/asic_reg
-
-amdkfd-y   := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \
-   kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \
-   kfd_process.o kfd_queue.o kfd_mqd_manager.o \
-   kfd_mqd_manager_cik.o kfd_mqd_manager_vi.o \
-   kfd_mqd_manager_v9.o \
-   kfd_kernel_queue.o kfd_kernel_queue_cik.o \
-   kfd_kernel_queue_vi.o kfd_kernel_queue_v9.o \
-   kfd_packet_manager.o kfd_process_queue_manager.o \
-   kfd_device_queue_manager.o kfd_device_queue_manager_cik.o \
-   kfd_device_queue_manager_vi.o kfd_device_queue_manager_v9.o \
-   kfd_interrupt.o kfd_events.o cik_event_interrupt.o \
-   kfd_int_process_v9.o kfd_dbgdev.o kfd_dbgmgr.o kfd_crat.o
+AMDKFD_FILES   := $(AMDKFD_PATH)/kfd_module.o \
+   $(AMDKFD_PATH)/kfd_device.o \
+   $(AMDKFD_PATH)/kfd_chardev.o \
+   $(AMDKFD_PATH)/kfd_topology.o \
+   $(AMDKFD_PATH)/kfd_pasid.o \
+   $(AMDKFD_PATH)/kfd_doorbell.o \
+   $(AMDKFD_PATH)/kfd_flat_memory.o \
+   $(AMDKFD_PATH)/kfd_process.o \
+   $(AMDKFD_PATH)/kfd_queue.o \
+   $(AMDKFD_PATH)/kfd_mqd_manager.o \
+   $(AMDKFD_PATH)/kfd_mqd_manager_cik.o \
+   $(AMDKFD_PATH)/kfd_mqd_manager_vi.o \
+   $(AMDKFD_PATH)/kfd_mqd_manager_v9.o \
+   $(AMDKFD_PATH)/kfd_kernel_queue.o \
+   $(AMDKFD_PATH)/kfd_kernel_queue_cik.o \
+   $(AMDKFD_PATH)/kfd_kernel_queue_vi.o \
+   $(AMDKFD_PATH)/kfd_kernel_queue_v9.o \
+   $(AMDKFD_PATH)/kfd_packet_manager.o \
+   $(AMDKFD_PATH)/kfd_process

[PATCH 1/3] drm/amdgpu: Merge amdkfd into amdgpu

2018-08-23 Thread Amber Lin
Since KFD is only supported by a single GPU driver, it makes sense to merge
amdgpu and amdkfd into one module. This patch is the initial step: merging
the Kconfig and Makefile.

Change-Id: I21c996ba29d393c1bf8064bdb2f5d89541159649
Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdgpu/Kconfig  |  1 +
 drivers/gpu/drm/amd/amdgpu/Makefile |  6 ++-
 drivers/gpu/drm/amd/amdkfd/Kconfig  |  2 +-
 drivers/gpu/drm/amd/amdkfd/Makefile | 53 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_module.c | 76 ++---
 5 files changed, 63 insertions(+), 75 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/Kconfig 
b/drivers/gpu/drm/amd/amdgpu/Kconfig
index e8af1f5..9221e54 100644
--- a/drivers/gpu/drm/amd/amdgpu/Kconfig
+++ b/drivers/gpu/drm/amd/amdgpu/Kconfig
@@ -42,3 +42,4 @@ config DRM_AMDGPU_GART_DEBUGFS
 
 source "drivers/gpu/drm/amd/acp/Kconfig"
 source "drivers/gpu/drm/amd/display/Kconfig"
+source "drivers/gpu/drm/amd/amdkfd/Kconfig"
diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index d2bafab..847536b 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -35,7 +35,8 @@ ccflags-y := -I$(FULL_AMD_PATH)/include/asic_reg \
-I$(FULL_AMD_DISPLAY_PATH) \
-I$(FULL_AMD_DISPLAY_PATH)/include \
-I$(FULL_AMD_DISPLAY_PATH)/dc \
-   -I$(FULL_AMD_DISPLAY_PATH)/amdgpu_dm
+   -I$(FULL_AMD_DISPLAY_PATH)/amdgpu_dm \
+   -I$(FULL_AMD_PATH)/amdkfd
 
 amdgpu-y := amdgpu_drv.o
 
@@ -136,6 +137,9 @@ amdgpu-y += \
 amdgpu-y += amdgpu_amdkfd.o
 
 ifneq ($(CONFIG_HSA_AMD),)
+AMDKFD_PATH := ../amdkfd
+include $(FULL_AMD_PATH)/amdkfd/Makefile
+amdgpu-y += $(AMDKFD_FILES)
 amdgpu-y += \
 amdgpu_amdkfd_fence.o \
 amdgpu_amdkfd_gpuvm.o \
diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig 
b/drivers/gpu/drm/amd/amdkfd/Kconfig
index 3858820..fbf0ee5 100644
--- a/drivers/gpu/drm/amd/amdkfd/Kconfig
+++ b/drivers/gpu/drm/amd/amdkfd/Kconfig
@@ -3,7 +3,7 @@
 #
 
 config HSA_AMD
-   tristate "HSA kernel driver for AMD GPU devices"
+   bool "HSA kernel driver for AMD GPU devices"
depends on DRM_AMDGPU && X86_64
imply AMD_IOMMU_V2
select MMU_NOTIFIER
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index ffd096f..69ec969 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -23,26 +23,41 @@
 # Makefile for Heterogenous System Architecture support for AMD GPU devices
 #
 
-ccflags-y := -Idrivers/gpu/drm/amd/include/  \
-   -Idrivers/gpu/drm/amd/include/asic_reg
-
-amdkfd-y   := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \
-   kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \
-   kfd_process.o kfd_queue.o kfd_mqd_manager.o \
-   kfd_mqd_manager_cik.o kfd_mqd_manager_vi.o \
-   kfd_mqd_manager_v9.o \
-   kfd_kernel_queue.o kfd_kernel_queue_cik.o \
-   kfd_kernel_queue_vi.o kfd_kernel_queue_v9.o \
-   kfd_packet_manager.o kfd_process_queue_manager.o \
-   kfd_device_queue_manager.o kfd_device_queue_manager_cik.o \
-   kfd_device_queue_manager_vi.o kfd_device_queue_manager_v9.o \
-   kfd_interrupt.o kfd_events.o cik_event_interrupt.o \
-   kfd_int_process_v9.o kfd_dbgdev.o kfd_dbgmgr.o kfd_crat.o
+AMDKFD_FILES   := $(AMDKFD_PATH)/kfd_module.o \
+   $(AMDKFD_PATH)/kfd_device.o \
+   $(AMDKFD_PATH)/kfd_chardev.o \
+   $(AMDKFD_PATH)/kfd_topology.o \
+   $(AMDKFD_PATH)/kfd_pasid.o \
+   $(AMDKFD_PATH)/kfd_doorbell.o \
+   $(AMDKFD_PATH)/kfd_flat_memory.o \
+   $(AMDKFD_PATH)/kfd_process.o \
+   $(AMDKFD_PATH)/kfd_queue.o \
+   $(AMDKFD_PATH)/kfd_mqd_manager.o \
+   $(AMDKFD_PATH)/kfd_mqd_manager_cik.o \
+   $(AMDKFD_PATH)/kfd_mqd_manager_vi.o \
+   $(AMDKFD_PATH)/kfd_mqd_manager_v9.o \
+   $(AMDKFD_PATH)/kfd_kernel_queue.o \
+   $(AMDKFD_PATH)/kfd_kernel_queue_cik.o \
+   $(AMDKFD_PATH)/kfd_kernel_queue_vi.o \
+   $(AMDKFD_PATH)/kfd_kernel_queue_v9.o \
+   $(AMDKFD_PATH)/kfd_packet_manager.o \
+   $(AMDKFD_PATH)/kfd_process_queue_manager.o \
+   $(AMDKFD_PATH)/kfd_device_queue_manager.o \
+   $(AMDKFD_PATH)/kfd_device_queue_manager_cik.o \
+   $(AMDKFD_PATH)/kfd_device_queue_manager_vi.o \
+   $(AMDKFD_PATH)/kfd_device_queue_manager_v9.o \
+   $(AMDKFD_PATH)/kfd_interrupt.o \
+   $(AMDKFD_PATH)/kfd_events.o \
+   $(AMDKFD_PATH)/cik_event_interrupt.o \
+   $(AMDKFD_PATH)/kfd_int_process_v9.o \
+   $(AMDKFD_PATH)/kfd_dbgdev.o \
+   $

[PATCH 2/3] drm/amdgpu: Remove CONFIG_HSA_AMD_MODULE

2018-08-23 Thread Amber Lin
After amdkfd is merged into amdgpu, CONFIG_HSA_AMD_MODULE no longer exists.

Change-Id: I42096cdf887e0d776075f3dd3e8d3f153aff4e85
Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 26 +++---
 1 file changed, 3 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index e3ed08d..8c652ec 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -36,36 +36,16 @@ int amdgpu_amdkfd_init(void)
 {
int ret;
 
-#if defined(CONFIG_HSA_AMD_MODULE)
-   int (*kgd2kfd_init_p)(unsigned int, const struct kgd2kfd_calls**);
-
-   kgd2kfd_init_p = symbol_request(kgd2kfd_init);
-
-   if (kgd2kfd_init_p == NULL)
-   return -ENOENT;
-
-   ret = kgd2kfd_init_p(KFD_INTERFACE_VERSION, &kgd2kfd);
-   if (ret) {
-   symbol_put(kgd2kfd_init);
-   kgd2kfd = NULL;
-   }
-
-
-#elif defined(CONFIG_HSA_AMD)
-
+#ifdef CONFIG_HSA_AMD
ret = kgd2kfd_init(KFD_INTERFACE_VERSION, &kgd2kfd);
if (ret)
kgd2kfd = NULL;
-
+   amdgpu_amdkfd_gpuvm_init_mem_limits();
 #else
kgd2kfd = NULL;
ret = -ENOENT;
 #endif
 
-#if defined(CONFIG_HSA_AMD_MODULE) || defined(CONFIG_HSA_AMD)
-   amdgpu_amdkfd_gpuvm_init_mem_limits();
-#endif
-
return ret;
 }
 
@@ -471,7 +451,7 @@ bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, 
u32 vmid)
return false;
 }
 
-#if !defined(CONFIG_HSA_AMD_MODULE) && !defined(CONFIG_HSA_AMD)
+#ifndef CONFIG_HSA_AMD
 bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm)
 {
return false;
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 3/3] drm/amdgpu: Move KFD parameters to amdgpu

2018-08-23 Thread Amber Lin
After merging KFD into amdgpu, move module parameters defined in KFD to
amdgpu_drv.c, where other module parameters are declared.

Change-Id: I2de8d6c96bb49554c028bbc84bdb194f974c9278
Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 41 +
 drivers/gpu/drm/amd/amdkfd/kfd_module.c | 40 
 2 files changed, 41 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 2221f6b..af9a766 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -39,6 +39,7 @@
 #include "amdgpu_gem.h"
 
 #include "amdgpu_amdkfd.h"
+#include "kfd_priv.h"
 
 /*
  * KMS wrapper.
@@ -127,6 +128,15 @@ int amdgpu_compute_multipipe = -1;
 int amdgpu_gpu_recovery = -1; /* auto */
 int amdgpu_emu_mode = 0;
 uint amdgpu_smu_memory_pool_size = 0;
+/* KFD parameters */
+int sched_policy = KFD_SCHED_POLICY_HWS;
+int hws_max_conc_proc = 8;
+int cwsr_enable = 1;
+int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT;
+int send_sigterm;
+int debug_largebar;
+int ignore_crat;
+int vega10_noretry;
 
 /**
  * DOC: vramlimit (int)
@@ -532,6 +542,37 @@ MODULE_PARM_DESC(smu_memory_pool_size,
"0x1 = 256Mbyte, 0x2 = 512Mbyte, 0x4 = 1 Gbyte, 0x8 = 2GByte");
 module_param_named(smu_memory_pool_size, amdgpu_smu_memory_pool_size, uint, 
0444);
 
+module_param(sched_policy, int, 0444);
+MODULE_PARM_DESC(sched_policy,
+   "Scheduling policy (0 = HWS (Default), 1 = HWS without 
over-subscription, 2 = Non-HWS (Used for debugging only)");
+
+module_param(hws_max_conc_proc, int, 0444);
+MODULE_PARM_DESC(hws_max_conc_proc,
+   "Max # processes HWS can execute concurrently when sched_policy=0 (0 = 
no concurrency, #VMIDs for KFD = Maximum(default))");
+
+module_param(cwsr_enable, int, 0444);
+MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))");
+
+module_param(max_num_of_queues_per_device, int, 0444);
+MODULE_PARM_DESC(max_num_of_queues_per_device,
+   "Maximum number of supported queues per device (1 = Minimum, 4096 = 
default)");
+
+module_param(send_sigterm, int, 0444);
+MODULE_PARM_DESC(send_sigterm,
+   "Send sigterm to HSA process on unhandled exception (0 = disable, 1 = 
enable)");
+
+module_param(debug_largebar, int, 0444);
+MODULE_PARM_DESC(debug_largebar,
+   "Debug large-bar flag used to simulate large-bar capability on 
non-large bar machine (0 = disable, 1 = enable)");
+
+module_param(ignore_crat, int, 0444);
+MODULE_PARM_DESC(ignore_crat,
+   "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 
= ignore CRAT)");
+
+module_param_named(noretry, vega10_noretry, int, 0644);
+MODULE_PARM_DESC(noretry,
+   "Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled 
(default), 1 = retry disabled)");
+
 static const struct pci_device_id pciidlist[] = {
 #ifdef  CONFIG_DRM_AMDGPU_SI
{0x1002, 0x6780, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TAHITI},
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
index 8847514..5f4977b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
@@ -21,7 +21,6 @@
  */
 
 #include 
-#include 
 #include 
 #include "kfd_priv.h"
 
@@ -39,45 +38,6 @@ static const struct kgd2kfd_calls kgd2kfd = {
  kgd2kfd_schedule_evict_and_restore_process,
 };
 
-int sched_policy = KFD_SCHED_POLICY_HWS;
-module_param(sched_policy, int, 0444);
-MODULE_PARM_DESC(sched_policy,
-   "Scheduling policy (0 = HWS (Default), 1 = HWS without 
over-subscription, 2 = Non-HWS (Used for debugging only)");
-
-int hws_max_conc_proc = 8;
-module_param(hws_max_conc_proc, int, 0444);
-MODULE_PARM_DESC(hws_max_conc_proc,
-   "Max # processes HWS can execute concurrently when sched_policy=0 (0 = 
no concurrency, #VMIDs for KFD = Maximum(default))");
-
-int cwsr_enable = 1;
-module_param(cwsr_enable, int, 0444);
-MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))");
-
-int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT;
-module_param(max_num_of_queues_per_device, int, 0444);
-MODULE_PARM_DESC(max_num_of_queues_per_device,
-   "Maximum number of supported queues per device (1 = Minimum, 4096 = 
default)");
-
-int send_sigterm;
-module_param(send_sigterm, int, 0444);
-MODULE_PARM_DESC(send_sigterm,
-   "Send sigterm to HSA process on unhandled exception (0 = disable, 1 = 
enable)");
-
-int debug_largebar;
-module_param(debug_largebar, int, 0444);
-MODULE_PARM_DESC(debug_largebar,
-   "Debug large-bar flag used to simulate large-bar capability on 
non-large bar machine (0 = disable, 1 = enable)");
-
-in

[PATCH v2] drm/amdgpu: Map all visible VRAM at startup

2018-02-27 Thread Amber Lin
When using the CPU to update page tables, we need to kmap all the PDs/PTs
after they are allocated, and that requires a TLB shootdown on each CPU,
which is quite heavy.

Instead, we map the whole visible VRAM to a kernel address at once. Pages
can be obtained from the offset.

v2: move the mapping base from gmc to the amdgpu_mman structure, and move the
implementation into the amdgpu_ttm_* functions

Change-Id: I56574bd544dae273da50e8b5dd6894cd5d9454bd
Signed-off-by: Amber Lin <amber@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 17 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  1 +
 2 files changed, 18 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index e38e6db..f126a5a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -621,6 +621,7 @@ static int amdgpu_ttm_io_mem_reserve(struct ttm_bo_device 
*bdev, struct ttm_mem_
 {
struct ttm_mem_type_manager *man = &bdev->man[mem->mem_type];
struct amdgpu_device *adev = amdgpu_ttm_adev(bdev);
+   struct drm_mm_node *mm_node = mem->mm_node;
 
mem->bus.addr = NULL;
mem->bus.offset = 0;
@@ -640,6 +641,15 @@ static int amdgpu_ttm_io_mem_reserve(struct ttm_bo_device 
*bdev, struct ttm_mem_
/* check if it's visible */
if ((mem->bus.offset + mem->bus.size) > 
adev->gmc.visible_vram_size)
return -EINVAL;
+   /* Only physically contiguous buffers apply. In a contiguous
+* buffer, size of the first mm_node would match the number of
+* pages in ttm_mem_reg.
+*/
+   if (adev->mman.aper_base_kaddr &&
+   (mm_node->size == mem->num_pages))
+   mem->bus.addr = (u8 *)adev->mman.aper_base_kaddr +
+   mem->bus.offset;
+
mem->bus.base = adev->gmc.aper_base;
mem->bus.is_iomem = true;
break;
@@ -1402,6 +1412,10 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
 
/* Change the size here instead of the init above so only lpfn is 
affected */
amdgpu_ttm_set_active_vram_size(adev, adev->gmc.visible_vram_size);
+#ifdef CONFIG_64BIT
+   adev->mman.aper_base_kaddr = ioremap_wc(adev->gmc.aper_base,
+   adev->gmc.visible_vram_size);
+#endif
 
/*
 *The reserved vram for firmware must be pinned to the specified
@@ -1494,6 +1508,9 @@ void amdgpu_ttm_fini(struct amdgpu_device *adev)
amdgpu_ttm_debugfs_fini(adev);
amdgpu_bo_free_kernel(&adev->stolen_vga_memory, NULL, NULL);
amdgpu_ttm_fw_reserve_vram_fini(adev);
+   if (adev->mman.aper_base_kaddr)
+   iounmap(adev->mman.aper_base_kaddr);
+   adev->mman.aper_base_kaddr = NULL;
 
ttm_bo_clean_mm(&adev->mman.bdev, TTM_PL_VRAM);
ttm_bo_clean_mm(&adev->mman.bdev, TTM_PL_TT);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
index 1e275c7..d314910 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
@@ -44,6 +44,7 @@ struct amdgpu_mman {
struct ttm_bo_devicebdev;
boolmem_global_referenced;
boolinitialized;
+   void __iomem*aper_base_kaddr;
 
 #if defined(CONFIG_DEBUG_FS)
struct dentry   *debugfs_entries[8];
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx