[PATCH] drm/amdgpu/sriov: Add MB_REQ_MSG_READY_TO_RESET response

2021-04-08 Thread jianzh
From: Jiange Zhao 

Add a MB_REQ_MSG_READY_TO_RESET response when the VF gets an FLR
notification. When the guest receives the FLR notification from the
host, it locks the adapter into the reset state; after that there is
no more job submission or hardware access.

It should then send a response to tell the host that it is prepared
for the host-initiated reset.

Signed-off-by: Jiange Zhao 
---
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 1 +
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 48e588d3c409..117d22848ee4 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -277,6 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
return;
 
amdgpu_virt_fini_data_exchange(adev);
+   xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
	atomic_set(&adev->in_gpu_reset, 1);
 
do {
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
index 9f5808616174..73887b0aa1d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
@@ -37,7 +37,8 @@ enum idh_request {
IDH_REQ_GPU_RESET_ACCESS,
IDH_REQ_GPU_INIT_DATA,
 
-   IDH_LOG_VF_ERROR   = 200,
+   IDH_LOG_VF_ERROR= 200,
+   IDH_READY_TO_RESET  = 201,
 };
 
 enum idh_event {
-- 
2.25.1


[PATCH] drm/amdgpu/SRIOV: Extend VF reset request wait period

2020-12-07 Thread jianzh
From: Jiange Zhao 

In the virtualization case, when one VF sends too many
FLR requests, the hypervisor stops responding to that
VF's requests for a long period of time. This is called
the event guard. During this cooling period the guest
driver should wait instead of doing other things; after
this period ends, the guest driver resumes the reset
process and returns to normal.

Currently, the guest driver waits 12 seconds and returns failure
if it doesn't get a response from the host.

Solution: extend this waiting time in the guest driver and poll for
the response periodically. Each poll waits up to 6 seconds, and
polling continues for about 60 seconds in total.
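
As a quick sanity check on the numbers (this snippet is not part of the
patch; the constant values simply mirror the *_MAILBOX_POLL_MSG_TIMEDOUT
and *_MAILBOX_POLL_MSG_REP_MAX definitions the diffs below introduce):

#include <stdio.h>

/* Worst-case wait implied by the new constants: each request polls the
 * mailbox for up to POLL_MSG_TIMEDOUT ms and is retried up to
 * POLL_MSG_REP_MAX times. */
#define POLL_MSG_TIMEDOUT 6000 /* ms */
#define POLL_MSG_REP_MAX  11

int main(void)
{
	/* 11 * 6000 ms = 66000 ms, which covers the ~60 s event-guard window */
	printf("worst-case wait: %d ms\n", POLL_MSG_TIMEDOUT * POLL_MSG_REP_MAX);
	return 0;
}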

v2: change the max repetition times from number to macro.

Signed-off-by: Jiange Zhao 
---
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 ++-
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h |  3 ++-
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 ++-
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h |  1 +
 4 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index f5ce9a9f4cf5..7767ccca526b 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -187,7 +187,16 @@ static int xgpu_ai_send_access_requests(struct amdgpu_device *adev,
 
 static int xgpu_ai_request_reset(struct amdgpu_device *adev)
 {
-   return xgpu_ai_send_access_requests(adev, IDH_REQ_GPU_RESET_ACCESS);
+   int ret, i = 0;
+
+   while (i < AI_MAILBOX_POLL_MSG_REP_MAX) {
+	ret = xgpu_ai_send_access_requests(adev, IDH_REQ_GPU_RESET_ACCESS);
+   if (!ret)
+   break;
+   i++;
+   }
+
+   return ret;
 }
 
 static int xgpu_ai_request_full_gpu_access(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
index 83b453f5d717..50572635d0f8 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
@@ -25,8 +25,9 @@
 #define __MXGPU_AI_H__
 
 #define AI_MAILBOX_POLL_ACK_TIMEDOUT   500
-#define AI_MAILBOX_POLL_MSG_TIMEDOUT   12000
+#define AI_MAILBOX_POLL_MSG_TIMEDOUT   6000
 #define AI_MAILBOX_POLL_FLR_TIMEDOUT   5000
+#define AI_MAILBOX_POLL_MSG_REP_MAX	11
 
 enum idh_request {
IDH_REQ_GPU_INIT_ACCESS = 1,
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 666ed99cc14b..dd5c1e6ce009 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -200,7 +200,16 @@ static int xgpu_nv_send_access_requests(struct amdgpu_device *adev,
 
 static int xgpu_nv_request_reset(struct amdgpu_device *adev)
 {
-   return xgpu_nv_send_access_requests(adev, IDH_REQ_GPU_RESET_ACCESS);
+   int ret, i = 0;
+
+   while (i < NV_MAILBOX_POLL_MSG_REP_MAX) {
+	ret = xgpu_nv_send_access_requests(adev, IDH_REQ_GPU_RESET_ACCESS);
+   if (!ret)
+   break;
+   i++;
+   }
+
+   return ret;
 }
 
 static int xgpu_nv_request_full_gpu_access(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
index 52605e14a1a5..9f5808616174 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
@@ -27,6 +27,7 @@
 #define NV_MAILBOX_POLL_ACK_TIMEDOUT   500
 #define NV_MAILBOX_POLL_MSG_TIMEDOUT   6000
 #define NV_MAILBOX_POLL_FLR_TIMEDOUT   5000
+#define NV_MAILBOX_POLL_MSG_REP_MAX	11
 
 enum idh_request {
IDH_REQ_GPU_INIT_ACCESS = 1,
-- 
2.25.1



[PATCH] drm/amdgpu/SRIOV: Extend VF reset request wait period

2020-11-25 Thread jianzh
From: Jiange Zhao 

In the virtualization case, when one VF sends too many
FLR requests, the hypervisor stops responding to that
VF's requests for a long period of time. This is called
the event guard. During this cooling period the guest
driver should wait instead of doing other things; after
this period ends, the guest driver resumes the reset
process and returns to normal.

Currently, the guest driver waits 12 seconds and returns failure
if it doesn't get a response from the host.

Solution: extend this waiting time in the guest driver and poll for
the response periodically.

Signed-off-by: Jiange Zhao 
---
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 ++-
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h |  2 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 ++-
 3 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index f5ce9a9f4cf5..d8d8c623bb74 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -187,7 +187,16 @@ static int xgpu_ai_send_access_requests(struct amdgpu_device *adev,
 
 static int xgpu_ai_request_reset(struct amdgpu_device *adev)
 {
-   return xgpu_ai_send_access_requests(adev, IDH_REQ_GPU_RESET_ACCESS);
+   int ret, i = 0;
+
+   while (i < 11) {
+	ret = xgpu_ai_send_access_requests(adev, IDH_REQ_GPU_RESET_ACCESS);
+   if (!ret)
+   break;
+   i++;
+   }
+
+   return ret;
 }
 
 static int xgpu_ai_request_full_gpu_access(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
index 83b453f5d717..20ee2142f9ed 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
@@ -25,7 +25,7 @@
 #define __MXGPU_AI_H__
 
 #define AI_MAILBOX_POLL_ACK_TIMEDOUT   500
-#define AI_MAILBOX_POLL_MSG_TIMEDOUT   12000
+#define AI_MAILBOX_POLL_MSG_TIMEDOUT   6000
 #define AI_MAILBOX_POLL_FLR_TIMEDOUT   5000
 
 enum idh_request {
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 666ed99cc14b..0147dfe21a39 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -200,7 +200,16 @@ static int xgpu_nv_send_access_requests(struct amdgpu_device *adev,
 
 static int xgpu_nv_request_reset(struct amdgpu_device *adev)
 {
-   return xgpu_nv_send_access_requests(adev, IDH_REQ_GPU_RESET_ACCESS);
+   int ret, i = 0;
+
+   while (i < 11) {
+	ret = xgpu_nv_send_access_requests(adev, IDH_REQ_GPU_RESET_ACCESS);
+   if (!ret)
+   break;
+   i++;
+   }
+
+   return ret;
 }
 
 static int xgpu_nv_request_full_gpu_access(struct amdgpu_device *adev,
-- 
2.25.1



[PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4

2020-05-14 Thread jianzh
From: Jiange Zhao 

When the GPU hits a timeout, notify any interested party so it has an
opportunity to dump info before the actual GPU reset.

A usermode app opens the 'autodump' node under debugfs and poll()s it
for readable/writable. When a GPU reset is due, amdgpu notifies the
usermode app through a wait_queue_head and gives it 10 minutes to dump
info.

After the usermode app has done its work, it closes the 'autodump'
node. On node closure, amdgpu learns that the dump is done through the
completion that is triggered in release().

There is no write or read callback because the necessary info can be
obtained through dmesg and umr. Messages back and forth between the
usermode app and amdgpu are unnecessary.
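
For illustration, a minimal sketch of the usermode side described above
(the debugfs path assumes card 0 and the v3+ file name amdgpu_autodump;
the dump command itself is only a placeholder):

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	/* Path is an assumption: debugfs mounted at /sys/kernel/debug, card 0 */
	int fd = open("/sys/kernel/debug/dri/0/amdgpu_autodump", O_RDONLY);
	if (fd < 0) {
		perror("open amdgpu_autodump");
		return 1;
	}

	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLRDNORM };

	/* Blocks until amdgpu signals that a GPU reset is pending */
	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
		/* Placeholder for the actual dump step (e.g. run umr, save dmesg) */
		system("dmesg > /tmp/amdgpu_hang_dmesg.txt");
	}

	/* close() triggers release(), which completes the dump handshake and
	 * lets amdgpu proceed with the GPU reset */
	close(fd);
	return 0;
}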

v2: (1) changed 'registered' to 'app_listening'
(2) add a mutex in open() to prevent race condition

v3 (chk): grab the reset lock to avoid race in autodump_open,
  rename debugfs file to amdgpu_autodump,
  provide autodump_read as well,
  style and code cleanups

v4: add 'bool app_listening' to differentiate situations, so that
the node can be reopened; also, there is no need to wait for
completion when no app is waiting for a dump.

v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
add 'app_state_mutex' for race conditions:
(1)Only 1 user can open this file node
(2)wait_dump() can only take effect after poll() executed.
(3)eliminated the race condition between release() and
   wait_dump()

v6: removed 'enum amdgpu_autodump_state' and 'app_state_mutex'
removed state checking in amdgpu_debugfs_wait_dump
Improve on top of version 3 so that the node can be reopened.

v7: move reinit_completion into open() so that only one user
can open it.

v8: remove complete_all() from amdgpu_debugfs_wait_dump().

Signed-off-by: Jiange Zhao 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 78 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  6 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
 4 files changed, 87 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 2a806cb55b78..9e8eeddfe7ce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -992,6 +992,8 @@ struct amdgpu_device {
charproduct_number[16];
charproduct_name[32];
charserial[16];
+
+   struct amdgpu_autodump  autodump;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 1a4894fa3693..d33cb344be69 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -27,7 +27,7 @@
 #include 
 #include 
 #include 
-
+#include 
 #include 
 
 #include "amdgpu.h"
@@ -74,8 +74,82 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
return 0;
 }
 
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
+{
+#if defined(CONFIG_DEBUG_FS)
+   unsigned long timeout = 600 * HZ;
+   int ret;
+
+	wake_up_interruptible(&adev->autodump.gpu_hang);
+
+	ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
+   if (ret == 0) {
+   pr_err("autodump: timeout, move on to gpu recovery\n");
+   return -ETIMEDOUT;
+   }
+#endif
+   return 0;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
+static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
+{
+   struct amdgpu_device *adev = inode->i_private;
+   int ret;
+
+   file->private_data = adev;
+
+	mutex_lock(&adev->lock_reset);
+	if (adev->autodump.dumping.done) {
+		reinit_completion(&adev->autodump.dumping);
+		ret = 0;
+	} else {
+		ret = -EBUSY;
+	}
+	mutex_unlock(&adev->lock_reset);
+
+   return ret;
+}
+
+static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file)
+{
+   struct amdgpu_device *adev = file->private_data;
+
+	complete_all(&adev->autodump.dumping);
+   return 0;
+}
+
+static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table)
+{
+   struct amdgpu_device *adev = file->private_data;
+
+	poll_wait(file, &adev->autodump.gpu_hang, poll_table);
+
+   if (adev->in_gpu_reset)
+   return POLLIN | POLLRDNORM | POLLWRNORM;
+
+   return 0;
+}
+
+static const struct file_operations autodump_debug_fops = {
+   .owner = THIS_MODULE,
+   .open = amdgpu_debugfs_autodump_open,
+   .poll = amdgpu_debugfs_autodump_poll,
+   .release = amdgpu_debugfs_autodump_release,
+};
+
+static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
+{
+   

[PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4

2020-05-13 Thread jianzh
From: Jiange Zhao 

When the GPU hits a timeout, notify any interested party so it has an
opportunity to dump info before the actual GPU reset.

A usermode app opens the 'autodump' node under debugfs and poll()s it
for readable/writable. When a GPU reset is due, amdgpu notifies the
usermode app through a wait_queue_head and gives it 10 minutes to dump
info.

After the usermode app has done its work, it closes the 'autodump'
node. On node closure, amdgpu learns that the dump is done through the
completion that is triggered in release().

There is no write or read callback because the necessary info can be
obtained through dmesg and umr. Messages back and forth between the
usermode app and amdgpu are unnecessary.

v2: (1) changed 'registered' to 'app_listening'
(2) add a mutex in open() to prevent race condition

v3 (chk): grab the reset lock to avoid race in autodump_open,
  rename debugfs file to amdgpu_autodump,
  provide autodump_read as well,
  style and code cleanups

v4: add 'bool app_listening' to differentiate situations, so that
the node can be reopened; also, there is no need to wait for
completion when no app is waiting for a dump.

v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
add 'app_state_mutex' for race conditions:
(1)Only 1 user can open this file node
(2)wait_dump() can only take effect after poll() executed.
(3)eliminated the race condition between release() and
   wait_dump()

v6: removed 'enum amdgpu_autodump_state' and 'app_state_mutex'
removed state checking in amdgpu_debugfs_wait_dump
Improve on top of version 3 so that the node can be reopened.

v7: move reinit_completion into open() so that only one user
can open it.

Signed-off-by: Jiange Zhao 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 79 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  6 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
 4 files changed, 88 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 2a806cb55b78..9e8eeddfe7ce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -992,6 +992,8 @@ struct amdgpu_device {
charproduct_number[16];
charproduct_name[32];
charserial[16];
+
+   struct amdgpu_autodump  autodump;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 1a4894fa3693..efee3f1adecf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -27,7 +27,7 @@
 #include 
 #include 
 #include 
-
+#include 
 #include 
 
 #include "amdgpu.h"
@@ -74,8 +74,83 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
return 0;
 }
 
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
+{
+#if defined(CONFIG_DEBUG_FS)
+   unsigned long timeout = 600 * HZ;
+   int ret;
+
+	wake_up_interruptible(&adev->autodump.gpu_hang);
+
+	ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
+	complete_all(&adev->autodump.dumping);
+   if (ret == 0) {
+   pr_err("autodump: timeout, move on to gpu recovery\n");
+   return -ETIMEDOUT;
+   }
+#endif
+   return 0;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
+static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
+{
+   struct amdgpu_device *adev = inode->i_private;
+   int ret;
+
+   file->private_data = adev;
+
+	mutex_lock(&adev->lock_reset);
+	if (adev->autodump.dumping.done) {
+		reinit_completion(&adev->autodump.dumping);
+		ret = 0;
+	} else {
+		ret = -EBUSY;
+	}
+	mutex_unlock(&adev->lock_reset);
+
+   return ret;
+}
+
+static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file 
*file)
+{
+   struct amdgpu_device *adev = file->private_data;
+
+	complete_all(&adev->autodump.dumping);
+   return 0;
+}
+
+static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct 
poll_table_struct *poll_table)
+{
+   struct amdgpu_device *adev = file->private_data;
+
+	poll_wait(file, &adev->autodump.gpu_hang, poll_table);
+
+   if (adev->in_gpu_reset)
+   return POLLIN | POLLRDNORM | POLLWRNORM;
+
+   return 0;
+}
+
+static const struct file_operations autodump_debug_fops = {
+   .owner = THIS_MODULE,
+   .open = amdgpu_debugfs_autodump_open,
+   .poll = amdgpu_debugfs_autodump_poll,
+   .release = amdgpu_debugfs_autodump_release,
+};
+
+static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
+{
+	init_completion(&adev->autodump.dumping);
+   

[PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4

2020-05-09 Thread jianzh
From: Jiange Zhao 

When the GPU hits a timeout, notify any interested party so it has an
opportunity to dump info before the actual GPU reset.

A usermode app opens the 'autodump' node under debugfs and poll()s it
for readable/writable. When a GPU reset is due, amdgpu notifies the
usermode app through a wait_queue_head and gives it 10 minutes to dump
info.

After the usermode app has done its work, it closes the 'autodump'
node. On node closure, amdgpu learns that the dump is done through the
completion that is triggered in release().

There is no write or read callback because the necessary info can be
obtained through dmesg and umr. Messages back and forth between the
usermode app and amdgpu are unnecessary.

v2: (1) changed 'registered' to 'app_listening'
(2) add a mutex in open() to prevent race condition

v3 (chk): grab the reset lock to avoid race in autodump_open,
  rename debugfs file to amdgpu_autodump,
  provide autodump_read as well,
  style and code cleanups

v4: add 'bool app_listening' to differentiate situations, so that
the node can be reopened; also, there is no need to wait for
completion when no app is waiting for a dump.

v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
add 'app_state_mutex' for race conditions:
(1)Only 1 user can open this file node
(2)wait_dump() can only take effect after poll() executed.
(3)eliminated the race condition between release() and
   wait_dump()

v6: removed 'enum amdgpu_autodump_state' and 'app_state_mutex'
removed state checking in amdgpu_debugfs_wait_dump
Improve on top of version 3 so that the node can be reopened.

Signed-off-by: Jiange Zhao 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 78 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  6 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
 4 files changed, 87 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 2a806cb55b78..9e8eeddfe7ce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -992,6 +992,8 @@ struct amdgpu_device {
charproduct_number[16];
charproduct_name[32];
charserial[16];
+
+   struct amdgpu_autodump  autodump;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 1a4894fa3693..261b67ece7fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -27,7 +27,7 @@
 #include 
 #include 
 #include 
-
+#include 
 #include 
 
 #include "amdgpu.h"
@@ -74,8 +74,82 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
return 0;
 }
 
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
+{
+#if defined(CONFIG_DEBUG_FS)
+   unsigned long timeout = 600 * HZ;
+   int ret;
+
+	wake_up_interruptible(&adev->autodump.gpu_hang);
+
+	ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
+	complete_all(&adev->autodump.dumping);
+   if (ret == 0) {
+   pr_err("autodump: timeout, move on to gpu recovery\n");
+   return -ETIMEDOUT;
+   }
+#endif
+   return 0;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
+static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
+{
+   struct amdgpu_device *adev = inode->i_private;
+   int ret;
+
+   file->private_data = adev;
+
+	mutex_lock(&adev->lock_reset);
+	if (adev->autodump.dumping.done)
+		ret = 0;
+	else
+		ret = -EBUSY;
+	mutex_unlock(&adev->lock_reset);
+
+   return ret;
+}
+
+static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file 
*file)
+{
+   struct amdgpu_device *adev = file->private_data;
+
+	complete_all(&adev->autodump.dumping);
+   return 0;
+}
+
+static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct 
poll_table_struct *poll_table)
+{
+   struct amdgpu_device *adev = file->private_data;
+
+	reinit_completion(&adev->autodump.dumping);
+	poll_wait(file, &adev->autodump.gpu_hang, poll_table);
+
+   if (adev->in_gpu_reset)
+   return POLLIN | POLLRDNORM | POLLWRNORM;
+
+   return 0;
+}
+
+static const struct file_operations autodump_debug_fops = {
+   .owner = THIS_MODULE,
+   .open = amdgpu_debugfs_autodump_open,
+   .poll = amdgpu_debugfs_autodump_poll,
+   .release = amdgpu_debugfs_autodump_release,
+};
+
+static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
+{
+	init_completion(&adev->autodump.dumping);
+	complete_all(&adev->autodump.dumping);
+	init_waitqueue_head(&adev->autodump.gpu_hang);
+
+   

[PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4

2020-04-28 Thread jianzh
From: Jiange Zhao 

When the GPU hits a timeout, notify any interested party so it has an
opportunity to dump info before the actual GPU reset.

A usermode app opens the 'autodump' node under debugfs and poll()s it
for readable/writable. When a GPU reset is due, amdgpu notifies the
usermode app through a wait_queue_head and gives it 10 minutes to dump
info.

After the usermode app has done its work, it closes the 'autodump'
node. On node closure, amdgpu learns that the dump is done through the
completion that is triggered in release().

There is no write or read callback because the necessary info can be
obtained through dmesg and umr. Messages back and forth between the
usermode app and amdgpu are unnecessary.

v2: (1) changed 'registered' to 'app_listening'
(2) add a mutex in open() to prevent race condition

v3 (chk): grab the reset lock to avoid race in autodump_open,
  rename debugfs file to amdgpu_autodump,
  provide autodump_read as well,
  style and code cleanups

v4: add 'bool app_listening' to differentiate situations, so that
the node can be reopened; also, there is no need to wait for
completion when no app is waiting for a dump.

v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'
add 'app_state_mutex' for race conditions:
(1)Only 1 user can open this file node
(2)wait_dump() can only take effect after poll() executed.
(3)eliminated the race condition between release() and
   wait_dump()

Signed-off-by: Jiange Zhao 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 92 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h | 14 
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
 4 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index bc1e0fd71a09..6f8ef98c4b97 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -990,6 +990,8 @@ struct amdgpu_device {
charproduct_number[16];
charproduct_name[32];
charserial[16];
+
+   struct amdgpu_autodump  autodump;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 1a4894fa3693..1d4a95e8ad5b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -27,7 +27,7 @@
 #include 
 #include 
 #include 
-
+#include 
 #include 
 
 #include "amdgpu.h"
@@ -74,8 +74,96 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
return 0;
 }
 
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
+{
+#if defined(CONFIG_DEBUG_FS)
+   unsigned long timeout = 600 * HZ;
+   int ret;
+
+	mutex_lock(&adev->autodump.app_state_mutex);
+	if (adev->autodump.app_state != AMDGPU_AUTODUMP_LISTENING) {
+		mutex_unlock(&adev->autodump.app_state_mutex);
+		return 0;
+	}
+	mutex_unlock(&adev->autodump.app_state_mutex);
+
+	wake_up_interruptible(&adev->autodump.gpu_hang);
+
+	ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
+   if (ret == 0) {
+   pr_err("autodump: timeout, move on to gpu recovery\n");
+   return -ETIMEDOUT;
+   }
+#endif
+   return 0;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
+static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
+{
+   struct amdgpu_device *adev = inode->i_private;
+   int ret;
+
+   file->private_data = adev;
+
+	mutex_lock(&adev->autodump.app_state_mutex);
+	if (adev->autodump.app_state == AMDGPU_AUTODUMP_NO_APP) {
+		adev->autodump.app_state = AMDGPU_AUTODUMP_REGISTERED;
+		ret = 0;
+	} else {
+		ret = -EBUSY;
+	}
+	mutex_unlock(&adev->autodump.app_state_mutex);
+
+   return ret;
+}
+
+static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file 
*file)
+{
+   struct amdgpu_device *adev = file->private_data;
+
+	mutex_lock(&adev->autodump.app_state_mutex);
+	complete(&adev->autodump.dumping);
+	adev->autodump.app_state = AMDGPU_AUTODUMP_NO_APP;
+	mutex_unlock(&adev->autodump.app_state_mutex);
+   return 0;
+}
+
+static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct 
poll_table_struct *poll_table)
+{
+   struct amdgpu_device *adev = file->private_data;
+
+	mutex_lock(&adev->autodump.app_state_mutex);
+	poll_wait(file, &adev->autodump.gpu_hang, poll_table);
+	adev->autodump.app_state = AMDGPU_AUTODUMP_LISTENING;
+	mutex_unlock(&adev->autodump.app_state_mutex);
+
+   if (adev->in_gpu_reset)
+   return POLLIN | POLLRDNORM | POLLWRNORM;
+
+   return 0;
+}
+
+static const struct 

[PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4

2020-04-26 Thread jianzh
From: Jiange Zhao 

When the GPU hits a timeout, notify any interested party so it has an
opportunity to dump info before the actual GPU reset.

A usermode app opens the 'autodump' node under debugfs and poll()s it
for readable/writable. When a GPU reset is due, amdgpu notifies the
usermode app through a wait_queue_head and gives it 10 minutes to dump
info.

After the usermode app has done its work, it closes the 'autodump'
node. On node closure, amdgpu learns that the dump is done through the
completion that is triggered in release().

There is no write or read callback because the necessary info can be
obtained through dmesg and umr. Messages back and forth between the
usermode app and amdgpu are unnecessary.

v2: (1) changed 'registered' to 'app_listening'
(2) add a mutex in open() to prevent race condition

v3 (chk): grab the reset lock to avoid race in autodump_open,
  rename debugfs file to amdgpu_autodump,
  provide autodump_read as well,
  style and code cleanups

v4: add 'bool app_listening' to differentiate situations, so that
the node can be reopened; also, there is no need to wait for
completion when no app is waiting for a dump.

Signed-off-by: Jiange Zhao 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 82 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  7 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
 4 files changed, 92 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index bc1e0fd71a09..6f8ef98c4b97 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -990,6 +990,8 @@ struct amdgpu_device {
charproduct_number[16];
charproduct_name[32];
charserial[16];
+
+   struct amdgpu_autodump  autodump;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 1a4894fa3693..04720264e8b9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -27,7 +27,7 @@
 #include 
 #include 
 #include 
-
+#include 
 #include 
 
 #include "amdgpu.h"
@@ -74,7 +74,85 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
return 0;
 }
 
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
+{
 #if defined(CONFIG_DEBUG_FS)
+   unsigned long timeout = 600 * HZ;
+   int ret;
+
+   if (!adev->autodump.app_listening)
+   return 0;
+
+	wake_up_interruptible(&adev->autodump.gpu_hang);
+
+	ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
+   if (ret == 0) {
+   pr_err("autodump: timeout, move on to gpu recovery\n");
+   return -ETIMEDOUT;
+   }
+#endif
+   return 0;
+}
+
+#if defined(CONFIG_DEBUG_FS)
+
+static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
+{
+   struct amdgpu_device *adev = inode->i_private;
+   int ret;
+
+   file->private_data = adev;
+
+	mutex_lock(&adev->lock_reset);
+	if (!adev->autodump.app_listening) {
+		adev->autodump.app_listening = true;
+		ret = 0;
+	} else {
+		ret = -EBUSY;
+	}
+	mutex_unlock(&adev->lock_reset);
+
+   return ret;
+}
+
+static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file 
*file)
+{
+   struct amdgpu_device *adev = file->private_data;
+
+	complete(&adev->autodump.dumping);
+   adev->autodump.app_listening = false;
+   return 0;
+}
+
+static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct 
poll_table_struct *poll_table)
+{
+   struct amdgpu_device *adev = file->private_data;
+
+	poll_wait(file, &adev->autodump.gpu_hang, poll_table);
+
+   if (adev->in_gpu_reset)
+   return POLLIN | POLLRDNORM | POLLWRNORM;
+
+   return 0;
+}
+
+static const struct file_operations autodump_debug_fops = {
+   .owner = THIS_MODULE,
+   .open = amdgpu_debugfs_autodump_open,
+   .poll = amdgpu_debugfs_autodump_poll,
+   .release = amdgpu_debugfs_autodump_release,
+};
+
+static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
+{
+	init_completion(&adev->autodump.dumping);
+	init_waitqueue_head(&adev->autodump.gpu_hang);
+	adev->autodump.app_listening = false;
+
+	debugfs_create_file("amdgpu_autodump", 0600,
+				adev->ddev->primary->debugfs_root,
+				adev, &autodump_debug_fops);
+}
 
 /**
  * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
@@ -1434,6 +1512,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
 
amdgpu_ras_debugfs_create_all(adev);
 
+   amdgpu_debugfs_autodump_init(adev);
+
return 

[PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset (v2)

2020-04-24 Thread jianzh
From: Jiange Zhao 

When the GPU hits a timeout, notify any interested party so it has an
opportunity to dump info before the actual GPU reset.

A usermode app opens the 'autodump' node under debugfs and poll()s it
for readable/writable. When a GPU reset is due, amdgpu notifies the
usermode app through a wait_queue_head and gives it 10 minutes to dump
info.

After the usermode app has done its work, it closes the 'autodump'
node. On node closure, amdgpu learns that the dump is done through the
completion that is triggered in release().

There is no write or read callback because the necessary info can be
obtained through dmesg and umr. Messages back and forth between the
usermode app and amdgpu are unnecessary.

v2: (1) changed 'registered' to 'app_listening'
(2) add a mutex in open() to prevent race condition

Signed-off-by: Jiange Zhao 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h | 10 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 92 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
 4 files changed, 104 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index bc1e0fd71a09..34b8ce9fba47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -724,6 +724,14 @@ struct amd_powerplay {
const struct amd_pm_funcs *pp_funcs;
 };
 
+struct amdgpu_autodump {
+   boolapp_listening;
+   struct completion   completed;
+   struct dentry   *dentry;
+   struct wait_queue_head  gpu_hang_wait;
+   struct mutexmutex;
+};
+
 #define AMDGPU_RESET_MAGIC_NUM 64
 #define AMDGPU_MAX_DF_PERFMONS 4
 struct amdgpu_device {
@@ -990,6 +998,8 @@ struct amdgpu_device {
charproduct_number[16];
charproduct_name[32];
charserial[16];
+
+   struct amdgpu_autodump  autodump;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 1a4894fa3693..693bfcaad312 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -27,7 +27,7 @@
 #include 
 #include 
 #include 
-
+#include 
 #include 
 
 #include "amdgpu.h"
@@ -74,8 +74,96 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
return 0;
 }
 
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
+{
+#if defined(CONFIG_DEBUG_FS)
+   unsigned long tmo = 600*HZ;
+   int ret;
+
+   if (!adev->autodump.app_listening)
+   return 0;
+
+	wake_up_interruptible(&adev->autodump.gpu_hang_wait);
+
+	ret = wait_for_completion_interruptible_timeout(&adev->autodump.completed, tmo);
+   if (ret == 0) {
+   pr_err("autodump: timeout, move on to gpu recovery\n");
+   return -ETIMEDOUT;
+   }
+#endif
+   return 0;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
+static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
+{
+   int ret = 0;
+   struct amdgpu_device *adev;
+
+   ret = simple_open(inode, file);
+   if (ret)
+   return ret;
+
+   adev = file->private_data;
+
+	mutex_lock(&adev->autodump.mutex);
+	if (adev->autodump.app_listening == true) {
+		ret = -EBUSY;
+	} else {
+		adev->autodump.app_listening = true;
+	}
+	mutex_unlock(&adev->autodump.mutex);
+
+   return ret;
+}
+
+static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file 
*file)
+{
+   struct amdgpu_device *adev = file->private_data;
+
+	complete(&adev->autodump.completed);
+   adev->autodump.app_listening = false;
+
+   return 0;
+}
+
+unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct 
poll_table_struct *poll_table)
+{
+   struct amdgpu_device *adev = file->private_data;
+
+	poll_wait(file, &adev->autodump.gpu_hang_wait, poll_table);
+
+   if (adev->in_gpu_reset)
+   return POLLIN | POLLRDNORM | POLLWRNORM;
+
+   return 0;
+}
+
+static const struct file_operations autodump_debug_fops = {
+   .owner = THIS_MODULE,
+   .open = amdgpu_debugfs_autodump_open,
+   .poll = amdgpu_debugfs_autodump_poll,
+   .release = amdgpu_debugfs_autodump_release,
+};
+
+static int amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
+{
+   struct dentry *entry;
+
+	init_completion(&adev->autodump.completed);
+	init_waitqueue_head(&adev->autodump.gpu_hang_wait);
+	mutex_init(&adev->autodump.mutex);
+	adev->autodump.app_listening = false;
+
+	entry = debugfs_create_file("autodump", 0600,
+				adev->ddev->primary->debugfs_root,
+				adev, &autodump_debug_fops);
+   

[PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset

2020-04-23 Thread jianzh
From: Jiange Zhao 

When the GPU hits a timeout, notify any interested party so it has an
opportunity to dump info before the actual GPU reset.

A usermode app opens the 'autodump' node under debugfs and poll()s it
for readable/writable. When a GPU reset is due, amdgpu notifies the
usermode app through a wait_queue_head and gives it 10 minutes to dump
info.

After the usermode app has done its work, it closes the 'autodump'
node. On node closure, amdgpu learns that the dump is done through the
completion that is triggered in release().

There is no write or read callback because the necessary info can be
obtained through dmesg and umr. Messages back and forth between the
usermode app and amdgpu are unnecessary.

Signed-off-by: Jiange Zhao 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  9 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 85 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
 4 files changed, 97 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index bc1e0fd71a09..a505b547f242 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -724,6 +724,13 @@ struct amd_powerplay {
const struct amd_pm_funcs *pp_funcs;
 };
 
+struct amdgpu_autodump {
+   boolregistered;
+   struct completion   completed;
+   struct dentry   *dentry;
+   struct wait_queue_head  gpu_hang_wait;
+};
+
 #define AMDGPU_RESET_MAGIC_NUM 64
 #define AMDGPU_MAX_DF_PERFMONS 4
 struct amdgpu_device {
@@ -990,6 +997,8 @@ struct amdgpu_device {
charproduct_number[16];
charproduct_name[32];
charserial[16];
+
+   struct amdgpu_autodump  autodump;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 1a4894fa3693..cdd4bf00adee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -74,8 +74,91 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
return 0;
 }
 
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
+{
+#if defined(CONFIG_DEBUG_FS)
+   int ret;
+   unsigned long tmo = 600*HZ;
+
+   if (!adev->autodump.registered)
+   return 0;
+
+	wake_up_interruptible(&adev->autodump.gpu_hang_wait);
+
+	ret = wait_for_completion_interruptible_timeout(&adev->autodump.completed, tmo);
+	if (ret == 0) { /* timed out and the dump tool still hasn't finished its dump */
+		pr_err("autodump: timeout before dump finished, move on to gpu recovery\n");
+   return -ETIMEDOUT;
+   }
+#endif
+   return 0;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
+static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
+{
+   int ret;
+   struct amdgpu_device *adev;
+
+   ret = simple_open(inode, file);
+   if (ret)
+   return ret;
+
+   adev = file->private_data;
+   if (adev->autodump.registered == true)
+   return -EINVAL;
+
+   adev->autodump.registered = true;
+
+   return 0;
+}
+
+static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file 
*file)
+{
+   struct amdgpu_device *adev = file->private_data;
+
+	complete(&adev->autodump.completed);
+   adev->autodump.registered = false;
+
+   return 0;
+}
+
+unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct 
poll_table_struct *poll_table)
+{
+   struct amdgpu_device *adev = file->private_data;
+
+	poll_wait(file, &adev->autodump.gpu_hang_wait, poll_table);
+
+   if (adev->in_gpu_reset)
+   return POLLIN | POLLRDNORM | POLLWRNORM;
+
+   return 0;
+}
+
+static const struct file_operations autodump_debug_fops = {
+   .owner = THIS_MODULE,
+   .open = amdgpu_debugfs_autodump_open,
+   .poll = amdgpu_debugfs_autodump_poll,
+   .release = amdgpu_debugfs_autodump_release,
+};
+
+static int amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
+{
+   struct dentry *entry;
+
+	init_completion(&adev->autodump.completed);
+	init_waitqueue_head(&adev->autodump.gpu_hang_wait);
+	adev->autodump.registered = false;
+
+	entry = debugfs_create_file("autodump", 0600,
+				adev->ddev->primary->debugfs_root,
+				adev, &autodump_debug_fops);
+   adev->autodump.dentry = entry;
+
+   return 0;
+}
+
 /**
  * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
  *
@@ -1434,6 +1517,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
 
amdgpu_ras_debugfs_create_all(adev);
 
+   amdgpu_debugfs_autodump_init(adev);
+
return amdgpu_debugfs_add_files(adev, 

[PATCH] drm/amdgpu/sriov: Use VF-accessible register for gpu_clock_count

2020-03-03 Thread jianzh
The Navi12 VK CTS subtest timestamp.calibrated.dev_domain_test failed
because the mmRLC_CAPTURE_GPU_CLOCK_COUNT register cannot be
written from a VF due to security policy.

Solution: use the VF-accessible timestamp register pair
mmGOLDEN_TSC_COUNT_LOWER/UPPER for the SRIOV case.

v2: following Alexander Deucher's advice, switch to
mmGOLDEN_TSC_COUNT_LOWER/UPPER for both bare metal and SRIOV.
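
For reference, a minimal sketch of how the two 32-bit halves combine into
the 64-bit timestamp (the helper name is illustrative; the patch does this
inline in gfx_v10_0_get_gpu_clock_counter):

/* Illustrative only: read the VF-accessible SMUIO counter pair and
 * assemble the 64-bit value, mirroring the hunk below. */
static uint64_t golden_tsc_read(struct amdgpu_device *adev)
{
	uint64_t lo = RREG32_SOC15(SMUIO, 0, mmGOLDEN_TSC_COUNT_LOWER);
	uint64_t hi = RREG32_SOC15(SMUIO, 0, mmGOLDEN_TSC_COUNT_UPPER);

	return (hi << 32) | lo;
}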

Signed-off-by: jianzh 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 03655c3..22a07ad 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -35,6 +35,8 @@
 
 #include "gc/gc_10_1_0_offset.h"
 #include "gc/gc_10_1_0_sh_mask.h"
+#include "smuio/smuio_11_0_0_offset.h"
+#include "smuio/smuio_11_0_0_sh_mask.h"
 #include "navi10_enum.h"
 #include "hdp/hdp_5_0_0_offset.h"
 #include "ivsrcid/gfx/irqsrcs_gfx_10_1.h"
@@ -3925,9 +3927,8 @@ static uint64_t gfx_v10_0_get_gpu_clock_counter(struct amdgpu_device *adev)
 
amdgpu_gfx_off_ctrl(adev, false);
mutex_lock(>gfx.gpu_clock_mutex);
-	WREG32_SOC15(GC, 0, mmRLC_CAPTURE_GPU_CLOCK_COUNT, 1);
-	clock = (uint64_t)RREG32_SOC15(GC, 0, mmRLC_GPU_CLOCK_COUNT_LSB) |
-		((uint64_t)RREG32_SOC15(GC, 0, mmRLC_GPU_CLOCK_COUNT_MSB) << 32ULL);
+	clock = (uint64_t)RREG32_SOC15(SMUIO, 0, mmGOLDEN_TSC_COUNT_LOWER) |
+		((uint64_t)RREG32_SOC15(SMUIO, 0, mmGOLDEN_TSC_COUNT_UPPER) << 32ULL);
mutex_unlock(>gfx.gpu_clock_mutex);
amdgpu_gfx_off_ctrl(adev, true);
return clock;
-- 
2.7.4



[PATCH] drm/amdgpu/sriov: Use VF-accessible register for gpu_clock_count

2020-02-27 Thread jianzh
The Navi12 VK CTS subtest timestamp.calibrated.dev_domain_test failed
because the mmRLC_CAPTURE_GPU_CLOCK_COUNT register cannot be
written from a VF due to security policy.

Solution: use the VF-accessible timestamp register pair
mmGOLDEN_TSC_COUNT_LOWER/UPPER for the SRIOV case.

Signed-off-by: jianzh 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 44f00ec..8787a46 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -35,6 +35,8 @@
 
 #include "gc/gc_10_1_0_offset.h"
 #include "gc/gc_10_1_0_sh_mask.h"
+#include "smuio/smuio_11_0_0_offset.h"
+#include "smuio/smuio_11_0_0_sh_mask.h"
 #include "navi10_enum.h"
 #include "hdp/hdp_5_0_0_offset.h"
 #include "ivsrcid/gfx/irqsrcs_gfx_10_1.h"
@@ -3920,9 +3922,14 @@ static uint64_t gfx_v10_0_get_gpu_clock_counter(struct amdgpu_device *adev)
 
amdgpu_gfx_off_ctrl(adev, false);
mutex_lock(>gfx.gpu_clock_mutex);
-	WREG32_SOC15(GC, 0, mmRLC_CAPTURE_GPU_CLOCK_COUNT, 1);
-	clock = (uint64_t)RREG32_SOC15(GC, 0, mmRLC_GPU_CLOCK_COUNT_LSB) |
-		((uint64_t)RREG32_SOC15(GC, 0, mmRLC_GPU_CLOCK_COUNT_MSB) << 32ULL);
+	if (!amdgpu_sriov_vf(adev)) {
+		WREG32_SOC15(GC, 0, mmRLC_CAPTURE_GPU_CLOCK_COUNT, 1);
+		clock = (uint64_t)RREG32_SOC15(GC, 0, mmRLC_GPU_CLOCK_COUNT_LSB) |
+			((uint64_t)RREG32_SOC15(GC, 0, mmRLC_GPU_CLOCK_COUNT_MSB) << 32ULL);
+	} else {
+		clock = (uint64_t)RREG32_SOC15(SMUIO, 0, mmGOLDEN_TSC_COUNT_LOWER) |
+			((uint64_t)RREG32_SOC15(SMUIO, 0, mmGOLDEN_TSC_COUNT_UPPER) << 32ULL);
+	}
mutex_unlock(>gfx.gpu_clock_mutex);
amdgpu_gfx_off_ctrl(adev, true);
return clock;
-- 
2.7.4



[PATCH] drm/amdgpu/SRIOV: Only reset hw.status for target IP

2019-10-29 Thread jianzh
From: Jiange Zhao 

In the old way, when doing IH hw_init, the PSP, nv_common
and GMC hw.status flags would be reset to false, even though
their hw_init had already been done. In the next step, fw_loading,
PSP would do hw_init again.

In the new way, reset hw.status to false only for the target
IP in the list. This way, PSP does hw_init only once.

Signed-off-by: Jiange Zhao 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 4eee40b9d0b0..ad6d2452fed9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2352,11 +2352,11 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
for (j = 0; j < adev->num_ip_blocks; j++) {
		block = &adev->ip_blocks[j];
 
-   block->status.hw = false;
if (block->version->type != ip_order[i] ||
!block->status.valid)
continue;
 
+   block->status.hw = false;
r = block->version->funcs->hw_init(adev);
DRM_INFO("RE-INIT-early: %s %s\n", 
block->version->funcs->name, r?"failed":"succeeded");
if (r)
-- 
2.20.1


[PATCH] drm/amdgpu/SRIOV: Reorganize hw.status for SRIOV re-init

2019-10-28 Thread jianzh
From: Jiange Zhao 

In amdgpu_device_ip_reinit_early_sriov, after IH hw_init,
only IH's hw.status is true. The other three IPs' hw.status
flags are re-set to false, even though they have already done
hw_init.

The new way is to do hw_init for each IP in the list,
regardless of hw.status, and to set hw.status only after
hw_init is done.

Signed-off-by: Jiange Zhao 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 676cad15239f..dcce498e84e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2352,7 +2352,6 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
for (j = 0; j < adev->num_ip_blocks; j++) {
		block = &adev->ip_blocks[j];
 
-   block->status.hw = false;
if (block->version->type != ip_order[i] ||
!block->status.valid)
continue;
@@ -2389,8 +2388,7 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
		block = &adev->ip_blocks[j];
 
if (block->version->type != ip_order[i] ||
-   !block->status.valid ||
-   block->status.hw)
+   !block->status.valid)
continue;
 
r = block->version->funcs->hw_init(adev);
-- 
2.20.1


[PATCH] drm/amdgpu/SRIOV: SRIOV VF doesn't support BACO

2019-10-28 Thread jianzh
From: Jiange Zhao 

SRIOV VF doesn't support BACO.

Only PF with BACO capability can do it.

Signed-off-by: Jiange Zhao 
---
 drivers/gpu/drm/amd/amdgpu/nv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c
index 22ab1955b923..a55a2e83fb19 100644
--- a/drivers/gpu/drm/amd/amdgpu/nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/nv.c
@@ -299,7 +299,7 @@ nv_asic_reset_method(struct amdgpu_device *adev)
 {
	struct smu_context *smu = &adev->smu;
 
-   if (smu_baco_is_support(smu))
+   if (!amdgpu_sriov_vf(adev) && smu_baco_is_support(smu))
return AMD_RESET_METHOD_BACO;
else
return AMD_RESET_METHOD_MODE1;
-- 
2.20.1


[PATCH] drm/amdgpu/SRIOV: add navi12 pci id for SRIOV

2019-09-18 Thread jianzh
From: Jiange Zhao 

Add Navi12 PCI id support.

Signed-off-by: Jiange Zhao 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 420888e941df..b52c7255e5e4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -1034,6 +1034,7 @@ static const struct pci_device_id pciidlist[] = {
 
/* Navi12 */
{0x1002, 0x7360, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI12},
+   {0x1002, 0x7362, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI12},
 
{0, 0, 0}
 };
-- 
2.20.1


[PATCH] drm/amdgpu/SRIOV: add navi12 pci id for SRIOV

2019-09-17 Thread jianzh
From: Jiange Zhao 

Add Navi12 PCI id support.

Signed-off-by: Jiange Zhao 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 420888e941df..b52c7255e5e4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -1034,6 +1034,7 @@ static const struct pci_device_id pciidlist[] = {
 
/* Navi12 */
{0x1002, 0x7360, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI12},
+   {0x1002, 0x7362, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI12},
 
{0, 0, 0}
 };
-- 
2.20.1


[PATCH] drm/amdgpu: Navi12 SRIOV VF doesn't load TOC

2019-09-11 Thread jianzh
From: Jiange Zhao 

In the SRIOV case, the autoload sequence is the same as on bare metal,
except the VF won't load the TOC.

Signed-off-by: Jiange Zhao 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index f90a0cd12827..762c97ce8251 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -253,7 +253,8 @@ static int psp_tmr_init(struct psp_context *psp)
 
/* For ASICs support RLC autoload, psp will parse the toc
 * and calculate the total size of TMR needed */
-   if (psp->toc_start_addr &&
+   if (!amdgpu_sriov_vf(psp->adev) &&
+   psp->toc_start_addr &&
psp->toc_bin_size &&
psp->fw_pri_buf) {
		ret = psp_load_toc(psp, &tmr_size);
@@ -1305,9 +1306,6 @@ int psp_rlc_autoload_start(struct psp_context *psp)
int ret;
struct psp_gfx_cmd_resp *cmd;
 
-   if (amdgpu_sriov_vf(psp->adev))
-   return 0;
-
cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL);
if (!cmd)
return -ENOMEM;
-- 
2.20.1


[PATCH] drm/amdgpu: Navi10/12 VF doesn't support SMU

2019-09-11 Thread jianzh
From: Jiange Zhao 

In the SRIOV case, SMU and powerplay are handled in the hypervisor (HV).

The VF shouldn't have control over SMU and powerplay.

Signed-off-by: Jiange Zhao 
---
 drivers/gpu/drm/amd/amdgpu/nv.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c
index 4c24672be12a..fb097aa089da 100644
--- a/drivers/gpu/drm/amd/amdgpu/nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/nv.c
@@ -438,7 +438,7 @@ int nv_set_ip_blocks(struct amdgpu_device *adev)
	amdgpu_device_ip_block_add(adev, &navi10_ih_ip_block);
	amdgpu_device_ip_block_add(adev, &psp_v11_0_ip_block);
	if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP &&
-	    is_support_sw_smu(adev))
+	    is_support_sw_smu(adev) && !amdgpu_sriov_vf(adev))
		amdgpu_device_ip_block_add(adev, &smu_v11_0_ip_block);
	if (adev->enable_virtual_display || amdgpu_sriov_vf(adev))
		amdgpu_device_ip_block_add(adev, &dce_virtual_ip_block);
@@ -449,7 +449,7 @@ int nv_set_ip_blocks(struct amdgpu_device *adev)
	amdgpu_device_ip_block_add(adev, &gfx_v10_0_ip_block);
	amdgpu_device_ip_block_add(adev, &sdma_v5_0_ip_block);
	if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT &&
-	    is_support_sw_smu(adev))
+	    is_support_sw_smu(adev) && !amdgpu_sriov_vf(adev))
		amdgpu_device_ip_block_add(adev, &smu_v11_0_ip_block);
	amdgpu_device_ip_block_add(adev, &vcn_v2_0_ip_block);
	if (adev->enable_mes)
@@ -461,7 +461,7 @@ int nv_set_ip_blocks(struct amdgpu_device *adev)
	amdgpu_device_ip_block_add(adev, &navi10_ih_ip_block);
	amdgpu_device_ip_block_add(adev, &psp_v11_0_ip_block);
	if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP &&
-	    is_support_sw_smu(adev))
+	    is_support_sw_smu(adev) && !amdgpu_sriov_vf(adev))
		amdgpu_device_ip_block_add(adev, &smu_v11_0_ip_block);
	if (adev->enable_virtual_display || amdgpu_sriov_vf(adev))
		amdgpu_device_ip_block_add(adev, &dce_virtual_ip_block);
@@ -472,7 +472,7 @@ int nv_set_ip_blocks(struct amdgpu_device *adev)
	amdgpu_device_ip_block_add(adev, &gfx_v10_0_ip_block);
	amdgpu_device_ip_block_add(adev, &sdma_v5_0_ip_block);
	if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT &&
-	    is_support_sw_smu(adev))
+	    is_support_sw_smu(adev) && !amdgpu_sriov_vf(adev))
		amdgpu_device_ip_block_add(adev, &smu_v11_0_ip_block);
	amdgpu_device_ip_block_add(adev, &vcn_v2_0_ip_block);
break;
-- 
2.20.1


[PATCH] drm/amdgpu: For Navi12 SRIOV VF, register mailbox functions

2019-09-11 Thread jianzh
From: Jiange Zhao 

Mailbox functions and interrupts are only for Navi12 VF.

Register functions and irqs during initialization.

Signed-off-by: Jiange Zhao 
---
 drivers/gpu/drm/amd/amdgpu/nv.c | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c
index a61f43c0c9df..4c24672be12a 100644
--- a/drivers/gpu/drm/amd/amdgpu/nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/nv.c
@@ -53,6 +53,7 @@
 #include "vcn_v2_0.h"
 #include "dce_virtual.h"
 #include "mes_v10_1.h"
+#include "mxgpu_nv.h"
 
 static const struct amd_ip_funcs nv_common_ip_funcs;
 
@@ -426,6 +427,9 @@ int nv_set_ip_blocks(struct amdgpu_device *adev)
 
adev->nbio.funcs->detect_hw_virt(adev);
 
+   if (amdgpu_sriov_vf(adev))
+		adev->virt.ops = &xgpu_nv_virt_ops;
+
switch (adev->asic_type) {
case CHIP_NAVI10:
case CHIP_NAVI14:
@@ -666,16 +670,31 @@ static int nv_common_early_init(void *handle)
return -EINVAL;
}
 
+   if (amdgpu_sriov_vf(adev)) {
+   amdgpu_virt_init_setting(adev);
+   xgpu_nv_mailbox_set_irq_funcs(adev);
+   }
+
return 0;
 }
 
 static int nv_common_late_init(void *handle)
 {
+   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
+   if (amdgpu_sriov_vf(adev))
+   xgpu_nv_mailbox_get_irq(adev);
+
return 0;
 }
 
 static int nv_common_sw_init(void *handle)
 {
+   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
+   if (amdgpu_sriov_vf(adev))
+   xgpu_nv_mailbox_add_irq_id(adev);
+
return 0;
 }
 
-- 
2.20.1


[PATCH] drm/amdgpu: Add SRIOV mailbox backend for Navi1x

2019-09-09 Thread jianzh
From: Jiange Zhao 

Mimic the ones for Vega10, add mailbox backend for Navi1x

Signed-off-by: Jiange Zhao 
---
 drivers/gpu/drm/amd/amdgpu/Makefile   |   2 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 380 ++
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h |  41 +++
 3 files changed, 422 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
index 84614a71bb4d..43dc4aa18930 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -68,7 +68,7 @@ amdgpu-$(CONFIG_DRM_AMDGPU_SI)+= si.o gmc_v6_0.o gfx_v6_0.o 
si_ih.o si_dma.o dce
 amdgpu-y += \
vi.o mxgpu_vi.o nbio_v6_1.o soc15.o emu_soc.o mxgpu_ai.o nbio_v7_0.o 
vega10_reg_init.o \
vega20_reg_init.o nbio_v7_4.o nbio_v2_3.o nv.o navi10_reg_init.o 
navi14_reg_init.o \
-   arct_reg_init.o navi12_reg_init.o
+   arct_reg_init.o navi12_reg_init.o mxgpu_nv.o
 
 # add DF block
 amdgpu-y += \
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
new file mode 100644
index ..0d8767eb7a70
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -0,0 +1,380 @@
+/*
+ * Copyright 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "amdgpu.h"
+#include "nbio/nbio_2_3_offset.h"
+#include "nbio/nbio_2_3_sh_mask.h"
+#include "gc/gc_10_1_0_offset.h"
+#include "gc/gc_10_1_0_sh_mask.h"
+#include "soc15.h"
+#include "navi10_ih.h"
+#include "soc15_common.h"
+#include "mxgpu_nv.h"
+#include "mxgpu_ai.h"
+
+static void xgpu_nv_mailbox_send_ack(struct amdgpu_device *adev)
+{
+   WREG8(NV_MAIBOX_CONTROL_RCV_OFFSET_BYTE, 2);
+}
+
+static void xgpu_nv_mailbox_set_valid(struct amdgpu_device *adev, bool val)
+{
+   WREG8(NV_MAIBOX_CONTROL_TRN_OFFSET_BYTE, val ? 1 : 0);
+}
+
+/*
+ * this peek_msg could *only* be called in IRQ routine because in IRQ routine
+ * RCV_MSG_VALID field of BIF_BX_PF_MAILBOX_CONTROL must already be set to 1
+ * by host.
+ *
+ * if called not in IRQ routine, this peek_msg cannot be guaranteed to return the
+ * correct value since it doesn't return the RCV_DW0 under the case that
+ * RCV_MSG_VALID is set by host.
+ */
+static enum idh_event xgpu_nv_mailbox_peek_msg(struct amdgpu_device *adev)
+{
+   return RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
+   mmBIF_BX_PF_MAILBOX_MSGBUF_RCV_DW0));
+}
+
+
+static int xgpu_nv_mailbox_rcv_msg(struct amdgpu_device *adev,
+  enum idh_event event)
+{
+   u32 reg;
+
+	reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
+			mmBIF_BX_PF_MAILBOX_MSGBUF_RCV_DW0));
+   if (reg != event)
+   return -ENOENT;
+
+   xgpu_nv_mailbox_send_ack(adev);
+
+   return 0;
+}
+
+static uint8_t xgpu_nv_peek_ack(struct amdgpu_device *adev)
+{
+   return RREG8(NV_MAIBOX_CONTROL_TRN_OFFSET_BYTE) & 2;
+}
+
+static int xgpu_nv_poll_ack(struct amdgpu_device *adev)
+{
+   int timeout  = NV_MAILBOX_POLL_ACK_TIMEDOUT;
+   u8 reg;
+
+   do {
+   reg = RREG8(NV_MAIBOX_CONTROL_TRN_OFFSET_BYTE);
+   if (reg & 2)
+   return 0;
+
+   mdelay(5);
+   timeout -= 5;
+   } while (timeout > 1);
+
+	pr_err("Doesn't get TRN_MSG_ACK from pf in %d msec\n", NV_MAILBOX_POLL_ACK_TIMEDOUT);
+
+   return -ETIME;
+}
+
+static int xgpu_nv_poll_msg(struct amdgpu_device *adev, enum idh_event event)
+{
+   int r, timeout = NV_MAILBOX_POLL_MSG_TIMEDOUT;
+
+   do {
+   r = xgpu_nv_mailbox_rcv_msg(adev, event);
+   if (!r)
+   return 0;
+
+   msleep(10);
+   timeout -= 10;
+   } while (timeout > 1);