Re: [RFC 2/2] drm/xe/FLR: Support PCIe FLR

2024-03-21 Thread Aravind Iddamsetty


On 21/03/24 04:31, Lucas De Marchi wrote:
> On Wed, Mar 20, 2024 at 04:14:26PM +0530, Aravind Iddamsetty wrote:
>> PCI subsystem provides callbacks to inform the driver about a request to
>> do function level reset by user, initiated by writing to sysfs entry
>> /sys/bus/pci/devices/.../reset. This will allow the driver to handle FLR
>> without the need to do unbind and rebind as the driver needs to
>> reinitialize the device afresh post FLR.
>>
>> Cc: Rodrigo Vivi 
>> Cc: Lucas De Marchi 
>> Signed-off-by: Aravind Iddamsetty 
>> ---
>> drivers/gpu/drm/xe/Makefile  |  1 +
>> drivers/gpu/drm/xe/xe_device_types.h |  3 +
>> drivers/gpu/drm/xe/xe_gt.c   | 31 ++---
>> drivers/gpu/drm/xe/xe_gt.h   |  1 +
>> drivers/gpu/drm/xe/xe_pci.c  | 53 ++--
>> drivers/gpu/drm/xe/xe_pci.h  |  6 +-
>> drivers/gpu/drm/xe/xe_pci_err.c  | 94 
>> 7 files changed, 174 insertions(+), 15 deletions(-)
>> create mode 100644 drivers/gpu/drm/xe/xe_pci_err.c
>>
>> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
>> index 3c3e67885559..1447712fec65 100644
>> --- a/drivers/gpu/drm/xe/Makefile
>> +++ b/drivers/gpu/drm/xe/Makefile
>> @@ -114,6 +114,7 @@ xe-y += xe_bb.o \
>> xe_module.o \
>> xe_pat.o \
>> xe_pci.o \
>> +    xe_pci_err.o \
>> xe_pcode.o \
>> xe_pm.o \
>> xe_preempt_fence.o \
>> diff --git a/drivers/gpu/drm/xe/xe_device_types.h 
>> b/drivers/gpu/drm/xe/xe_device_types.h
>> index 9785eef2e5a4..e9b8c7cbb428 100644
>> --- a/drivers/gpu/drm/xe/xe_device_types.h
>> +++ b/drivers/gpu/drm/xe/xe_device_types.h
>> @@ -455,6 +455,9 @@ struct xe_device {
>> /** @needs_flr_on_fini: requests function-reset on fini */
>> bool needs_flr_on_fini;
>>
>> +    /** @pci_state: PCI state of device */
>> +    struct pci_saved_state *pci_state;
>> +
>> /* private: */
>>
>> #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
>> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
>> index 85408e7a932b..437874a9a5a0 100644
>> --- a/drivers/gpu/drm/xe/xe_gt.c
>> +++ b/drivers/gpu/drm/xe/xe_gt.c
>> @@ -629,6 +629,26 @@ static int do_gt_restart(struct xe_gt *gt)
>> return 0;
>> }
>>
>> +/* Idle the GT */
>> +int xe_idle_gt(struct xe_gt *gt)
>
> any non-static function should use xe_gt_ prefix.
Ok will change it.
>
>> +{
>> +    int err;
>> +
>> +    xe_gt_sanitize(gt);
>> +
>> +    xe_uc_gucrc_disable(>uc);
>> +    xe_uc_stop_prepare(>uc);
>> +    xe_gt_pagefault_reset(gt);
>> +
>> +    err = xe_uc_stop(>uc);
>> +    if (err)
>> +    return err;
>> +
>> +    xe_gt_tlb_invalidation_reset(gt);
>> +
>> +    return err;
>> +}
>> +
>> static int gt_reset(struct xe_gt *gt)
>> {
>> int err;
>> @@ -645,21 +665,12 @@ static int gt_reset(struct xe_gt *gt)
>> }
>>
>> xe_pm_runtime_get(gt_to_xe(gt));
>> -    xe_gt_sanitize(gt);
>>
>> err = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL);
>> if (err)
>>     goto err_msg;
>>
>> -    xe_uc_gucrc_disable(>uc);
>> -    xe_uc_stop_prepare(>uc);
>> -    xe_gt_pagefault_reset(gt);
>> -
>> -    err = xe_uc_stop(>uc);
>> -    if (err)
>> -    goto err_out;
>> -
>> -    xe_gt_tlb_invalidation_reset(gt);
>> +    xe_idle_gt(gt);
>
> this and the above should be in a commit alone
> "drm/xe: Extract xe_gt_idle() helper" with explanation it will be used
> in other places outside of gt_reset path.
sure will separate it out.
>
> but I'm a little bit confused here... why do you need to remove
> xe_gt_sanitize() inside the function to make gt idle?
if I understood right this controls how the invalidation is 

sent via guc or mmio. hence moved into this function as we disable guc.

>
> Lucas De Marchi
>
>>
>> err = do_gt_reset(gt);
>> if (err)
>> diff --git a/drivers/gpu/drm/xe/xe_gt.h b/drivers/gpu/drm/xe/xe_gt.h
>> index ed6ea8057e35..77df919199cc 100644
>> --- a/drivers/gpu/drm/xe/xe_gt.h
>> +++ b/drivers/gpu/drm/xe/xe_gt.h
>> @@ -43,6 +43,7 @@ int xe_gt_resume(struct xe_gt *gt);
>> void xe_gt_reset_async(struct xe_gt *gt);
>> void xe_gt_sanitize(struct xe_gt *gt);
>> void xe_gt_remove(struct xe_gt *gt);
>> +int xe_idle_gt(struct xe_gt *gt);
>>
>> /**
>>  * xe_gt_any_hw_engine_by_reset_domain - scan the list of engines and return 
>> the
>> diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
>> index c401d4890386..fcd2a7f66f7b 100644
>> --- a/drivers/gpu/drm/xe/xe_pci.c
>> +++ b/drivers/gpu/drm/xe/xe_pci.c
>> @@ -383,6 +383,41 @@ MODULE_DEVICE_TABLE(pci, pciidlist);
>>
>> #undef INTEL_VGA_DEVICE
>>
>> +static bool xe_save_pci_state(struct pci_dev *pdev)
>> +{
>> +    struct xe_device *xe = pci_get_drvdata(pdev);
>> +
>> +    if (pci_save_state(pdev))
>> +    return false;
>> +
>> +    kfree(xe->pci_state);
>> +
>> +    xe->pci_state = pci_store_saved_state(pdev);
>> +
>> +    if (!xe->pci_state) {
>> +    drm_err(>drm, "Failed to store PCI saved state\n");
>> +    return false;
>> +    }
>> +
>> +    return 

Re: [RFC 2/2] drm/xe/FLR: Support PCIe FLR

2024-03-21 Thread Aravind Iddamsetty


On 21/03/24 02:22, Rodrigo Vivi wrote:
> On Wed, Mar 20, 2024 at 04:14:26PM +0530, Aravind Iddamsetty wrote:
>> PCI subsystem provides callbacks to inform the driver about a request to
>> do function level reset by user, initiated by writing to sysfs entry
>> /sys/bus/pci/devices/.../reset. This will allow the driver to handle FLR
>> without the need to do unbind and rebind as the driver needs to
>> reinitialize the device afresh post FLR.
>>
>> Cc: Rodrigo Vivi 
>> Cc: Lucas De Marchi 
>> Signed-off-by: Aravind Iddamsetty 
>> ---
>>  drivers/gpu/drm/xe/Makefile  |  1 +
>>  drivers/gpu/drm/xe/xe_device_types.h |  3 +
>>  drivers/gpu/drm/xe/xe_gt.c   | 31 ++---
>>  drivers/gpu/drm/xe/xe_gt.h   |  1 +
>>  drivers/gpu/drm/xe/xe_pci.c  | 53 ++--
>>  drivers/gpu/drm/xe/xe_pci.h  |  6 +-
>>  drivers/gpu/drm/xe/xe_pci_err.c  | 94 
>>  7 files changed, 174 insertions(+), 15 deletions(-)
>>  create mode 100644 drivers/gpu/drm/xe/xe_pci_err.c
>>
>> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
>> index 3c3e67885559..1447712fec65 100644
>> --- a/drivers/gpu/drm/xe/Makefile
>> +++ b/drivers/gpu/drm/xe/Makefile
>> @@ -114,6 +114,7 @@ xe-y += xe_bb.o \
>>  xe_module.o \
>>  xe_pat.o \
>>  xe_pci.o \
>> +xe_pci_err.o \
>>  xe_pcode.o \
>>  xe_pm.o \
>>  xe_preempt_fence.o \
>> diff --git a/drivers/gpu/drm/xe/xe_device_types.h 
>> b/drivers/gpu/drm/xe/xe_device_types.h
>> index 9785eef2e5a4..e9b8c7cbb428 100644
>> --- a/drivers/gpu/drm/xe/xe_device_types.h
>> +++ b/drivers/gpu/drm/xe/xe_device_types.h
>> @@ -455,6 +455,9 @@ struct xe_device {
>>  /** @needs_flr_on_fini: requests function-reset on fini */
>>  bool needs_flr_on_fini;
>>  
>> +/** @pci_state: PCI state of device */
>> +struct pci_saved_state *pci_state;
>> +
>>  /* private: */
>>  
>>  #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
>> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
>> index 85408e7a932b..437874a9a5a0 100644
>> --- a/drivers/gpu/drm/xe/xe_gt.c
>> +++ b/drivers/gpu/drm/xe/xe_gt.c
>> @@ -629,6 +629,26 @@ static int do_gt_restart(struct xe_gt *gt)
>>  return 0;
>>  }
>>  
>> +/* Idle the GT */
>> +int xe_idle_gt(struct xe_gt *gt)
>> +{
>> +int err;
>> +
>> +xe_gt_sanitize(gt);
>> +
>> +xe_uc_gucrc_disable(>uc);
>> +xe_uc_stop_prepare(>uc);
>> +xe_gt_pagefault_reset(gt);
>> +
>> +err = xe_uc_stop(>uc);
>> +if (err)
>> +return err;
>> +
>> +xe_gt_tlb_invalidation_reset(gt);
>> +
>> +return err;
>> +}
>> +
>>  static int gt_reset(struct xe_gt *gt)
>>  {
>>  int err;
>> @@ -645,21 +665,12 @@ static int gt_reset(struct xe_gt *gt)
>>  }
>>  
>>  xe_pm_runtime_get(gt_to_xe(gt));
>> -xe_gt_sanitize(gt);
>>  
>>  err = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL);
>>  if (err)
>>  goto err_msg;
>>  
>> -xe_uc_gucrc_disable(>uc);
>> -xe_uc_stop_prepare(>uc);
>> -xe_gt_pagefault_reset(gt);
>> -
>> -err = xe_uc_stop(>uc);
>> -if (err)
>> -goto err_out;
>> -
>> -xe_gt_tlb_invalidation_reset(gt);
>> +xe_idle_gt(gt);
>>  
>>  err = do_gt_reset(gt);
>>  if (err)
>> diff --git a/drivers/gpu/drm/xe/xe_gt.h b/drivers/gpu/drm/xe/xe_gt.h
>> index ed6ea8057e35..77df919199cc 100644
>> --- a/drivers/gpu/drm/xe/xe_gt.h
>> +++ b/drivers/gpu/drm/xe/xe_gt.h
>> @@ -43,6 +43,7 @@ int xe_gt_resume(struct xe_gt *gt);
>>  void xe_gt_reset_async(struct xe_gt *gt);
>>  void xe_gt_sanitize(struct xe_gt *gt);
>>  void xe_gt_remove(struct xe_gt *gt);
>> +int xe_idle_gt(struct xe_gt *gt);
>>  
>>  /**
>>   * xe_gt_any_hw_engine_by_reset_domain - scan the list of engines and 
>> return the
>> diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
>> index c401d4890386..fcd2a7f66f7b 100644
>> --- a/drivers/gpu/drm/xe/xe_pci.c
>> +++ b/drivers/gpu/drm/xe/xe_pci.c
>> @@ -383,6 +383,41 @@ MODULE_DEVICE_TABLE(pci, pciidlist);
>>  
>>  #undef INTEL_VGA_DEVICE
>>  
>> +static bool xe_save_pci_state(struct pci_dev *pdev)
>> +{
>> +struct xe_device *xe = pci_get_drvdata(pdev);
>> +
>> +if (pci_save_state(pdev))
>> +return false;
>> +
>> +kfree(xe->pci_state);
>> +
>> +xe->pci_state = pci_store_saved_state(pdev);
>> +
>> +if (!xe->pci_state) {
>> +drm_err(>drm, "Failed to store PCI saved state\n");
>> +return false;
>> +}
>> +
>> +return true;
>> +}
>> +
>> +void xe_load_pci_state(struct pci_dev *pdev)
>> +{
>> +struct xe_device *xe = pci_get_drvdata(pdev);
>> +int ret;
>> +
>> +if (!xe->pci_state)
>> +return;
>> +
>> +ret = pci_load_saved_state(pdev, xe->pci_state);
>> +if (!ret) {
>> +pci_restore_state(pdev);
>> +} else {
>> +drm_warn(>drm, "Failed to load PCI state err:%d\n", ret);
>> +}
>> +}
>> +
>>  /* is device_id present in comma 

Re: [RFC 2/2] drm/xe/FLR: Support PCIe FLR

2024-03-20 Thread Lucas De Marchi

On Wed, Mar 20, 2024 at 04:14:26PM +0530, Aravind Iddamsetty wrote:

PCI subsystem provides callbacks to inform the driver about a request to
do function level reset by user, initiated by writing to sysfs entry
/sys/bus/pci/devices/.../reset. This will allow the driver to handle FLR
without the need to do unbind and rebind as the driver needs to
reinitialize the device afresh post FLR.

Cc: Rodrigo Vivi 
Cc: Lucas De Marchi 
Signed-off-by: Aravind Iddamsetty 
---
drivers/gpu/drm/xe/Makefile  |  1 +
drivers/gpu/drm/xe/xe_device_types.h |  3 +
drivers/gpu/drm/xe/xe_gt.c   | 31 ++---
drivers/gpu/drm/xe/xe_gt.h   |  1 +
drivers/gpu/drm/xe/xe_pci.c  | 53 ++--
drivers/gpu/drm/xe/xe_pci.h  |  6 +-
drivers/gpu/drm/xe/xe_pci_err.c  | 94 
7 files changed, 174 insertions(+), 15 deletions(-)
create mode 100644 drivers/gpu/drm/xe/xe_pci_err.c

diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
index 3c3e67885559..1447712fec65 100644
--- a/drivers/gpu/drm/xe/Makefile
+++ b/drivers/gpu/drm/xe/Makefile
@@ -114,6 +114,7 @@ xe-y += xe_bb.o \
xe_module.o \
xe_pat.o \
xe_pci.o \
+   xe_pci_err.o \
xe_pcode.o \
xe_pm.o \
xe_preempt_fence.o \
diff --git a/drivers/gpu/drm/xe/xe_device_types.h 
b/drivers/gpu/drm/xe/xe_device_types.h
index 9785eef2e5a4..e9b8c7cbb428 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -455,6 +455,9 @@ struct xe_device {
/** @needs_flr_on_fini: requests function-reset on fini */
bool needs_flr_on_fini;

+   /** @pci_state: PCI state of device */
+   struct pci_saved_state *pci_state;
+
/* private: */

#if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
index 85408e7a932b..437874a9a5a0 100644
--- a/drivers/gpu/drm/xe/xe_gt.c
+++ b/drivers/gpu/drm/xe/xe_gt.c
@@ -629,6 +629,26 @@ static int do_gt_restart(struct xe_gt *gt)
return 0;
}

+/* Idle the GT */
+int xe_idle_gt(struct xe_gt *gt)


any non-static function should use xe_gt_ prefix.


+{
+   int err;
+
+   xe_gt_sanitize(gt);
+
+   xe_uc_gucrc_disable(>uc);
+   xe_uc_stop_prepare(>uc);
+   xe_gt_pagefault_reset(gt);
+
+   err = xe_uc_stop(>uc);
+   if (err)
+   return err;
+
+   xe_gt_tlb_invalidation_reset(gt);
+
+   return err;
+}
+
static int gt_reset(struct xe_gt *gt)
{
int err;
@@ -645,21 +665,12 @@ static int gt_reset(struct xe_gt *gt)
}

xe_pm_runtime_get(gt_to_xe(gt));
-   xe_gt_sanitize(gt);

err = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL);
if (err)
goto err_msg;

-   xe_uc_gucrc_disable(>uc);
-   xe_uc_stop_prepare(>uc);
-   xe_gt_pagefault_reset(gt);
-
-   err = xe_uc_stop(>uc);
-   if (err)
-   goto err_out;
-
-   xe_gt_tlb_invalidation_reset(gt);
+   xe_idle_gt(gt);


this and the above should be in a commit alone
"drm/xe: Extract xe_gt_idle() helper" with explanation it will be used
in other places outside of gt_reset path.

but I'm a little bit confused here... why do you need to remove
xe_gt_sanitize() inside the function to make gt idle?

Lucas De Marchi



err = do_gt_reset(gt);
if (err)
diff --git a/drivers/gpu/drm/xe/xe_gt.h b/drivers/gpu/drm/xe/xe_gt.h
index ed6ea8057e35..77df919199cc 100644
--- a/drivers/gpu/drm/xe/xe_gt.h
+++ b/drivers/gpu/drm/xe/xe_gt.h
@@ -43,6 +43,7 @@ int xe_gt_resume(struct xe_gt *gt);
void xe_gt_reset_async(struct xe_gt *gt);
void xe_gt_sanitize(struct xe_gt *gt);
void xe_gt_remove(struct xe_gt *gt);
+int xe_idle_gt(struct xe_gt *gt);

/**
 * xe_gt_any_hw_engine_by_reset_domain - scan the list of engines and return the
diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
index c401d4890386..fcd2a7f66f7b 100644
--- a/drivers/gpu/drm/xe/xe_pci.c
+++ b/drivers/gpu/drm/xe/xe_pci.c
@@ -383,6 +383,41 @@ MODULE_DEVICE_TABLE(pci, pciidlist);

#undef INTEL_VGA_DEVICE

+static bool xe_save_pci_state(struct pci_dev *pdev)
+{
+   struct xe_device *xe = pci_get_drvdata(pdev);
+
+   if (pci_save_state(pdev))
+   return false;
+
+   kfree(xe->pci_state);
+
+   xe->pci_state = pci_store_saved_state(pdev);
+
+   if (!xe->pci_state) {
+   drm_err(>drm, "Failed to store PCI saved state\n");
+   return false;
+   }
+
+   return true;
+}
+
+void xe_load_pci_state(struct pci_dev *pdev)
+{
+   struct xe_device *xe = pci_get_drvdata(pdev);
+   int ret;
+
+   if (!xe->pci_state)
+   return;
+
+   ret = pci_load_saved_state(pdev, xe->pci_state);
+   if (!ret) {
+   pci_restore_state(pdev);
+   } else {
+   drm_warn(>drm, "Failed to load PCI state err:%d\n", ret);
+   }


please check coding style here and invert 

Re: [RFC 2/2] drm/xe/FLR: Support PCIe FLR

2024-03-20 Thread Rodrigo Vivi
On Wed, Mar 20, 2024 at 04:14:26PM +0530, Aravind Iddamsetty wrote:
> PCI subsystem provides callbacks to inform the driver about a request to
> do function level reset by user, initiated by writing to sysfs entry
> /sys/bus/pci/devices/.../reset. This will allow the driver to handle FLR
> without the need to do unbind and rebind as the driver needs to
> reinitialize the device afresh post FLR.
> 
> Cc: Rodrigo Vivi 
> Cc: Lucas De Marchi 
> Signed-off-by: Aravind Iddamsetty 
> ---
>  drivers/gpu/drm/xe/Makefile  |  1 +
>  drivers/gpu/drm/xe/xe_device_types.h |  3 +
>  drivers/gpu/drm/xe/xe_gt.c   | 31 ++---
>  drivers/gpu/drm/xe/xe_gt.h   |  1 +
>  drivers/gpu/drm/xe/xe_pci.c  | 53 ++--
>  drivers/gpu/drm/xe/xe_pci.h  |  6 +-
>  drivers/gpu/drm/xe/xe_pci_err.c  | 94 
>  7 files changed, 174 insertions(+), 15 deletions(-)
>  create mode 100644 drivers/gpu/drm/xe/xe_pci_err.c
> 
> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
> index 3c3e67885559..1447712fec65 100644
> --- a/drivers/gpu/drm/xe/Makefile
> +++ b/drivers/gpu/drm/xe/Makefile
> @@ -114,6 +114,7 @@ xe-y += xe_bb.o \
>   xe_module.o \
>   xe_pat.o \
>   xe_pci.o \
> + xe_pci_err.o \
>   xe_pcode.o \
>   xe_pm.o \
>   xe_preempt_fence.o \
> diff --git a/drivers/gpu/drm/xe/xe_device_types.h 
> b/drivers/gpu/drm/xe/xe_device_types.h
> index 9785eef2e5a4..e9b8c7cbb428 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -455,6 +455,9 @@ struct xe_device {
>   /** @needs_flr_on_fini: requests function-reset on fini */
>   bool needs_flr_on_fini;
>  
> + /** @pci_state: PCI state of device */
> + struct pci_saved_state *pci_state;
> +
>   /* private: */
>  
>  #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> index 85408e7a932b..437874a9a5a0 100644
> --- a/drivers/gpu/drm/xe/xe_gt.c
> +++ b/drivers/gpu/drm/xe/xe_gt.c
> @@ -629,6 +629,26 @@ static int do_gt_restart(struct xe_gt *gt)
>   return 0;
>  }
>  
> +/* Idle the GT */
> +int xe_idle_gt(struct xe_gt *gt)
> +{
> + int err;
> +
> + xe_gt_sanitize(gt);
> +
> + xe_uc_gucrc_disable(>uc);
> + xe_uc_stop_prepare(>uc);
> + xe_gt_pagefault_reset(gt);
> +
> + err = xe_uc_stop(>uc);
> + if (err)
> + return err;
> +
> + xe_gt_tlb_invalidation_reset(gt);
> +
> + return err;
> +}
> +
>  static int gt_reset(struct xe_gt *gt)
>  {
>   int err;
> @@ -645,21 +665,12 @@ static int gt_reset(struct xe_gt *gt)
>   }
>  
>   xe_pm_runtime_get(gt_to_xe(gt));
> - xe_gt_sanitize(gt);
>  
>   err = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL);
>   if (err)
>   goto err_msg;
>  
> - xe_uc_gucrc_disable(>uc);
> - xe_uc_stop_prepare(>uc);
> - xe_gt_pagefault_reset(gt);
> -
> - err = xe_uc_stop(>uc);
> - if (err)
> - goto err_out;
> -
> - xe_gt_tlb_invalidation_reset(gt);
> + xe_idle_gt(gt);
>  
>   err = do_gt_reset(gt);
>   if (err)
> diff --git a/drivers/gpu/drm/xe/xe_gt.h b/drivers/gpu/drm/xe/xe_gt.h
> index ed6ea8057e35..77df919199cc 100644
> --- a/drivers/gpu/drm/xe/xe_gt.h
> +++ b/drivers/gpu/drm/xe/xe_gt.h
> @@ -43,6 +43,7 @@ int xe_gt_resume(struct xe_gt *gt);
>  void xe_gt_reset_async(struct xe_gt *gt);
>  void xe_gt_sanitize(struct xe_gt *gt);
>  void xe_gt_remove(struct xe_gt *gt);
> +int xe_idle_gt(struct xe_gt *gt);
>  
>  /**
>   * xe_gt_any_hw_engine_by_reset_domain - scan the list of engines and return 
> the
> diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
> index c401d4890386..fcd2a7f66f7b 100644
> --- a/drivers/gpu/drm/xe/xe_pci.c
> +++ b/drivers/gpu/drm/xe/xe_pci.c
> @@ -383,6 +383,41 @@ MODULE_DEVICE_TABLE(pci, pciidlist);
>  
>  #undef INTEL_VGA_DEVICE
>  
> +static bool xe_save_pci_state(struct pci_dev *pdev)
> +{
> + struct xe_device *xe = pci_get_drvdata(pdev);
> +
> + if (pci_save_state(pdev))
> + return false;
> +
> + kfree(xe->pci_state);
> +
> + xe->pci_state = pci_store_saved_state(pdev);
> +
> + if (!xe->pci_state) {
> + drm_err(>drm, "Failed to store PCI saved state\n");
> + return false;
> + }
> +
> + return true;
> +}
> +
> +void xe_load_pci_state(struct pci_dev *pdev)
> +{
> + struct xe_device *xe = pci_get_drvdata(pdev);
> + int ret;
> +
> + if (!xe->pci_state)
> + return;
> +
> + ret = pci_load_saved_state(pdev, xe->pci_state);
> + if (!ret) {
> + pci_restore_state(pdev);
> + } else {
> + drm_warn(>drm, "Failed to load PCI state err:%d\n", ret);
> + }
> +}
> +
>  /* is device_id present in comma separated list of ids */
>  static bool device_id_in_list(u16 device_id, const char *devices, bool 
> negative)
>  {
> @@ -688,10 +723,12 @@ 

[RFC 2/2] drm/xe/FLR: Support PCIe FLR

2024-03-20 Thread Aravind Iddamsetty
PCI subsystem provides callbacks to inform the driver about a request to
do function level reset by user, initiated by writing to sysfs entry
/sys/bus/pci/devices/.../reset. This will allow the driver to handle FLR
without the need to do unbind and rebind as the driver needs to
reinitialize the device afresh post FLR.

Cc: Rodrigo Vivi 
Cc: Lucas De Marchi 
Signed-off-by: Aravind Iddamsetty 
---
 drivers/gpu/drm/xe/Makefile  |  1 +
 drivers/gpu/drm/xe/xe_device_types.h |  3 +
 drivers/gpu/drm/xe/xe_gt.c   | 31 ++---
 drivers/gpu/drm/xe/xe_gt.h   |  1 +
 drivers/gpu/drm/xe/xe_pci.c  | 53 ++--
 drivers/gpu/drm/xe/xe_pci.h  |  6 +-
 drivers/gpu/drm/xe/xe_pci_err.c  | 94 
 7 files changed, 174 insertions(+), 15 deletions(-)
 create mode 100644 drivers/gpu/drm/xe/xe_pci_err.c

diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
index 3c3e67885559..1447712fec65 100644
--- a/drivers/gpu/drm/xe/Makefile
+++ b/drivers/gpu/drm/xe/Makefile
@@ -114,6 +114,7 @@ xe-y += xe_bb.o \
xe_module.o \
xe_pat.o \
xe_pci.o \
+   xe_pci_err.o \
xe_pcode.o \
xe_pm.o \
xe_preempt_fence.o \
diff --git a/drivers/gpu/drm/xe/xe_device_types.h 
b/drivers/gpu/drm/xe/xe_device_types.h
index 9785eef2e5a4..e9b8c7cbb428 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -455,6 +455,9 @@ struct xe_device {
/** @needs_flr_on_fini: requests function-reset on fini */
bool needs_flr_on_fini;
 
+   /** @pci_state: PCI state of device */
+   struct pci_saved_state *pci_state;
+
/* private: */
 
 #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
index 85408e7a932b..437874a9a5a0 100644
--- a/drivers/gpu/drm/xe/xe_gt.c
+++ b/drivers/gpu/drm/xe/xe_gt.c
@@ -629,6 +629,26 @@ static int do_gt_restart(struct xe_gt *gt)
return 0;
 }
 
+/* Idle the GT */
+int xe_idle_gt(struct xe_gt *gt)
+{
+   int err;
+
+   xe_gt_sanitize(gt);
+
+   xe_uc_gucrc_disable(>uc);
+   xe_uc_stop_prepare(>uc);
+   xe_gt_pagefault_reset(gt);
+
+   err = xe_uc_stop(>uc);
+   if (err)
+   return err;
+
+   xe_gt_tlb_invalidation_reset(gt);
+
+   return err;
+}
+
 static int gt_reset(struct xe_gt *gt)
 {
int err;
@@ -645,21 +665,12 @@ static int gt_reset(struct xe_gt *gt)
}
 
xe_pm_runtime_get(gt_to_xe(gt));
-   xe_gt_sanitize(gt);
 
err = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL);
if (err)
goto err_msg;
 
-   xe_uc_gucrc_disable(>uc);
-   xe_uc_stop_prepare(>uc);
-   xe_gt_pagefault_reset(gt);
-
-   err = xe_uc_stop(>uc);
-   if (err)
-   goto err_out;
-
-   xe_gt_tlb_invalidation_reset(gt);
+   xe_idle_gt(gt);
 
err = do_gt_reset(gt);
if (err)
diff --git a/drivers/gpu/drm/xe/xe_gt.h b/drivers/gpu/drm/xe/xe_gt.h
index ed6ea8057e35..77df919199cc 100644
--- a/drivers/gpu/drm/xe/xe_gt.h
+++ b/drivers/gpu/drm/xe/xe_gt.h
@@ -43,6 +43,7 @@ int xe_gt_resume(struct xe_gt *gt);
 void xe_gt_reset_async(struct xe_gt *gt);
 void xe_gt_sanitize(struct xe_gt *gt);
 void xe_gt_remove(struct xe_gt *gt);
+int xe_idle_gt(struct xe_gt *gt);
 
 /**
  * xe_gt_any_hw_engine_by_reset_domain - scan the list of engines and return 
the
diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
index c401d4890386..fcd2a7f66f7b 100644
--- a/drivers/gpu/drm/xe/xe_pci.c
+++ b/drivers/gpu/drm/xe/xe_pci.c
@@ -383,6 +383,41 @@ MODULE_DEVICE_TABLE(pci, pciidlist);
 
 #undef INTEL_VGA_DEVICE
 
+static bool xe_save_pci_state(struct pci_dev *pdev)
+{
+   struct xe_device *xe = pci_get_drvdata(pdev);
+
+   if (pci_save_state(pdev))
+   return false;
+
+   kfree(xe->pci_state);
+
+   xe->pci_state = pci_store_saved_state(pdev);
+
+   if (!xe->pci_state) {
+   drm_err(>drm, "Failed to store PCI saved state\n");
+   return false;
+   }
+
+   return true;
+}
+
+void xe_load_pci_state(struct pci_dev *pdev)
+{
+   struct xe_device *xe = pci_get_drvdata(pdev);
+   int ret;
+
+   if (!xe->pci_state)
+   return;
+
+   ret = pci_load_saved_state(pdev, xe->pci_state);
+   if (!ret) {
+   pci_restore_state(pdev);
+   } else {
+   drm_warn(>drm, "Failed to load PCI state err:%d\n", ret);
+   }
+}
+
 /* is device_id present in comma separated list of ids */
 static bool device_id_in_list(u16 device_id, const char *devices, bool 
negative)
 {
@@ -688,10 +723,12 @@ static void xe_pci_remove(struct pci_dev *pdev)
 
xe_device_remove(xe);
xe_pm_runtime_fini(xe);
+
+   kfree(xe->pci_state);
pci_set_drvdata(pdev, NULL);
 }
 
-static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+int