On Fri, Oct 30, 2015 at 04:35:54PM +1100, Alexey Kardashevskiy wrote:
>On 10/26/2015 02:16 PM, Wei Yang wrote:
>>When PF is EEH aware while VFs are not, VFs will be removed during EEH
>>recovery. This is not supported in current code, while will leads to the VF
>>lost.
>>
>>This patch fixes this by adding VFs back. VFs should be added back after PF
>>get recovered properly.
>>
>>Signed-off-by: Wei Yang <weiy...@linux.vnet.ibm.com>
>>Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
>
>btw please remove my "sob" from this patchset (here and 11/12 at least) as I
>did not "sob" the upstream versions of these and I did not post them and
>there is no public tree of mine with these patches. When I agree that the
>patches are good to go, it will be "reviewed-by" or "acked-by".
>

Sure, I would obey this rule in the future.

>
>>---
>>  arch/powerpc/include/asm/eeh.h   |  6 ++++++
>>  arch/powerpc/kernel/eeh_dev.c    |  1 +
>>  arch/powerpc/kernel/eeh_driver.c | 30 +++++++++++++++++++++++-------
>>  3 files changed, 30 insertions(+), 7 deletions(-)
>>
>>diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
>>index ea1f13c4..c529a23 100644
>>--- a/arch/powerpc/include/asm/eeh.h
>>+++ b/arch/powerpc/include/asm/eeh.h
>>@@ -127,6 +127,11 @@ static inline bool eeh_pe_passed(struct eeh_pe *pe)
>>  #define EEH_DEV_SYSFS               (1 << 9)        /* Sysfs created        
>> */
>>  #define EEH_DEV_REMOVED             (1 << 10)       /* Removed permanently  
>> */
>>
>>+struct eeh_rmv_data {
>>+     struct list_head edev_list;
>>+     int removed;
>>+};
>>+
>>  struct eeh_dev {
>>      int mode;                       /* EEH mode                     */
>>      int class_code;                 /* Class code of the device     */
>>@@ -139,6 +144,7 @@ struct eeh_dev {
>>      int af_cap;                     /* Saved AF capability          */
>>      struct eeh_pe *pe;              /* Associated PE                */
>>      struct list_head list;          /* Form link list in the PE     */
>>+     struct list_head rmv_list;      /* Record the removed edev      */
>>      struct pci_controller *phb;     /* Associated PHB               */
>>      struct pci_dn *pdn;             /* Associated PCI device node   */
>>      struct pci_dev *pdev;           /* Associated PCI device        */
>>diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c
>>index aabba94..7815095 100644
>>--- a/arch/powerpc/kernel/eeh_dev.c
>>+++ b/arch/powerpc/kernel/eeh_dev.c
>>@@ -67,6 +67,7 @@ void *eeh_dev_init(struct pci_dn *pdn, void *data)
>>      edev->pdn = pdn;
>>      edev->phb = phb;
>>      INIT_LIST_HEAD(&edev->list);
>>+     INIT_LIST_HEAD(&edev->rmv_list);
>>
>>      return NULL;
>>  }
>>diff --git a/arch/powerpc/kernel/eeh_driver.c 
>>b/arch/powerpc/kernel/eeh_driver.c
>>index 99868e2..f2406b4 100644
>>--- a/arch/powerpc/kernel/eeh_driver.c
>>+++ b/arch/powerpc/kernel/eeh_driver.c
>>@@ -420,7 +420,8 @@ static void *eeh_rmv_device(void *data, void *userdata)
>>      struct pci_driver *driver;
>>      struct eeh_dev *edev = (struct eeh_dev *)data;
>>      struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
>>-     int *removed = (int *)userdata;
>>+     struct eeh_rmv_data *rmv_data = (struct eeh_rmv_data *)userdata;
>>+     int *removed = rmv_data ? &rmv_data->removed : NULL;
>
>
>You just touched @userdata/@removed in [10/12] and now you are touching it
>again.
>
>It feels like this patch is better to be merged into [10/12], this will
>reduce the noise about the userdata pointer changes passed into
>eeh_pe_dev_traverse() and make more sense as "powerpc/eeh: Support error
>recovery for VF PE" without adding VFs back is pretty useless.
>

Agree, will merge them.

>
>
>
>>      struct pci_dn *pdn = eeh_dev_to_pdn(edev);
>>
>>      /*
>>@@ -467,6 +468,9 @@ static void *eeh_rmv_device(void *data, void *userdata)
>>               * required to plug the VF successfully.
>>               */
>>              pdn->pe_number = IODA_INVALID_PE;
>>+
>>+             if (rmv_data)
>>+                     list_add(&edev->rmv_list, &rmv_data->edev_list);
>>      } else {
>>              pci_lock_rescan_remove();
>>              pci_stop_and_remove_bus_device(dev);
>>@@ -585,11 +589,12 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe)
>>   * During the reset, udev might be invoked because those affected
>>   * PCI devices will be removed and then added.
>>   */
>>-static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
>>+static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
>>+                             struct eeh_rmv_data *rmv_data)
>>  {
>>      struct pci_bus *frozen_bus = eeh_pe_bus_get(pe);
>>      struct timeval tstamp;
>>-     int cnt, rc, removed = 0;
>>+     int cnt, rc;
>>      struct eeh_dev *edev;
>>
>>      /* pcibios will clear the counter; save the value */
>>@@ -612,7 +617,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct 
>>pci_bus *bus)
>>                      pci_unlock_rescan_remove();
>>              }
>>      } else if (frozen_bus)
>>-             eeh_pe_dev_traverse(pe, eeh_rmv_device, &removed);
>>+             eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data);
>>
>>      /*
>>       * Reset the pci controller. (Asserts RST#; resets config space).
>>@@ -659,7 +664,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct 
>>pci_bus *bus)
>>                      eeh_add_virt_device(edev, NULL);
>>              else
>>                      pcibios_add_pci_devices(bus);
>>-     } else if (frozen_bus && removed) {
>>+     } else if (frozen_bus && rmv_data->removed) {
>>              pr_info("EEH: Sleep 5s ahead of partial hotplug\n");
>>              ssleep(5);
>>
>>@@ -687,8 +692,10 @@ static int eeh_reset_device(struct eeh_pe *pe, struct 
>>pci_bus *bus)
>>  static void eeh_handle_normal_event(struct eeh_pe *pe)
>>  {
>>      struct pci_bus *frozen_bus;
>>+     struct eeh_dev *edev, *tmp;
>>      int rc = 0;
>>      enum pci_ers_result result = PCI_ERS_RESULT_NONE;
>>+     struct eeh_rmv_data rmv_data = {LIST_HEAD_INIT(rmv_data.edev_list), 0};
>>
>>      frozen_bus = eeh_pe_bus_get(pe);
>>      if (!frozen_bus) {
>>@@ -735,7 +742,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
>>       */
>>      if (result == PCI_ERS_RESULT_NONE) {
>>              pr_info("EEH: Reset with hotplug activity\n");
>>-             rc = eeh_reset_device(pe, frozen_bus);
>>+             rc = eeh_reset_device(pe, frozen_bus, NULL);
>>              if (rc) {
>>                      pr_warn("%s: Unable to reset, err=%d\n",
>>                              __func__, rc);
>>@@ -787,7 +794,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
>>      /* If any device called out for a reset, then reset the slot */
>>      if (result == PCI_ERS_RESULT_NEED_RESET) {
>>              pr_info("EEH: Reset without hotplug activity\n");
>>-             rc = eeh_reset_device(pe, NULL);
>>+             rc = eeh_reset_device(pe, NULL, &rmv_data);
>>              if (rc) {
>>                      pr_warn("%s: Cannot reset, err=%d\n",
>>                              __func__, rc);
>>@@ -807,6 +814,15 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
>>              goto hard_fail;
>>      }
>>
>>+     /*
>>+      * For those hot removed VFs, we should add back them after PF get
>>+      * recovered properly.
>>+      */
>>+     list_for_each_entry_safe(edev, tmp, &rmv_data.edev_list, rmv_list) {
>>+             eeh_add_virt_device(edev, NULL);
>>+             list_del(&edev->rmv_list);
>>+     }
>>+
>>      /* Tell all device drivers that they can resume operations */
>>      pr_info("EEH: Notify device driver to resume\n");
>>      eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
>>
>
>
>-- 
>Alexey

-- 
Richard Yang
Help you, Help me

_______________________________________________
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Reply via email to