Author: hselasky
Date: Thu Mar  8 09:58:41 2018
New Revision: 330646
URL: https://svnweb.freebsd.org/changeset/base/330646

Log:
  Fix race between PCI error handlers and health work in mlx5core.
  
  linux commit 05ac2c0b7438ea08c5d54b48797acf9b22cb2f6f
  
  Submitted by: Matthew Finlay <m...@mellanox.com>
  MFC after:    1 week
  Sponsored by: Mellanox Technologies

Modified:
  head/sys/dev/mlx5/driver.h
  head/sys/dev/mlx5/mlx5_core/mlx5_health.c
  head/sys/dev/mlx5/mlx5_core/mlx5_main.c

Modified: head/sys/dev/mlx5/driver.h
==============================================================================
--- head/sys/dev/mlx5/driver.h  Thu Mar  8 09:51:33 2018        (r330645)
+++ head/sys/dev/mlx5/driver.h  Thu Mar  8 09:58:41 2018        (r330646)
@@ -482,7 +482,10 @@ struct mlx5_core_health {
        u32                             prev;
        int                             miss_counter;
        bool                            sick;
+       /* wq spinlock to synchronize draining */
+       spinlock_t                      wq_lock;
        struct workqueue_struct        *wq;
+       unsigned long                   flags;
        struct work_struct              work;
 };
 
@@ -872,6 +875,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev);
 int mlx5_health_init(struct mlx5_core_dev *dev);
 void mlx5_start_health_poll(struct mlx5_core_dev *dev);
 void mlx5_stop_health_poll(struct mlx5_core_dev *dev);
+void mlx5_drain_health_wq(struct mlx5_core_dev *dev);
 
 #define        mlx5_buf_alloc_node(dev, size, direct, buf, node) \
        mlx5_buf_alloc(dev, size, direct, buf)

Modified: head/sys/dev/mlx5/mlx5_core/mlx5_health.c
==============================================================================
--- head/sys/dev/mlx5/mlx5_core/mlx5_health.c   Thu Mar  8 09:51:33 2018        (r330645)
+++ head/sys/dev/mlx5/mlx5_core/mlx5_health.c   Thu Mar  8 09:58:41 2018        (r330646)
@@ -43,6 +43,10 @@ enum {
        MLX5_NIC_IFC_NO_DRAM_NIC        = 2
 };
 
+enum {
+       MLX5_DROP_NEW_HEALTH_WORK,
+};
+
 static u8 get_nic_interface(struct mlx5_core_dev *dev)
 {
        return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3;
@@ -242,7 +246,13 @@ static void poll_health(unsigned long data)
        if (in_fatal(dev) && !health->sick) {
                health->sick = true;
                print_health_info(dev);
-               queue_work(health->wq, &health->work);
+               spin_lock(&health->wq_lock);
+               if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
+                       queue_work(health->wq, &health->work);
+               else
+                       dev_err(&dev->pdev->dev,
+                               "new health works are not permitted at this stage\n");
+               spin_unlock(&health->wq_lock);
        }
 }
 
@@ -252,6 +262,7 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)
 
        init_timer(&health->timer);
        health->sick = 0;
+       clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
        health->health = &dev->iseg->health;
        health->health_counter = &dev->iseg->health_counter;
 
@@ -267,6 +278,16 @@ void mlx5_stop_health_poll(struct mlx5_core_dev *dev)
        del_timer_sync(&health->timer);
 }
 
+void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
+{
+       struct mlx5_core_health *health = &dev->priv.health;
+
+       spin_lock(&health->wq_lock);
+       set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
+       spin_unlock(&health->wq_lock);
+       cancel_work_sync(&health->work);
+}
+
 void mlx5_health_cleanup(struct mlx5_core_dev *dev)
 {
        struct mlx5_core_health *health = &dev->priv.health;
@@ -293,6 +314,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
        if (!health->wq)
                return -ENOMEM;
 
+       spin_lock_init(&health->wq_lock);
        INIT_WORK(&health->work, health_care);
 
        return 0;

Modified: head/sys/dev/mlx5/mlx5_core/mlx5_main.c
==============================================================================
--- head/sys/dev/mlx5/mlx5_core/mlx5_main.c     Thu Mar  8 09:51:33 2018        (r330645)
+++ head/sys/dev/mlx5/mlx5_core/mlx5_main.c     Thu Mar  8 09:58:41 2018        (r330646)
@@ -1243,7 +1243,12 @@ static pci_ers_result_t mlx5_pci_err_detected(struct p
        dev_info(&pdev->dev, "%s was called\n", __func__);
        mlx5_enter_error_state(dev);
        mlx5_unload_one(dev, priv);
-       mlx5_pci_disable_device(dev);
+       if (state) {
+               pci_save_state(pdev->dev.bsddev);
+               mlx5_drain_health_wq(dev);
+               mlx5_pci_disable_device(dev);
+       }
+
        return state == pci_channel_io_perm_failure ?
                PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
 }
_______________________________________________
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to