Author: hselasky
Date: Thu May 16 17:15:41 2019
New Revision: 347803
URL: https://svnweb.freebsd.org/changeset/base/347803

Log:
  MFC r347253:
  Protect from infinite sw-reset loop in mlx5core.
  
  Avoid an infinite software firmware reset loop, which may be caused by a
  hardware bug, by limiting how frequently resets may be performed.
  The timer between resets is restarted by each request for a reset, not by
  a successful reset.
  The minimum interval between two resets, in seconds, can be configured via
  the sysctl hw.mlx5.sw_reset_timeout,
  which is global to all mlx5 devices in the system.
  
  Submitted by: slavash@
  Sponsored by: Mellanox Technologies

Modified:
  stable/11/sys/dev/mlx5/driver.h
  stable/11/sys/dev/mlx5/mlx5_core/mlx5_health.c
Directory Properties:
  stable/11/   (props changed)

Modified: stable/11/sys/dev/mlx5/driver.h
==============================================================================
--- stable/11/sys/dev/mlx5/driver.h     Thu May 16 17:15:00 2019        (r347802)
+++ stable/11/sys/dev/mlx5/driver.h     Thu May 16 17:15:41 2019        (r347803)
@@ -534,6 +534,7 @@ struct mlx5_core_health {
        unsigned long                   flags;
        struct work_struct              work;
        struct delayed_work             recover_work;
+       unsigned int                    last_reset_req;
 };
 
 #define        MLX5_CQ_LINEAR_ARRAY_SIZE       1024

Modified: stable/11/sys/dev/mlx5/mlx5_core/mlx5_health.c
==============================================================================
--- stable/11/sys/dev/mlx5/mlx5_core/mlx5_health.c      Thu May 16 17:15:00 2019        (r347802)
+++ stable/11/sys/dev/mlx5/mlx5_core/mlx5_health.c      Thu May 16 17:15:41 2019        (r347803)
@@ -64,6 +64,12 @@ SYSCTL_INT(_hw_mlx5, OID_AUTO, fw_reset_enable, CTLFLA
     &mlx5_fw_reset_enable, 0,
     "Enable firmware reset");
 
+static unsigned int sw_reset_to = 1200;        /* seconds; default is 20 minutes */
+SYSCTL_UINT(_hw_mlx5, OID_AUTO, sw_reset_timeout, CTLFLAG_RWTUN,
+    &sw_reset_to, 0,
+    "Minimum timeout in seconds between two firmware resets");
+
+
 static int lock_sem_sw_reset(struct mlx5_core_dev *dev)
 {
        int ret;
@@ -218,6 +224,32 @@ static void reset_fw_if_needed(struct mlx5_core_dev *d
                    &dev->iseg->cmdq_addr_l_sz);
 }
 
+static bool
+mlx5_health_allow_reset(struct mlx5_core_dev *dev)
+{
+       struct mlx5_core_health *health = &dev->priv.health;
+       unsigned int delta;
+       bool ret;       /* true when the requested reset may proceed */
+
+       if (health->last_reset_req != 0) {      /* a request was seen before */
+               delta = ticks - health->last_reset_req;
+               delta /= hz;    /* elapsed time, compared in seconds below */
+               ret = delta >= sw_reset_to;
+       } else {
+               ret = true;     /* first request: always allowed */
+       }
+
+       /*
+        * Record this request, allowed or not. 0 is reserved for "no request
+        * seen yet", so if ticks happens to be 0 store -1 (off by one).
+        */
+       health->last_reset_req = ticks ? : -1;
+       if (!ret)
+               mlx5_core_warn(dev, "Firmware reset elided due to "
+                   "auto-reset frequency threshold.\n");
+       return (ret);
+}
+
 #define MLX5_CRDUMP_WAIT_MS    60000
 #define MLX5_FW_RESET_WAIT_MS  1000
 #define MLX5_NIC_STATE_POLL_MS 5
@@ -243,7 +275,8 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev,
        if (force)
                goto err_state_done;
 
-       if (fatal_error == MLX5_SENSOR_FW_SYND_RFR) {
+       if (fatal_error == MLX5_SENSOR_FW_SYND_RFR &&
+           mlx5_health_allow_reset(dev)) {
                /* Get cr-dump and reset FW semaphore */
                if (mlx5_core_is_pf(dev))
                        lock = lock_sem_sw_reset(dev);
_______________________________________________
svn-src-all@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscribe@freebsd.org"

Reply via email to