Am 29.04.25 um 17:52 schrieb Maciej Fijalkowski:
> Currently ixgbe driver checks periodically in its watchdog subtask if
> there is anything to be transmitted (considering both Tx and XDP rings)
> under state of carrier not being 'ok'. Such event is interpreted as Tx
> hang and therefore results in interface reset.
> 
> This is currently problematic for ndo_xdp_xmit() as it is allowed to
> produce descriptors when interface is going through reset or its carrier
> is turned off.
> 
> Furthermore, XDP rings should not really be objects of Tx hang
> detection. This mechanism is rather a matter of ndo_tx_timeout() being
> called from dev_watchdog against Tx rings exposed to networking stack.
> 
> Taking into account issues described above, let us have a twofold fix -
> do not respect XDP rings in local ixgbe watchdog and do not produce Tx
> descriptors in ndo_xdp_xmit callback when there is some problem with
> carrier currently. For now, keep the Tx hang checks in clean Tx irq
> routine, but adjust it to not execute it for XDP rings.
> 
> Cc: Tobias Böhm <[email protected]>
> Reported-by: Marcus Wichelmann <[email protected]>
> Closes: https://lore.kernel.org/netdev/[email protected]/
> Fixes: 6453073987ba ("ixgbe: add initial support for xdp redirect")
> Fixes: 33fdc82f0883 ("ixgbe: add support for XDP_TX action")
> Signed-off-by: Maciej Fijalkowski <[email protected]>
> ---
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 34 ++++++-------------
>  1 file changed, 11 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
> b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> index 467f81239e12..21bfea8aeb67 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> @@ -966,10 +966,6 @@ static void ixgbe_update_xoff_rx_lfc(struct 
> ixgbe_adapter *adapter)
>       for (i = 0; i < adapter->num_tx_queues; i++)
>               clear_bit(__IXGBE_HANG_CHECK_ARMED,
>                         &adapter->tx_ring[i]->state);
> -
> -     for (i = 0; i < adapter->num_xdp_queues; i++)
> -             clear_bit(__IXGBE_HANG_CHECK_ARMED,
> -                       &adapter->xdp_ring[i]->state);
>  }
>  
>  static void ixgbe_update_xoff_received(struct ixgbe_adapter *adapter)
> @@ -1263,10 +1259,13 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector 
> *q_vector,
>                                  total_bytes);
>       adapter->tx_ipsec += total_ipsec;
>  
> +     if (ring_is_xdp(tx_ring))
> +             return !!budget;
> +
>       if (check_for_tx_hang(tx_ring) && ixgbe_check_tx_hang(tx_ring)) {
>               /* schedule immediate reset if we believe we hung */
>               struct ixgbe_hw *hw = &adapter->hw;
> -             e_err(drv, "Detected Tx Unit Hang %s\n"
> +             e_err(drv, "Detected Tx Unit Hang\n"
>                       "  Tx Queue             <%d>\n"
>                       "  TDH, TDT             <%x>, <%x>\n"
>                       "  next_to_use          <%x>\n"
> @@ -1274,16 +1273,14 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector 
> *q_vector,
>                       "tx_buffer_info[next_to_clean]\n"
>                       "  time_stamp           <%lx>\n"
>                       "  jiffies              <%lx>\n",
> -                     ring_is_xdp(tx_ring) ? "(XDP)" : "",
>                       tx_ring->queue_index,
>                       IXGBE_READ_REG(hw, IXGBE_TDH(tx_ring->reg_idx)),
>                       IXGBE_READ_REG(hw, IXGBE_TDT(tx_ring->reg_idx)),
>                       tx_ring->next_to_use, i,
>                       tx_ring->tx_buffer_info[i].time_stamp, jiffies);
>  
> -             if (!ring_is_xdp(tx_ring))
> -                     netif_stop_subqueue(tx_ring->netdev,
> -                                         tx_ring->queue_index);
> +             netif_stop_subqueue(tx_ring->netdev,
> +                                 tx_ring->queue_index);
>  
>               e_info(probe,
>                      "tx hang %d detected on queue %d, resetting adapter\n",
> @@ -1296,9 +1293,6 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector 
> *q_vector,
>               return true;
>       }
>  
> -     if (ring_is_xdp(tx_ring))
> -             return !!budget;
> -
>  #define TX_WAKE_THRESHOLD (DESC_NEEDED * 2)
>       txq = netdev_get_tx_queue(tx_ring->netdev, tx_ring->queue_index);
>       if (!__netif_txq_completed_wake(txq, total_packets, total_bytes,
> @@ -7791,12 +7785,9 @@ static void ixgbe_check_hang_subtask(struct 
> ixgbe_adapter *adapter)
>               return;
>  
>       /* Force detection of hung controller */
> -     if (netif_carrier_ok(adapter->netdev)) {
> +     if (netif_carrier_ok(adapter->netdev))
>               for (i = 0; i < adapter->num_tx_queues; i++)
>                       set_check_for_tx_hang(adapter->tx_ring[i]);
> -             for (i = 0; i < adapter->num_xdp_queues; i++)
> -                     set_check_for_tx_hang(adapter->xdp_ring[i]);
> -     }
>  
>       if (!(adapter->flags & IXGBE_FLAG_MSIX_ENABLED)) {
>               /*
> @@ -8011,13 +8002,6 @@ static bool ixgbe_ring_tx_pending(struct ixgbe_adapter 
> *adapter)
>                       return true;
>       }
>  
> -     for (i = 0; i < adapter->num_xdp_queues; i++) {
> -             struct ixgbe_ring *ring = adapter->xdp_ring[i];
> -
> -             if (ring->next_to_use != ring->next_to_clean)
> -                     return true;
> -     }
> -
>       return false;
>  }
>  
> @@ -10742,6 +10726,10 @@ static int ixgbe_xdp_xmit(struct net_device *dev, 
> int n,
>       if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
>               return -ENETDOWN;
>  
> +     if (!netif_carrier_ok(adapter->netdev) ||
> +         !netif_running(adapter->netdev))
> +             return -ENETDOWN;
> +
>       if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
>               return -EINVAL;
>  

Hi,

thank you very much for this patch.

We have done more tests now in a production-like environment and I can
confirm again that this solves our issue and no more interface resets occur.

Tested-by: Marcus Wichelmann <[email protected]>

Kind regards,
Marcus

Reply via email to