On Thu, Nov 21, 2019 at 10:36:36PM -0300, Julio Faracco wrote:
> The virtio_net driver does not handle the TX error events raised by
> dev_watchdog. Such an event is raised when a transmission queue has
> trouble transmitting packets, which could happen for any reason. To
> enable this handling, the driver must implement .ndo_tx_timeout.
> 
> This commit brings back the virtnet_reset method to recover TX queues from
> an error state. The timeout handler calls schedule_work(), which puts the
> reset function on the work queue.
> 
> As the error cause is unknown at this moment, it would be better to
> reset all queues.
> 
> Signed-off-by: Julio Faracco <[email protected]>
> Signed-off-by: Daiane Mendes <[email protected]>
> Cc: Jason Wang <[email protected]>
> ---
> v1-v2: Tag `net-next` was included to identify where the patch would be
> applied.
> ---
>  drivers/net/virtio_net.c | 95 +++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 94 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 4d7d5434cc5d..31890d77eaf2 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -75,6 +75,7 @@ struct virtnet_sq_stats {
>       u64 xdp_tx;
>       u64 xdp_tx_drops;
>       u64 kicks;
> +     u64 tx_timeouts;
>  };
>  
>  struct virtnet_rq_stats {
> @@ -98,6 +99,7 @@ static const struct virtnet_stat_desc 
> virtnet_sq_stats_desc[] = {
>       { "xdp_tx",             VIRTNET_SQ_STAT(xdp_tx) },
>       { "xdp_tx_drops",       VIRTNET_SQ_STAT(xdp_tx_drops) },
>       { "kicks",              VIRTNET_SQ_STAT(kicks) },
> +     { "tx_timeouts",        VIRTNET_SQ_STAT(tx_timeouts) },
>  };
>  
>  static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> @@ -211,6 +213,9 @@ struct virtnet_info {
>       /* Work struct for config space updates */
>       struct work_struct config_work;
>  
> +     /* Work struct for resetting the virtio-net driver. */
> +     struct work_struct reset_work;
> +
>       /* Does the affinity hint is set for virtqueues? */
>       bool affinity_hint_set;
>  
> @@ -1721,7 +1726,7 @@ static void virtnet_stats(struct net_device *dev,
>       int i;
>  
>       for (i = 0; i < vi->max_queue_pairs; i++) {
> -             u64 tpackets, tbytes, rpackets, rbytes, rdrops;
> +             u64 tpackets, tbytes, terrors, rpackets, rbytes, rdrops;
>               struct receive_queue *rq = &vi->rq[i];
>               struct send_queue *sq = &vi->sq[i];
>  
> @@ -1729,6 +1734,7 @@ static void virtnet_stats(struct net_device *dev,
>                       start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
>                       tpackets = sq->stats.packets;
>                       tbytes   = sq->stats.bytes;
> +                     terrors  = sq->stats.tx_timeouts;
>               } while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));
>  
>               do {
> @@ -1743,6 +1749,7 @@ static void virtnet_stats(struct net_device *dev,
>               tot->rx_bytes   += rbytes;
>               tot->tx_bytes   += tbytes;
>               tot->rx_dropped += rdrops;
> +             tot->tx_errors  += terrors;
>       }
>  
>       tot->tx_dropped = dev->stats.tx_dropped;
> @@ -2578,6 +2585,33 @@ static int virtnet_set_features(struct net_device *dev,
>       return 0;
>  }
>  
> +static void virtnet_tx_timeout(struct net_device *dev)
> +{
> +     struct virtnet_info *vi = netdev_priv(dev);
> +     u32 i;
> +
> +     netdev_warn(dev, "TX timeout stats:\n");
> +     /* find the stopped queue the same way dev_watchdog() does */
> +     for (i = 0; i < vi->curr_queue_pairs; i++) {
> +             struct send_queue *sq = &vi->sq[i];
> +
> +             if (!netif_xmit_stopped(netdev_get_tx_queue(dev, i))) {
> +                     netdev_warn(dev, " Available send queue: %d, sq: %s, 
> vq: %d, name: %s\n",
> +                                 i, sq->name, sq->vq->index, sq->vq->name);

What does this mean?

> +                     continue;
> +             }
> +
> +             u64_stats_update_begin(&sq->stats.syncp);
> +             sq->stats.tx_timeouts++;
> +             u64_stats_update_end(&sq->stats.syncp);
> +
> +             netdev_warn(dev, " Unavailable send queue: %d, sq: %s, vq: %d, 
> name: %s\n",
> +                         i, sq->name, sq->vq->index, sq->vq->name);
> +     }

Can we make the warning less cryptic?
I wonder why we don't get the sq from the timeout directly?
That would seem cleaner.

> +
> +     schedule_work(&vi->reset_work);
> +}
> +
>  static const struct net_device_ops virtnet_netdev = {
>       .ndo_open            = virtnet_open,
>       .ndo_stop            = virtnet_close,
> @@ -2593,6 +2627,7 @@ static const struct net_device_ops virtnet_netdev = {
>       .ndo_features_check     = passthru_features_check,
>       .ndo_get_phys_port_name = virtnet_get_phys_port_name,
>       .ndo_set_features       = virtnet_set_features,
> +     .ndo_tx_timeout         = virtnet_tx_timeout,
>  };
>  
>  static void virtnet_config_changed_work(struct work_struct *work)
> @@ -2982,6 +3017,62 @@ static int virtnet_validate(struct virtio_device *vdev)
>       return 0;
>  }
>  
> +static void _remove_vq_common(struct virtnet_info *vi)
> +{
> +     vi->vdev->config->reset(vi->vdev);
> +
> +     /* Free unused buffers in both send and recv, if any. */
> +     free_unused_bufs(vi);
> +
> +     _free_receive_bufs(vi);
> +
> +     free_receive_page_frags(vi);
> +
> +     virtnet_del_vqs(vi);
> +}
> +
> +static int _virtnet_reset(struct virtnet_info *vi)
> +{
> +     struct virtio_device *vdev = vi->vdev;
> +     int ret;
> +
> +     virtio_config_disable(vdev);
> +     vdev->failed = vdev->config->get_status(vdev) & VIRTIO_CONFIG_S_FAILED;
> +
> +     virtnet_freeze_down(vdev);
> +     _remove_vq_common(vi);
> +
> +     virtio_add_status(vdev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
> +     virtio_add_status(vdev, VIRTIO_CONFIG_S_DRIVER);
> +
> +     ret = virtio_finalize_features(vdev);
> +     if (ret)
> +             goto err;
> +
> +     ret = virtnet_restore_up(vdev);
> +     if (ret)
> +             goto err;
> +
> +     ret = _virtnet_set_queues(vi, vi->curr_queue_pairs);
> +     if (ret)
> +             goto err;
> +
> +     virtio_add_status(vdev, VIRTIO_CONFIG_S_DRIVER_OK);
> +     virtio_config_enable(vdev);


Is this enough? E.g. all RX mode programming has been lost.



> +     return 0;
> +err:
> +     virtio_add_status(vdev, VIRTIO_CONFIG_S_FAILED);
> +     return ret;
> +}
> +
> +static void virtnet_reset(struct work_struct *work)
> +{
> +     struct virtnet_info *vi =
> +             container_of(work, struct virtnet_info, reset_work);
> +
> +     _virtnet_reset(vi);
> +}
> +
>  static int virtnet_probe(struct virtio_device *vdev)
>  {
>       int i, err = -ENOMEM;
> @@ -3011,6 +3102,7 @@ static int virtnet_probe(struct virtio_device *vdev)
>       dev->netdev_ops = &virtnet_netdev;
>       dev->features = NETIF_F_HIGHDMA;
>  
> +     dev->watchdog_timeo = 5 * HZ;
>       dev->ethtool_ops = &virtnet_ethtool_ops;
>       SET_NETDEV_DEV(dev, &vdev->dev);
>

Is there a way to make this tunable from ethtool?
  
> @@ -3068,6 +3160,7 @@ static int virtnet_probe(struct virtio_device *vdev)
>       vdev->priv = vi;
>  
>       INIT_WORK(&vi->config_work, virtnet_config_changed_work);
> +     INIT_WORK(&vi->reset_work, virtnet_reset);
>  
>       /* If we can receive ANY GSO packets, we must allocate large ones. */
>       if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
> -- 
> 2.17.1

_______________________________________________
Virtualization mailing list
[email protected]
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Reply via email to