On 10/7/24 12:20, Kevin Traynor wrote:
> If there is a device notification from the vhost-event thread
> that calls destroy_device() and the user deletes the vhost port at
> the same time, there is a possibility of a deadlock occuring. e.g.
> 
> vhost-event thread is blocking on the ovsrcu_synchronize(), waiting
> for main thread to queisce, i.e.
> 
> 0x0000000003690c09 in ovsrcu_synchronize () at lib/ovs-rcu.c:237
> 0x00000000037162fa in destroy_device (vid=0) at lib/netdev-dpdk.c:4919
> 0x0000000003238a12 in vhost_destroy_device_notify (dev=0x14ba693c0)
> at ../lib/vhost/vhost.c:756
> 
> While main thread is looping in rte_vhost_driver_unregister(), waiting
> for vhost-event callback to finish, meaning it cannot quiesce. i.e.:
> 
> 0x00000000032336e8 in rte_vhost_driver_unregister (path=0xd217a30
> "/tmp/vhost0") at ../lib/vhost/socket.c:1075
> 0x000000000370d5f3 in dpdk_vhost_driver_unregister (dev=0x17d6e8b40,
> vhost_id=0xd217a30 "/tmp/vhost0") at lib/netdev-dpdk.c:1811
> 0x000000000370d709 in netdev_dpdk_vhost_destruct
> (netdev=0x17d6e8bc0) at lib/netdev-dpdk.c:1843
> 
> In order to prevent this situation, use ovsrcu to postpone the call to
> rte_vhost_driver_unregister(). This means that main thread will not
> block and will quiesce, allowing ovsrcu_synchornize() in destroy_device()
> to complete.
> 
> Fixes: afee281f7f4f ("netdev-dpdk: Fix dpdk_watchdog failure to quiesce.")
> Reported-by: Xinxin Zhao <[email protected]>
> Signed-off-by: Kevin Traynor <[email protected]>
> ---
>  lib/netdev-dpdk.c | 66 +++++++++++++++++++++++++++++++++++------------
>  1 file changed, 50 insertions(+), 16 deletions(-)
> 
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index e454a4a5d..ab1cca07c 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -201,4 +201,11 @@ static const struct rte_vhost_device_ops 
> virtio_net_device_ops =
>  };
>  
> +/* Information used for postponed vhost driver unregistering. */
> +struct vhost_unregister_info {
> +    char *vhost_id;
> +    char *name;
> +    bool is_server;
> +};
> +
>  /* Custom software stats for dpdk ports */
>  struct netdev_dpdk_sw_stats {
> @@ -1802,11 +1809,40 @@ netdev_dpdk_destruct(struct netdev *netdev)
>   * try to acquire 'dpdk_mutex' and possibly 'dev->mutex'.  To avoid a
>   * deadlock, none of the mutexes must be held while calling this function. */
> -static int
> -dpdk_vhost_driver_unregister(struct netdev_dpdk *dev OVS_UNUSED,
> -                             char *vhost_id)
> +static void
> +dpdk_vhost_driver_unregister(struct vhost_unregister_info *vhost_info)
>      OVS_EXCLUDED(dpdk_mutex)
> -    OVS_EXCLUDED(dev->mutex)
>  {
> -    return rte_vhost_driver_unregister(vhost_id);
> +    char *vhost_id = vhost_info->vhost_id;
> +    char *name = vhost_info->name;
> +
> +    if (rte_vhost_driver_unregister(vhost_id)) {
> +        VLOG_ERR("%s: Unable to unregister vhost driver for socket '%s'.\n",
> +                 name, vhost_id);
> +    } else if (vhost_info->is_server) {
> +        /* OVS server mode - remove this socket from list for deletion */
> +        fatal_signal_remove_file_to_unlink(vhost_id);
> +    }
> +    free(vhost_id);
> +    free(name);
> +    free(vhost_info);
> +}
> +
> +/* Postpone calling rte_vhost_driver_unregister() so it is not called from 
> the
> + * main thread otherwise it may deadlock with a device notification calling
> + * destroy_device(). i.e. rte_vhost_driver_unregister() from main thread
> + * blocks waiting for destroy_device() to complete, destroy_device() blocks
> + * waiting for main thread to quiesce. */
> +static void
> +dpdk_vhost_driver_unregister_postpone(const char *vhost_id, const char *name,
> +                                      bool is_server)
> +{
> +    struct vhost_unregister_info *vhost_info;
> +
> +    vhost_info = xmalloc(sizeof *vhost_info);
> +    vhost_info->vhost_id = xstrdup(vhost_id);
> +    vhost_info->name = xstrdup(name);
> +    vhost_info->is_server = is_server;
> +
> +    ovsrcu_postpone(dpdk_vhost_driver_unregister, vhost_info);
>  }
>  
> @@ -1837,16 +1873,11 @@ netdev_dpdk_vhost_destruct(struct netdev *netdev)
>  
>      if (!vhost_id) {
> -        goto out;
> +        free(vhost_id);
> +        return;
>      }
>  
> -    if (dpdk_vhost_driver_unregister(dev, vhost_id)) {
> -        VLOG_ERR("%s: Unable to unregister vhost driver for socket '%s'.\n",
> -                 netdev->name, vhost_id);
> -    } else if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
> -        /* OVS server mode - remove this socket from list for deletion */
> -        fatal_signal_remove_file_to_unlink(vhost_id);
> -    }
> -out:
> -    free(vhost_id);
> +    dpdk_vhost_driver_unregister_postpone(vhost_id, netdev_get_name(netdev),
> +                                          !(dev->vhost_driver_flags
> +                                          & RTE_VHOST_USER_CLIENT));

What happens if the netdev is closed and then opened back while the RCU
handler didn't fire yet?  I'm guessing we'll fail to register this id
and the device opening may fail.

>  }
>  
> @@ -6285,5 +6316,8 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev 
> *netdev)
>  
>      if (unregister) {
> -        dpdk_vhost_driver_unregister(dev, vhost_id);
> +        dpdk_vhost_driver_unregister_postpone(vhost_id,
> +                                              netdev_get_name(netdev),
> +                                              !(dev->vhost_driver_flags
> +                                              & RTE_VHOST_USER_CLIENT));
>      }
>  


Hmm.  This doesn't look right.  If we postpone unregistering here, then
we'll try to register an already registered id a few lines below.  And
that will likely fail (?).  And then postponed operation will unregister
the device.  So, IIUC, re-cofiguration for the purpose of a feature set
change will break the netdev.  Or am I missing something?

Best regards, Ilya Maximets.
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to