At 2024-10-07 18:20:51, "Kevin Traynor" <[email protected]> wrote:
>If there is a device notification from the vhost-event thread
>that calls destroy_device() and the user deletes the vhost port at
>the same time, there is a possibility of a deadlock occuring. e.g.
>
>vhost-event thread is blocking on the ovsrcu_synchronize(), waiting
>for main thread to queisce, i.e.
>
>0x0000000003690c09 in ovsrcu_synchronize () at lib/ovs-rcu.c:237
>0x00000000037162fa in destroy_device (vid=0) at lib/netdev-dpdk.c:4919
>0x0000000003238a12 in vhost_destroy_device_notify (dev=0x14ba693c0)
>at ../lib/vhost/vhost.c:756
>
>While main thread is looping in rte_vhost_driver_unregister(), waiting
>for vhost-event callback to finish, meaning it cannot quiesce. i.e.:
>
>0x00000000032336e8 in rte_vhost_driver_unregister (path=0xd217a30
>"/tmp/vhost0") at ../lib/vhost/socket.c:1075
>0x000000000370d5f3 in dpdk_vhost_driver_unregister (dev=0x17d6e8b40,
>vhost_id=0xd217a30 "/tmp/vhost0") at lib/netdev-dpdk.c:1811
>0x000000000370d709 in netdev_dpdk_vhost_destruct
>(netdev=0x17d6e8bc0) at lib/netdev-dpdk.c:1843
>
>In order to prevent this situation, use ovsrcu to postpone the call to
>rte_vhost_driver_unregister(). This means that main thread will not
>block and will quiesce, allowing ovsrcu_synchornize() in destroy_device()
>to complete.

>


The problem I encountered is the same as the scenario you described.


>Fixes: afee281f7f4f ("netdev-dpdk: Fix dpdk_watchdog failure to quiesce.")
>Reported-by: Xinxin Zhao <[email protected]>
>Signed-off-by: Kevin Traynor <[email protected]>
>---
> lib/netdev-dpdk.c | 66 +++++++++++++++++++++++++++++++++++------------
> 1 file changed, 50 insertions(+), 16 deletions(-)
>
>diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>index e454a4a5d..ab1cca07c 100644
>--- a/lib/netdev-dpdk.c
>+++ b/lib/netdev-dpdk.c
>@@ -201,4 +201,11 @@ static const struct rte_vhost_device_ops 
>virtio_net_device_ops =
> };
> 
>+/* Information used for postponed vhost driver unregistering. */
>+struct vhost_unregister_info {
>+    char *vhost_id;
>+    char *name;
>+    bool is_server;
>+};
>+
> /* Custom software stats for dpdk ports */
> struct netdev_dpdk_sw_stats {
>@@ -1802,11 +1809,40 @@ netdev_dpdk_destruct(struct netdev *netdev)
>  * try to acquire 'dpdk_mutex' and possibly 'dev->mutex'.  To avoid a
>  * deadlock, none of the mutexes must be held while calling this function. */
>-static int
>-dpdk_vhost_driver_unregister(struct netdev_dpdk *dev OVS_UNUSED,
>-                             char *vhost_id)
>+static void
>+dpdk_vhost_driver_unregister(struct vhost_unregister_info *vhost_info)
>     OVS_EXCLUDED(dpdk_mutex)
>-    OVS_EXCLUDED(dev->mutex)
> {
>-    return rte_vhost_driver_unregister(vhost_id);
>+    char *vhost_id = vhost_info->vhost_id;
>+    char *name = vhost_info->name;
>+
>+    if (rte_vhost_driver_unregister(vhost_id)) {
>+        VLOG_ERR("%s: Unable to unregister vhost driver for socket '%s'.\n",
>+                 name, vhost_id);
>+    } else if (vhost_info->is_server) {
>+        /* OVS server mode - remove this socket from list for deletion */
>+        fatal_signal_remove_file_to_unlink(vhost_id);
>+    }
>+    free(vhost_id);
>+    free(name);
>+    free(vhost_info);
>+}
>+
>+/* Postpone calling rte_vhost_driver_unregister() so it is not called from the
>+ * main thread otherwise it may deadlock with a device notification calling
>+ * destroy_device(). i.e. rte_vhost_driver_unregister() from main thread
>+ * blocks waiting for destroy_device() to complete, destroy_device() blocks
>+ * waiting for main thread to quiesce. */
>+static void
>+dpdk_vhost_driver_unregister_postpone(const char *vhost_id, const char *name,
>+                                      bool is_server)
>+{
>+    struct vhost_unregister_info *vhost_info;
>+
>+    vhost_info = xmalloc(sizeof *vhost_info);
>+    vhost_info->vhost_id = xstrdup(vhost_id);
>+    vhost_info->name = xstrdup(name);
>+    vhost_info->is_server = is_server;
>+
>+    ovsrcu_postpone(dpdk_vhost_driver_unregister, vhost_info);
> }
> 
>@@ -1837,16 +1873,11 @@ netdev_dpdk_vhost_destruct(struct netdev *netdev)
> 
>     if (!vhost_id) {
>-        goto out;
>+        free(vhost_id);
>+        return;
>     }
> 
>-    if (dpdk_vhost_driver_unregister(dev, vhost_id)) {
>-        VLOG_ERR("%s: Unable to unregister vhost driver for socket '%s'.\n",
>-                 netdev->name, vhost_id);
>-    } else if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
>-        /* OVS server mode - remove this socket from list for deletion */
>-        fatal_signal_remove_file_to_unlink(vhost_id);
>-    }
>-out:
>-    free(vhost_id);
>+    dpdk_vhost_driver_unregister_postpone(vhost_id, netdev_get_name(netdev),
>+                                          !(dev->vhost_driver_flags
>+                                          & RTE_VHOST_USER_CLIENT));
> }
> 
>@@ -6285,5 +6316,8 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev 
>*netdev)
> 
>     if (unregister) {
>-        dpdk_vhost_driver_unregister(dev, vhost_id);
>+        dpdk_vhost_driver_unregister_postpone(vhost_id,
>+                                              netdev_get_name(netdev),
>+                                              !(dev->vhost_driver_flags
>+                                              & RTE_VHOST_USER_CLIENT));
>     }
> 
>-- 

>2.46.2


My opinion agrees with Ilya Maximets.
If the rcu thread postpones calling rte_vhost_driver_unregister(), 
if rcu does not call it in time, the main thread may fail to re-register at 
this time.

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to