>>
>>>>              } else {
>>>> +                /* If the queue is disabled in the guest, the
>>>> +                 * corresponding qid map shall be set to
>>>> +                 * OVS_VHOST_QUEUE_DISABLED(-2).
>>>> +                 *
>>>> +                 * The packets that were queued in 'qid' could
>>>> +                 * potentially be stuck and need to be dropped.
>>>> +                 *
>>>> +                 * XXX: The queues may already be disabled in the guest,
>>>> +                 * so the flush function in this case only helps in
>>>> +                 * updating stats and freeing memory.
>>>> +                 */
>>>> +                netdev_dpdk_vhost_txq_flush(&dev->up, qid, 0);
>>>>                  dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
>>>>              }
>>>>              netdev_dpdk_remap_txqs(dev);
>>
>> 'netdev_dpdk_remap_txqs()', actually, is able to change the mapping for
>> all of the queues disabled in the guest. So, we need to flush all of them
>> while remapping, somewhere inside that function.
>> One other thing is that there is a race window between the flush and the
>> mapping update, where another process is able to enqueue more packets into
>> the just-flushed queue. The order of operations should be changed, or both
>> of them should be done under the same tx_lock. I think it's required to
>> make the tx_q[].map field atomic to fix the race condition, because the
>> send function takes the 'map' and then locks the corresponding queue.
>> It wasn't an issue before, because in case of a race the packets were just
>> dropped on the attempt to send to a disabled queue, but with this patch
>> applied they will be enqueued to the intermediate queue and get stuck there.
>
>Making 'map' atomic will not help. To solve the race we should make 'reading
>of map + enqueue' an atomic operation using a spinlock.
>Like this:
>
>vhost_send:
>----------------------------------------------------------------
>    qid = qid % netdev->n_txq;
>    rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
>
>    mapped_qid = dev->tx_q[qid].map;
>
>    if (qid != mapped_qid) {
>        rte_spinlock_lock(&dev->tx_q[mapped_qid].tx_lock);
>    }
>
>    tx_enqueue(mapped_qid, pkts, cnt);
>
>    if (qid != mapped_qid) {
>        rte_spinlock_unlock(&dev->tx_q[mapped_qid].tx_lock);
>    }
>
>    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
>----------------------------------------------------------------
>
>txq remapping inside 'netdev_dpdk_remap_txqs()' or
>'vring_state_changed()':
>----------------------------------------------------------------
>    qid - queue we need to remap.
>    new_qid - queue we need to remap to.
>
>    rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
>
>    mapped_qid = dev->tx_q[qid].map;
>    if (qid != mapped_qid) {
>        rte_spinlock_lock(&dev->tx_q[mapped_qid].tx_lock);
>    }
>
>    tx_flush(mapped_qid)
>
>    if (qid != mapped_qid) {
>        rte_spinlock_unlock(&dev->tx_q[mapped_qid].tx_lock);
>    }
>
>    dev->tx_q[qid].map = new_qid;
>
>    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
>----------------------------------------------------------------
>
>The above schema should work without races, but it looks kind of ugly and
>requires taking an additional spinlock on each send.
>
>P.S. Sorry for talking to myself. Just wanted to share my thoughts.

Hi Ilya,

Can you please review the changes below, based on what you suggested above?
As the problem only happens when the queues are enabled/disabled in the guest,
I did some preliminary testing of these changes by sending traffic into the VM
while enabling and disabling the queues inside the guest at the same time.

vhost_send()
---------------------------------------------------------------------------------
    qid = qid % netdev->n_txq;

    /* Acquire tx_lock before reading tx_q[qid].map and enqueueing packets.
     * tx_q[].map gets updated in vring_state_changed() when vrings are
     * enabled/disabled in the guest. */
    rte_spinlock_lock(&dev->tx_q[qid].tx_lock);

    mapped_qid = dev->tx_q[qid].map;
    /* 'map' can be negative (queue unknown or disabled in the guest); only
     * lock a valid mapped queue.  The negative case is dropped just below. */
    if (OVS_UNLIKELY(qid != mapped_qid && mapped_qid >= 0)) {
        rte_spinlock_lock(&dev->tx_q[mapped_qid].tx_lock);
    }

    if (OVS_UNLIKELY(!is_vhost_running(dev) || mapped_qid < 0
                     || !(dev->flags & NETDEV_UP))) {
        rte_spinlock_lock(&dev->stats_lock);
        dev->stats.tx_dropped += cnt;
        rte_spinlock_unlock(&dev->stats_lock);

        for (i = 0; i < total_pkts; i++) {
            dp_packet_delete(pkts[i]);
        }

        if (OVS_UNLIKELY(qid != mapped_qid && mapped_qid >= 0)) {
            rte_spinlock_unlock(&dev->tx_q[mapped_qid].tx_lock);
        }
        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);

        return;
    }

    cnt = netdev_dpdk_filter_packet_len(dev, cur_pkts, cnt);
    /* Check if QoS has been configured for the netdev. */
    cnt = netdev_dpdk_qos_run(dev, cur_pkts, cnt);
    dropped = total_pkts - cnt;

    int idx = 0;
    struct dpdk_tx_queue *txq = &dev->tx_q[mapped_qid];
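
    /* Queue the packets on the per-txq intermediate queue and flush as soon
     * as the burst threshold is reached. */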
    while (idx < cnt) {
        txq->vhost_burst_pkts[txq->vhost_pkt_cnt++] = pkts[idx++];

        if (txq->vhost_pkt_cnt >= INTERIM_QUEUE_BURST_THRESHOLD) {
            dropped += netdev_dpdk_vhost_tx_burst(dev, mapped_qid);
        }
    }

    if (OVS_UNLIKELY(qid != mapped_qid)) {
        rte_spinlock_unlock(&dev->tx_q[mapped_qid].tx_lock);
    }

    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);

    rte_spinlock_lock(&dev->stats_lock);
    netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts, total_pkts,
                                         dropped);
    rte_spinlock_unlock(&dev->stats_lock);
-------------------------------------------------------------------------------------------------------
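
Note: the enqueue loop above relies on netdev_dpdk_vhost_tx_burst() to drain
the intermediate queue, free anything the guest did not accept and reset
vhost_pkt_cnt. Roughly, I assume it behaves like the simplified sketch below
(this is not the exact code from the patch; any retry handling is omitted):

-------------------------------------------------------------------------------------------------------
/* Simplified sketch of the helper the enqueue loop relies on: send the
 * packets accumulated on the intermediate queue, free the leftovers and
 * reset the per-queue counter. */
static int
netdev_dpdk_vhost_tx_burst(struct netdev_dpdk *dev, int qid)
{
    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
    struct rte_mbuf **pkts = (struct rte_mbuf **) txq->vhost_burst_pkts;
    int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
    int total = txq->vhost_pkt_cnt;
    int sent;

    /* Hand the accumulated packets to the guest in one burst. */
    sent = rte_vhost_enqueue_burst(netdev_dpdk_get_vid(dev), vhost_qid,
                                   pkts, total);

    /* Free the packets the guest did not accept; they count as dropped. */
    for (int i = sent; i < total; i++) {
        dp_packet_delete(txq->vhost_burst_pkts[i]);
    }

    txq->vhost_pkt_cnt = 0;

    return total - sent;
}
-------------------------------------------------------------------------------------------------------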


vring_state_changed()

As tx_q[].map should be updated atomically and is updated in both
vring_state_changed() and netdev_dpdk_remap_txqs(), I made the updates
below in vring_state_changed().

------------------------------------------------------------------------------------------------------
    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        ovs_mutex_lock(&dev->mutex);
        if (strncmp(ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
            int mapped_qid;

            /* Acquire tx_lock as the dpdk_vhost_send() function will
             * read the tx_q[qid].map and lock the corresponding queue. */
            rte_spinlock_lock(&dev->tx_q[qid].tx_lock);

            mapped_qid = dev->tx_q[qid].map;
            /* 'map' can be negative if the queue was never mapped or is
             * already disabled; only a valid mapping needs to be locked
             * and flushed. */
            if (OVS_UNLIKELY(qid != mapped_qid && mapped_qid >= 0)) {
                rte_spinlock_lock(&dev->tx_q[mapped_qid].tx_lock);
            }

            if (mapped_qid >= 0) {
                netdev_dpdk_vhost_txq_flush(&dev->up, mapped_qid, 0);
            }

            if (enable) {
                dev->tx_q[qid].map = qid;
            } else {
                dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
            }

            netdev_dpdk_remap_txqs(dev);

            if (OVS_UNLIKELY(qid != mapped_qid && mapped_qid >= 0)) {
                rte_spinlock_unlock(&dev->tx_q[mapped_qid].tx_lock);
            }
            rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);

            exists = true;
            ovs_mutex_unlock(&dev->mutex);
            break;
        }
        ovs_mutex_unlock(&dev->mutex);
    }
----------------------------------------------------------------------------------------------
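
For the other point above (flushing all of the queues that
netdev_dpdk_remap_txqs() remaps, not just 'qid'), the flush could probably go
into the remap loop itself, along the lines of the untested sketch below. It
reuses the names from the existing remap loop (total_txqs, enabled_queues,
n_enabled, k); the locking against concurrent senders, and against the
tx_locks already held in vring_state_changed() above, is intentionally left
out here and would still need the scheme you described:

----------------------------------------------------------------------------------------------
    /* Rough sketch only: in the netdev_dpdk_remap_txqs() loop that reassigns
     * the map of queues disabled in the guest, flush the old mapping before
     * changing it so no packets are left stuck on it. */
    for (i = 0; i < total_txqs; i++) {
        if (dev->tx_q[i].map != i) {
            int old_map = dev->tx_q[i].map;

            if (old_map >= 0) {
                netdev_dpdk_vhost_txq_flush(&dev->up, old_map, 0);
            }

            dev->tx_q[i].map = enabled_queues[k];
            k = (k + 1) % n_enabled;
        }
    }
----------------------------------------------------------------------------------------------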

Regards,
Bhanuprakash.