Re: [ovs-dev] [PATCH v4 2/5] netdev-dpdk: Add netdev_dpdk_vhost_txq_flush function.

2017-08-11 Thread Ilya Maximets
On 11.08.2017 16:11, Bodireddy, Bhanuprakash wrote:
>> On 09.08.2017 15:35, Bodireddy, Bhanuprakash wrote:
>
> +static int
> +netdev_dpdk_vhost_tx_burst(struct netdev_dpdk *dev, int qid) {
> +struct dpdk_tx_queue *txq = &dev->tx_q[qid];
> +struct rte_mbuf **cur_pkts = (struct rte_mbuf
> +**)txq->vhost_burst_pkts;
> +
> +int tx_vid = netdev_dpdk_get_vid(dev);
> +int tx_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
> +uint32_t sent = 0;
> +uint32_t retries = 0;
> +uint32_t sum, total_pkts;
> +
> +total_pkts = sum = txq->vhost_pkt_cnt;
> +do {
> +uint32_t ret;
> +ret = rte_vhost_enqueue_burst(tx_vid, tx_qid,
> + &cur_pkts[sent],
 sum);
> +if (OVS_UNLIKELY(!ret)) {
> +/* No packets enqueued - do not retry. */
> +break;
> +} else {
> +/* Packet have been sent. */
> +sent += ret;
> +
> +/* 'sum' packet have to be retransmitted. */
> +sum -= ret;
> +}
> +} while (sum && (retries++ < VHOST_ENQ_RETRY_NUM));
> +
> +for (int i = 0; i < total_pkts; i++) {
> +dp_packet_delete(txq->vhost_burst_pkts[i]);
> +}
> +
> +/* Reset pkt count. */
> +txq->vhost_pkt_cnt = 0;
> +
> +/* 'sum' refers to packets dropped. */
> +return sum;
> +}
> +
> +/* Flush the txq if there are any packets available. */ static int
> +netdev_dpdk_vhost_txq_flush(struct netdev *netdev, int qid,
> +bool concurrent_txq OVS_UNUSED) {
> +struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +struct dpdk_tx_queue *txq;
> +
> +qid = dev->tx_q[qid % netdev->n_txq].map;
> +
> +/* The qid may be disabled in the guest and has been set to
> + * OVS_VHOST_QUEUE_DISABLED.
> + */
> +if (OVS_UNLIKELY(qid < 0)) {
> +return 0;
> +}
> +
> +txq = &dev->tx_q[qid];
> +/* Increment the drop count and free the memory. */
> +if (OVS_UNLIKELY(!is_vhost_running(dev) ||
> + !(dev->flags & NETDEV_UP))) {
> +
> +if (txq->vhost_pkt_cnt) {
> +rte_spinlock_lock(&dev->stats_lock);
> +dev->stats.tx_dropped+= txq->vhost_pkt_cnt;
> +rte_spinlock_unlock(&dev->stats_lock);
> +
> +for (int i = 0; i < txq->vhost_pkt_cnt; i++) {
> +dp_packet_delete(txq->vhost_burst_pkts[i]);

 Spinlock (tx_lock) must be held here to avoid queue and mempool
>> breakage.
>>>
>>> I think you are right. tx_lock might be acquired for freeing the packets.
>>
>> I think that 'vhost_pkt_cnt' reads and updates also should be protected to
>> avoid races.
> 
> From the discussion in the thread 
> https://mail.openvswitch.org/pipermail/ovs-dev/2017-August/337133.html,
> We are going to acquire tx_lock for updating the map and flushing the queue 
> inside vring_state_changed(). 
> 
> That triggers a deadlock in the flushing function as we have already 
> acquired the same lock in netdev_dpdk_vhost_txq_flush().
> This is the same problem for freeing the memory and protecting the updates to 
> vhost_pkt_cnt.
> 
>if (OVS_LIKELY(txq->vhost_pkt_cnt)) {
>  rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> netdev_dpdk_vhost_tx_burst(dev, qid);
> rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
>}
> 
> As the problem is triggered when the guest queues are enabled/disabled, with 
> a small race window where packets can get enqueued in to the queue just after 
> the flush and before map value is updated in cb 
> function(vring_state_changed()), how about this?
> 
> Technically as the queues are disabled, there is no point in flushing the 
> packets, so let's free the packets and set the txq->vhost_pkt_cnt in 
> vring_state_changed() itself instead of calling flush().

Technically, enabling case also should be handled, because while enabling
we're remapping the queue and, in some specific cases, I guess, the old
queue may not be used after remapping by the threads.

> 
> vring_state_changed().
> --
> rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> 
> mapped_qid = dev->tx_q[qid].map;
>  if (OVS_UNLIKELY(qid != mapped_qid)) {
> rte_spinlock_lock(&dev->tx_q[mapped_qid].tx_lock);
> }
> 
> if (enable) {
> dev->tx_q[qid].map = qid;
>   } else {
> struct dpdk_tx_queue *txq = &dev->tx_q[qid];
> if (txq->vhost_pkt_cnt) {
> rte_spinlock_lock(&dev->stats_lock);
> dev->stats.tx_dropped+= txq->vhost_pkt_cnt;
> rte_spinlock_unlock(&dev->stats_lock);
> 
> for (int i = 0; i < txq->vhost_pkt_cnt; i++) {
>  

Re: [ovs-dev] [PATCH v4 2/5] netdev-dpdk: Add netdev_dpdk_vhost_txq_flush function.

2017-08-11 Thread Ilya Maximets
On 10.08.2017 21:52, Bodireddy, Bhanuprakash wrote:
>>>
>  } else {
> +/* If the queue is disabled in the guest, the 
> corresponding qid
> + * map shall be set to OVS_VHOST_QUEUE_DISABLED(-2).
> + *
> + * The packets that were queued in 'qid' could be 
> potentially
> + * stuck and needs to be dropped.
> + *
> + * XXX: The queues may be already disabled in the guest 
> so
> + * flush function in this case only helps in updating 
> stats
> + * and freeing memory.
> + */
> +netdev_dpdk_vhost_txq_flush(&dev->up, qid, 0);
>  dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
>  }
>  netdev_dpdk_remap_txqs(dev);
>>>
>>> 'netdev_dpdk_remap_txqs()', actually, is able to change mapping for
>>> all the disabled in guest queues. So, we need to flush all of them
>>> while remapping somewhere inside the function.
>>> One other thing is that there is a race window between flush and
>>> mapping update where another process able to enqueue more packets in
>>> just flushed queue. The order of operations should be changed, or both
>>> of them should be done under the same tx_lock. I think, it's required
>>> to make tx_q[].map field atomic to fix the race condition, because
>>> send function takes the 'map' and then locks the corresponding queue.
>>> It wasn't an issue before, because packets in case of race was just
>>> dropped on attempt to send to disabled queue, but with this patch
>>> applied they will be enqueued to the intermediate queue and stuck there.
>>
>> Making 'map' atomic will not help. To solve the race we should make 'reading
>> of map + enqueue' an atomic operation by some spinlock.
>> Like this:
>>
>> vhost_send:
>> 
>>qid = qid % netdev->n_txq;
>>rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
>>
>>mapped_qid = dev->tx_q[qid].map;
>>
>>if (qid != mapped_qid) {
>>rte_spinlock_lock(&dev->tx_q[mapped_qid].tx_lock);
>>}
>>
>>tx_enqueue(mapped_qid, pkts, cnt);
>>
>>if (qid != mapped_qid) {
>>rte_spinlock_unlock(&dev->tx_q[mapped_qid].tx_lock);
>>}
>>
>>rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
>> 
>>
>> txq remapping inside 'netdev_dpdk_remap_txqs()' or
>> 'vring_state_changed()':
>> 
>>qid - queue we need to remap.
>>new_qid - queue we need to remap to.
>>
>>rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
>>
>>mapped_qid = dev->tx_q[qid].map;
>>if (qid != mapped_qid) {
>>rte_spinlock_lock(&dev->tx_q[mapped_qid].tx_lock);
>>}
>>
>>tx_flush(mapped_qid)
>>
>>if (qid != mapped_qid) {
>>rte_spinlock_unlock(&dev->tx_q[mapped_qid].tx_lock);
>>}
>>
>>dev->tx_q[qid].map = new_qid;
>>
>>rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
>> 
>>
>> Above schema should work without races, but looks kind of ugly and requires
>> taking of additional spinlock on each send.
>>
>> P.S. Sorry for talking with myself. Just want to share my thoughts.
> 
> Hi Ilya,
> 
> Can you please review the below changes based on what you suggested above. 
> As the problem only happens when the queues are enabled/disabled in the 
> guest, 
> I did some  preliminary testing with the below changes by sending some 
> traffic in to the VM
> and enabling and disabling the queues inside the guest at the same time. 
> 
> Vhost_send()
> -
> qid = qid % netdev->n_txq;
> 
> /* Acquire tx_lock before reading tx_q[qid].map and enqueueing packets.
>  * tx_q[].map gets updated in vring_state_changed() when vrings are
>  * enabled/disabled in the guest. */
> rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> 
> mapped_qid = dev->tx_q[qid].map;
> if (OVS_UNLIKELY(qid != mapped_qid)) {
> rte_spinlock_lock(&dev->tx_q[mapped_qid].tx_lock);
> }
> 
> if (OVS_UNLIKELY(!is_vhost_running(dev) || mapped_qid < 0
>  || !(dev->flags & NETDEV_UP))) {
> rte_spinlock_lock(&dev->stats_lock);
> dev->stats.tx_dropped+= cnt;
> rte_spinlock_unlock(&dev->stats_lock);
> 
> for (i = 0; i < total_pkts; i++) {
> dp_packet_delete(pkts[i]);
> }
> 
> if (OVS_UNLIKELY(qid != mapped_qid)) {
> rte_spinlock_unlock(&dev->tx_q[mapped_qid].tx_lock);
> }
> rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
> 
> return;
> }
> 
> cnt = netdev_dpdk_filter_packet_len(dev, cur_pkts, cnt);
> /* Check has QoS has been

Re: [ovs-dev] [PATCH v4 2/5] netdev-dpdk: Add netdev_dpdk_vhost_txq_flush function.

2017-08-11 Thread Bodireddy, Bhanuprakash
>On 09.08.2017 15:35, Bodireddy, Bhanuprakash wrote:

 +static int
 +netdev_dpdk_vhost_tx_burst(struct netdev_dpdk *dev, int qid) {
 +struct dpdk_tx_queue *txq = &dev->tx_q[qid];
 +struct rte_mbuf **cur_pkts = (struct rte_mbuf
 +**)txq->vhost_burst_pkts;
 +
 +int tx_vid = netdev_dpdk_get_vid(dev);
 +int tx_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
 +uint32_t sent = 0;
 +uint32_t retries = 0;
 +uint32_t sum, total_pkts;
 +
 +total_pkts = sum = txq->vhost_pkt_cnt;
 +do {
 +uint32_t ret;
 +ret = rte_vhost_enqueue_burst(tx_vid, tx_qid,
 + &cur_pkts[sent],
>>> sum);
 +if (OVS_UNLIKELY(!ret)) {
 +/* No packets enqueued - do not retry. */
 +break;
 +} else {
 +/* Packet have been sent. */
 +sent += ret;
 +
 +/* 'sum' packet have to be retransmitted. */
 +sum -= ret;
 +}
 +} while (sum && (retries++ < VHOST_ENQ_RETRY_NUM));
 +
 +for (int i = 0; i < total_pkts; i++) {
 +dp_packet_delete(txq->vhost_burst_pkts[i]);
 +}
 +
 +/* Reset pkt count. */
 +txq->vhost_pkt_cnt = 0;
 +
 +/* 'sum' refers to packets dropped. */
 +return sum;
 +}
 +
 +/* Flush the txq if there are any packets available. */ static int
 +netdev_dpdk_vhost_txq_flush(struct netdev *netdev, int qid,
 +bool concurrent_txq OVS_UNUSED) {
 +struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
 +struct dpdk_tx_queue *txq;
 +
 +qid = dev->tx_q[qid % netdev->n_txq].map;
 +
 +/* The qid may be disabled in the guest and has been set to
 + * OVS_VHOST_QUEUE_DISABLED.
 + */
 +if (OVS_UNLIKELY(qid < 0)) {
 +return 0;
 +}
 +
 +txq = &dev->tx_q[qid];
 +/* Increment the drop count and free the memory. */
 +if (OVS_UNLIKELY(!is_vhost_running(dev) ||
 + !(dev->flags & NETDEV_UP))) {
 +
 +if (txq->vhost_pkt_cnt) {
 +rte_spinlock_lock(&dev->stats_lock);
 +dev->stats.tx_dropped+= txq->vhost_pkt_cnt;
 +rte_spinlock_unlock(&dev->stats_lock);
 +
 +for (int i = 0; i < txq->vhost_pkt_cnt; i++) {
 +dp_packet_delete(txq->vhost_burst_pkts[i]);
>>>
>>> Spinlock (tx_lock) must be held here to avoid queue and mempool
>breakage.
>>
>> I think you are right. tx_lock might be acquired for freeing the packets.
>
>I think that 'vhost_pkt_cnt' reads and updates also should be protected to
>avoid races.

From the discussion in the thread 
>https://mail.openvswitch.org/pipermail/ovs-dev/2017-August/337133.html,
We are going to acquire tx_lock for updating the map and flushing the queue 
inside vring_state_changed(). 

That triggers a deadlock in the flushing function as we have already acquired 
the same lock in netdev_dpdk_vhost_txq_flush().
This is the same problem for freeing the memory and protecting the updates to 
vhost_pkt_cnt.

   if (OVS_LIKELY(txq->vhost_pkt_cnt)) {
 rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
netdev_dpdk_vhost_tx_burst(dev, qid);
rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
   }

As the problem is triggered when the guest queues are enabled/disabled, with a 
small race window where packets can get enqueued in to the queue just after the 
flush and before map value is updated in cb function(vring_state_changed()), 
how about this?

Technically as the queues are disabled, there is no point in flushing the 
packets, so let's free the packets and set the txq->vhost_pkt_cnt in 
vring_state_changed() itself instead of calling flush().

vring_state_changed().
--
rte_spinlock_lock(&dev->tx_q[qid].tx_lock);

mapped_qid = dev->tx_q[qid].map;
 if (OVS_UNLIKELY(qid != mapped_qid)) {
rte_spinlock_lock(&dev->tx_q[mapped_qid].tx_lock);
}

if (enable) {
dev->tx_q[qid].map = qid;
  } else {
struct dpdk_tx_queue *txq = &dev->tx_q[qid];
if (txq->vhost_pkt_cnt) {
rte_spinlock_lock(&dev->stats_lock);
dev->stats.tx_dropped+= txq->vhost_pkt_cnt;
rte_spinlock_unlock(&dev->stats_lock);

for (int i = 0; i < txq->vhost_pkt_cnt; i++) {
dp_packet_delete(txq->vhost_burst_pkts[i]);
}
txq->vhost_pkt_cnt = 0;
}

dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
  }
-

Regards,
Bhanuprakash.

>
>> ---
>> rte_spinlock_lock(&dev->tx_q[qid].tx

Re: [ovs-dev] [PATCH v4 2/5] netdev-dpdk: Add netdev_dpdk_vhost_txq_flush function.

2017-08-10 Thread Bodireddy, Bhanuprakash
>>
  } else {
 +/* If the queue is disabled in the guest, the 
 corresponding qid
 + * map shall be set to OVS_VHOST_QUEUE_DISABLED(-2).
 + *
 + * The packets that were queued in 'qid' could be 
 potentially
 + * stuck and needs to be dropped.
 + *
 + * XXX: The queues may be already disabled in the guest so
 + * flush function in this case only helps in updating 
 stats
 + * and freeing memory.
 + */
 +netdev_dpdk_vhost_txq_flush(&dev->up, qid, 0);
  dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
  }
  netdev_dpdk_remap_txqs(dev);
>>
>> 'netdev_dpdk_remap_txqs()', actually, is able to change mapping for
>> all the disabled in guest queues. So, we need to flush all of them
>> while remapping somewhere inside the function.
>> One other thing is that there is a race window between flush and
>> mapping update where another process able to enqueue more packets in
>> just flushed queue. The order of operations should be changed, or both
>> of them should be done under the same tx_lock. I think, it's required
>> to make tx_q[].map field atomic to fix the race condition, because
>> send function takes the 'map' and then locks the corresponding queue.
>> It wasn't an issue before, because packets in case of race was just
>> dropped on attempt to send to disabled queue, but with this patch
>> applied they will be enqueued to the intermediate queue and stuck there.
>
>Making 'map' atomic will not help. To solve the race we should make 'reading
>of map + enqueue' an atomic operation by some spinlock.
>Like this:
>
>vhost_send:
>
>qid = qid % netdev->n_txq;
>rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
>
>mapped_qid = dev->tx_q[qid].map;
>
>if (qid != mapped_qid) {
>rte_spinlock_lock(&dev->tx_q[mapped_qid].tx_lock);
>}
>
>tx_enqueue(mapped_qid, pkts, cnt);
>
>if (qid != mapped_qid) {
>rte_spinlock_unlock(&dev->tx_q[mapped_qid].tx_lock);
>}
>
>rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
>
>
>txq remapping inside 'netdev_dpdk_remap_txqs()' or
>'vring_state_changed()':
>
>qid - queue we need to remap.
>new_qid - queue we need to remap to.
>
>rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
>
>mapped_qid = dev->tx_q[qid].map;
>if (qid != mapped_qid) {
>rte_spinlock_lock(&dev->tx_q[mapped_qid].tx_lock);
>}
>
>tx_flush(mapped_qid)
>
>if (qid != mapped_qid) {
>rte_spinlock_unlock(&dev->tx_q[mapped_qid].tx_lock);
>}
>
>dev->tx_q[qid].map = new_qid;
>
>rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
>
>
>Above schema should work without races, but looks kind of ugly and requires
>taking of additional spinlock on each send.
>
>P.S. Sorry for talking with myself. Just want to share my thoughts.

Hi Ilya,

Can you please review the below changes based on what you suggested above. 
As the problem only happens when the queues are enabled/disabled in the guest, 
I did some  preliminary testing with the below changes by sending some traffic 
in to the VM
and enabling and disabling the queues inside the guest at the same time. 

Vhost_send()
-
qid = qid % netdev->n_txq;

/* Acquire tx_lock before reading tx_q[qid].map and enqueueing packets.
 * tx_q[].map gets updated in vring_state_changed() when vrings are
 * enabled/disabled in the guest. */
rte_spinlock_lock(&dev->tx_q[qid].tx_lock);

mapped_qid = dev->tx_q[qid].map;
if (OVS_UNLIKELY(qid != mapped_qid)) {
rte_spinlock_lock(&dev->tx_q[mapped_qid].tx_lock);
}

if (OVS_UNLIKELY(!is_vhost_running(dev) || mapped_qid < 0
 || !(dev->flags & NETDEV_UP))) {
rte_spinlock_lock(&dev->stats_lock);
dev->stats.tx_dropped+= cnt;
rte_spinlock_unlock(&dev->stats_lock);

for (i = 0; i < total_pkts; i++) {
dp_packet_delete(pkts[i]);
}

if (OVS_UNLIKELY(qid != mapped_qid)) {
rte_spinlock_unlock(&dev->tx_q[mapped_qid].tx_lock);
}
rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);

return;
}

cnt = netdev_dpdk_filter_packet_len(dev, cur_pkts, cnt);
/* Check has QoS has been configured for the netdev */
cnt = netdev_dpdk_qos_run(dev, cur_pkts, cnt);
dropped = total_pkts - cnt;

int idx = 0;
struct dpdk_tx_queue *txq = &dev->tx_q[mapped_qid];
while (idx < cnt) {
txq->vhost_burst_pkts[txq->

Re: [ovs-dev] [PATCH v4 2/5] netdev-dpdk: Add netdev_dpdk_vhost_txq_flush function.

2017-08-09 Thread Ilya Maximets
On 09.08.2017 15:35, Bodireddy, Bhanuprakash wrote:
>>>
>>> +static int
>>> +netdev_dpdk_vhost_tx_burst(struct netdev_dpdk *dev, int qid) {
>>> +struct dpdk_tx_queue *txq = &dev->tx_q[qid];
>>> +struct rte_mbuf **cur_pkts = (struct rte_mbuf
>>> +**)txq->vhost_burst_pkts;
>>> +
>>> +int tx_vid = netdev_dpdk_get_vid(dev);
>>> +int tx_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
>>> +uint32_t sent = 0;
>>> +uint32_t retries = 0;
>>> +uint32_t sum, total_pkts;
>>> +
>>> +total_pkts = sum = txq->vhost_pkt_cnt;
>>> +do {
>>> +uint32_t ret;
>>> +ret = rte_vhost_enqueue_burst(tx_vid, tx_qid, &cur_pkts[sent],
>> sum);
>>> +if (OVS_UNLIKELY(!ret)) {
>>> +/* No packets enqueued - do not retry. */
>>> +break;
>>> +} else {
>>> +/* Packet have been sent. */
>>> +sent += ret;
>>> +
>>> +/* 'sum' packet have to be retransmitted. */
>>> +sum -= ret;
>>> +}
>>> +} while (sum && (retries++ < VHOST_ENQ_RETRY_NUM));
>>> +
>>> +for (int i = 0; i < total_pkts; i++) {
>>> +dp_packet_delete(txq->vhost_burst_pkts[i]);
>>> +}
>>> +
>>> +/* Reset pkt count. */
>>> +txq->vhost_pkt_cnt = 0;
>>> +
>>> +/* 'sum' refers to packets dropped. */
>>> +return sum;
>>> +}
>>> +
>>> +/* Flush the txq if there are any packets available. */ static int
>>> +netdev_dpdk_vhost_txq_flush(struct netdev *netdev, int qid,
>>> +bool concurrent_txq OVS_UNUSED) {
>>> +struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>>> +struct dpdk_tx_queue *txq;
>>> +
>>> +qid = dev->tx_q[qid % netdev->n_txq].map;
>>> +
>>> +/* The qid may be disabled in the guest and has been set to
>>> + * OVS_VHOST_QUEUE_DISABLED.
>>> + */
>>> +if (OVS_UNLIKELY(qid < 0)) {
>>> +return 0;
>>> +}
>>> +
>>> +txq = &dev->tx_q[qid];
>>> +/* Increment the drop count and free the memory. */
>>> +if (OVS_UNLIKELY(!is_vhost_running(dev) ||
>>> + !(dev->flags & NETDEV_UP))) {
>>> +
>>> +if (txq->vhost_pkt_cnt) {
>>> +rte_spinlock_lock(&dev->stats_lock);
>>> +dev->stats.tx_dropped+= txq->vhost_pkt_cnt;
>>> +rte_spinlock_unlock(&dev->stats_lock);
>>> +
>>> +for (int i = 0; i < txq->vhost_pkt_cnt; i++) {
>>> +dp_packet_delete(txq->vhost_burst_pkts[i]);
>>
>> Spinlock (tx_lock) must be held here to avoid queue and mempool breakage.
> 
> I think you are right. tx_lock might be acquired for freeing the packets.

I think that 'vhost_pkt_cnt' reads and updates also should be protected to 
avoid races.

> ---
> rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> for (int i = 0; i < txq->vhost_pkt_cnt; i++) {
>  dp_packet_delete(txq->vhost_burst_pkts[i]);
> }
> rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
> 
> - Bhanuprakash
> 
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v4 2/5] netdev-dpdk: Add netdev_dpdk_vhost_txq_flush function.

2017-08-09 Thread Bodireddy, Bhanuprakash
>enable)
  if (enable) {
  dev->tx_q[qid].map = qid;
>>
>> Here flushing required too because we're possibly enabling previously
>remapped queue.
>>
  } else {
 +/* If the queue is disabled in the guest, the 
 corresponding qid
 + * map shall be set to OVS_VHOST_QUEUE_DISABLED(-2).
 + *
 + * The packets that were queued in 'qid' could be 
 potentially
 + * stuck and needs to be dropped.
 + *
 + * XXX: The queues may be already disabled in the guest so
 + * flush function in this case only helps in updating 
 stats
 + * and freeing memory.
 + */
 +netdev_dpdk_vhost_txq_flush(&dev->up, qid, 0);
  dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
  }
  netdev_dpdk_remap_txqs(dev);
>>
>> 'netdev_dpdk_remap_txqs()', actually, is able to change mapping for
>> all the disabled in guest queues. So, we need to flush all of them
>> while remapping somewhere inside the function.
>> One other thing is that there is a race window between flush and
>> mapping update where another process able to enqueue more packets in
>> just flushed queue. The order of operations should be changed, or both
>> of them should be done under the same tx_lock. I think, it's required
>> to make tx_q[].map field atomic to fix the race condition, because
>> send function takes the 'map' and then locks the corresponding queue.
>> It wasn't an issue before, because packets in case of race was just
>> dropped on attempt to send to disabled queue, but with this patch
>> applied they will be enqueued to the intermediate queue and stuck there.
>
>Making 'map' atomic will not help. To solve the race we should make 'reading
>of map + enqueue' an atomic operation by some spinlock.
>Like this:
>
>vhost_send:
>
>qid = qid % netdev->n_txq;
>rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
>
>mapped_qid = dev->tx_q[qid].map;
>
>if (qid != mapped_qid) {
>rte_spinlock_lock(&dev->tx_q[mapped_qid].tx_lock);
>}
>
>tx_enqueue(mapped_qid, pkts, cnt);
>
>if (qid != mapped_qid) {
>rte_spinlock_unlock(&dev->tx_q[mapped_qid].tx_lock);
>}
>
>rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
>
>
>txq remapping inside 'netdev_dpdk_remap_txqs()' or
>'vring_state_changed()':
>
>qid - queue we need to remap.
>new_qid - queue we need to remap to.
>
>rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
>
>mapped_qid = dev->tx_q[qid].map;
>if (qid != mapped_qid) {
>rte_spinlock_lock(&dev->tx_q[mapped_qid].tx_lock);
>}
>
>tx_flush(mapped_qid)
>
>if (qid != mapped_qid) {
>rte_spinlock_unlock(&dev->tx_q[mapped_qid].tx_lock);
>}
>
>dev->tx_q[qid].map = new_qid;
>
>rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
>
>
>Above schema should work without races, but looks kind of ugly and requires
>taking of additional spinlock on each send.
>
>P.S. Sorry for talking with myself. Just want to share my thoughts.

Hi Ilya,

Thanks for reviewing the patches and providing inputs.
I went through your comments for this patch(2/5) and agree with the suggestions.
Meanwhile, I will go through the changes above and get back to you.

Bhanuprakash. 


___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v4 2/5] netdev-dpdk: Add netdev_dpdk_vhost_txq_flush function.

2017-08-09 Thread Bodireddy, Bhanuprakash
>>
>> +static int
>> +netdev_dpdk_vhost_tx_burst(struct netdev_dpdk *dev, int qid) {
>> +struct dpdk_tx_queue *txq = &dev->tx_q[qid];
>> +struct rte_mbuf **cur_pkts = (struct rte_mbuf
>> +**)txq->vhost_burst_pkts;
>> +
>> +int tx_vid = netdev_dpdk_get_vid(dev);
>> +int tx_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
>> +uint32_t sent = 0;
>> +uint32_t retries = 0;
>> +uint32_t sum, total_pkts;
>> +
>> +total_pkts = sum = txq->vhost_pkt_cnt;
>> +do {
>> +uint32_t ret;
>> +ret = rte_vhost_enqueue_burst(tx_vid, tx_qid, &cur_pkts[sent],
>sum);
>> +if (OVS_UNLIKELY(!ret)) {
>> +/* No packets enqueued - do not retry. */
>> +break;
>> +} else {
>> +/* Packet have been sent. */
>> +sent += ret;
>> +
>> +/* 'sum' packet have to be retransmitted. */
>> +sum -= ret;
>> +}
>> +} while (sum && (retries++ < VHOST_ENQ_RETRY_NUM));
>> +
>> +for (int i = 0; i < total_pkts; i++) {
>> +dp_packet_delete(txq->vhost_burst_pkts[i]);
>> +}
>> +
>> +/* Reset pkt count. */
>> +txq->vhost_pkt_cnt = 0;
>> +
>> +/* 'sum' refers to packets dropped. */
>> +return sum;
>> +}
>> +
>> +/* Flush the txq if there are any packets available. */ static int
>> +netdev_dpdk_vhost_txq_flush(struct netdev *netdev, int qid,
>> +bool concurrent_txq OVS_UNUSED) {
>> +struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>> +struct dpdk_tx_queue *txq;
>> +
>> +qid = dev->tx_q[qid % netdev->n_txq].map;
>> +
>> +/* The qid may be disabled in the guest and has been set to
>> + * OVS_VHOST_QUEUE_DISABLED.
>> + */
>> +if (OVS_UNLIKELY(qid < 0)) {
>> +return 0;
>> +}
>> +
>> +txq = &dev->tx_q[qid];
>> +/* Increment the drop count and free the memory. */
>> +if (OVS_UNLIKELY(!is_vhost_running(dev) ||
>> + !(dev->flags & NETDEV_UP))) {
>> +
>> +if (txq->vhost_pkt_cnt) {
>> +rte_spinlock_lock(&dev->stats_lock);
>> +dev->stats.tx_dropped+= txq->vhost_pkt_cnt;
>> +rte_spinlock_unlock(&dev->stats_lock);
>> +
>> +for (int i = 0; i < txq->vhost_pkt_cnt; i++) {
>> +dp_packet_delete(txq->vhost_burst_pkts[i]);
>
>Spinlock (tx_lock) must be held here to avoid queue and mempool breakage.

I think you are right. tx_lock might be acquired for freeing the packets.

---
rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
for (int i = 0; i < txq->vhost_pkt_cnt; i++) {
 dp_packet_delete(txq->vhost_burst_pkts[i]);
}
rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);

- Bhanuprakash
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v4 2/5] netdev-dpdk: Add netdev_dpdk_vhost_txq_flush function.

2017-08-09 Thread Ilya Maximets
On 09.08.2017 13:03, Ilya Maximets wrote:
> One more comment inline.
> 
> On 09.08.2017 11:06, Ilya Maximets wrote:
>> Not a full review.
>> One comment inline.
>>
>>> Add netdev_dpdk_vhost_txq_flush(), that flushes packets on vHost User
>>> port queues. Also add netdev_dpdk_vhost_tx_burst() function that
>>> uses rte_vhost_enqueue_burst() to enqueue burst of packets on vHost User
>>> ports.
>>>
>>> Signed-off-by: Bhanuprakash Bodireddy 
>>> Signed-off-by: Antonio Fischetti 
>>> Co-authored-by: Antonio Fischetti 
>>> Acked-by: Eelco Chaudron 
>>> ---
>>>  lib/netdev-dpdk.c | 112 
>>> --
>>>  1 file changed, 108 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>>> index 50d6b29..d3892fe 100644
>>> --- a/lib/netdev-dpdk.c
>>> +++ b/lib/netdev-dpdk.c
>>> @@ -327,12 +327,22 @@ struct dpdk_tx_queue {
>>>  * pmd threads (see 'concurrent_txq'). 
>>> */
>>>  int map;   /* Mapping of configured vhost-user 
>>> queues
>>>  * to enabled by guest. */
>>> -int dpdk_pkt_cnt;  /* Number of buffered packets waiting to
>>> +union {
>>> +int dpdk_pkt_cnt;  /* Number of buffered packets waiting to
>>>be sent on DPDK tx queue. */
>>> -struct rte_mbuf *dpdk_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD];
>>> +int vhost_pkt_cnt; /* Number of buffered packets waiting to
>>> +  be sent on vhost port. */
>>> +};
>>> +
>>> +union {
>>> +struct rte_mbuf *dpdk_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD];
>>> /* Intermediate queue where packets can
>>>  * be buffered to amortize the cost of 
>>> MMIO
>>>  * writes. */
>>> +struct dp_packet *vhost_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD];
>>> +   /* Intermediate queue where packets can
>>> +* be buffered for vhost ports. */
>>> +};
>>>  };
>>>  
>>>  /* dpdk has no way to remove dpdk ring ethernet devices
>>> @@ -1756,6 +1766,88 @@ netdev_dpdk_vhost_update_tx_counters(struct 
>>> netdev_stats *stats,
>>>  }
>>>  }
>>>  
>>> +static int
>>> +netdev_dpdk_vhost_tx_burst(struct netdev_dpdk *dev, int qid)
>>> +{
>>> +struct dpdk_tx_queue *txq = &dev->tx_q[qid];
>>> +struct rte_mbuf **cur_pkts = (struct rte_mbuf **)txq->vhost_burst_pkts;
>>> +
>>> +int tx_vid = netdev_dpdk_get_vid(dev);
>>> +int tx_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
>>> +uint32_t sent = 0;
>>> +uint32_t retries = 0;
>>> +uint32_t sum, total_pkts;
>>> +
>>> +total_pkts = sum = txq->vhost_pkt_cnt;
>>> +do {
>>> +uint32_t ret;
>>> +ret = rte_vhost_enqueue_burst(tx_vid, tx_qid, &cur_pkts[sent], 
>>> sum);
>>> +if (OVS_UNLIKELY(!ret)) {
>>> +/* No packets enqueued - do not retry. */
>>> +break;
>>> +} else {
>>> +/* Packet have been sent. */
>>> +sent += ret;
>>> +
>>> +/* 'sum' packet have to be retransmitted. */
>>> +sum -= ret;
>>> +}
>>> +} while (sum && (retries++ < VHOST_ENQ_RETRY_NUM));
>>> +
>>> +for (int i = 0; i < total_pkts; i++) {
>>> +dp_packet_delete(txq->vhost_burst_pkts[i]);
>>> +}
>>> +
>>> +/* Reset pkt count. */
>>> +txq->vhost_pkt_cnt = 0;
>>> +
>>> +/* 'sum' refers to packets dropped. */
>>> +return sum;
>>> +}
>>> +
>>> +/* Flush the txq if there are any packets available. */
>>> +static int
>>> +netdev_dpdk_vhost_txq_flush(struct netdev *netdev, int qid,
>>> +bool concurrent_txq OVS_UNUSED)
>>> +{
>>> +struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>>> +struct dpdk_tx_queue *txq;
>>> +
>>> +qid = dev->tx_q[qid % netdev->n_txq].map;
>>> +
>>> +/* The qid may be disabled in the guest and has been set to
>>> + * OVS_VHOST_QUEUE_DISABLED.
>>> + */
>>> +if (OVS_UNLIKELY(qid < 0)) {
>>> +return 0;
>>> +}
>>> +
>>> +txq = &dev->tx_q[qid];
>>> +/* Increment the drop count and free the memory. */
>>> +if (OVS_UNLIKELY(!is_vhost_running(dev) ||
>>> + !(dev->flags & NETDEV_UP))) {
>>> +
>>> +if (txq->vhost_pkt_cnt) {
>>> +rte_spinlock_lock(&dev->stats_lock);
>>> +dev->stats.tx_dropped+= txq->vhost_pkt_cnt;
>>> +rte_spinlock_unlock(&dev->stats_lock);
>>> +
>>> +for (int i = 0; i < txq->vhost_pkt_cnt; i++) {
>>> +dp_packet_delete(txq->vhost_burst_pkts[i]);
>>
>> Spinlock (tx_lock) must be held here to avoid queue and mempool breakage.
>>
>>> +}
>>> +txq->vhost_pkt_cnt = 0;
>>> +}
>>> +}
>>> +
>>> +if (OVS_LIKELY

Re: [ovs-dev] [PATCH v4 2/5] netdev-dpdk: Add netdev_dpdk_vhost_txq_flush function.

2017-08-09 Thread Ilya Maximets
One more comment inline.

On 09.08.2017 11:06, Ilya Maximets wrote:
> Not a full review.
> One comment inline.
> 
>> Add netdev_dpdk_vhost_txq_flush(), that flushes packets on vHost User
>> port queues. Also add netdev_dpdk_vhost_tx_burst() function that
>> uses rte_vhost_enqueue_burst() to enqueue burst of packets on vHost User
>> ports.
>>
>> Signed-off-by: Bhanuprakash Bodireddy 
>> Signed-off-by: Antonio Fischetti 
>> Co-authored-by: Antonio Fischetti 
>> Acked-by: Eelco Chaudron 
>> ---
>>  lib/netdev-dpdk.c | 112 
>> --
>>  1 file changed, 108 insertions(+), 4 deletions(-)
>>
>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>> index 50d6b29..d3892fe 100644
>> --- a/lib/netdev-dpdk.c
>> +++ b/lib/netdev-dpdk.c
>> @@ -327,12 +327,22 @@ struct dpdk_tx_queue {
>>  * pmd threads (see 'concurrent_txq'). */
>>  int map;   /* Mapping of configured vhost-user 
>> queues
>>  * to enabled by guest. */
>> -int dpdk_pkt_cnt;  /* Number of buffered packets waiting to
>> +union {
>> +int dpdk_pkt_cnt;  /* Number of buffered packets waiting to
>>be sent on DPDK tx queue. */
>> -struct rte_mbuf *dpdk_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD];
>> +int vhost_pkt_cnt; /* Number of buffered packets waiting to
>> +  be sent on vhost port. */
>> +};
>> +
>> +union {
>> +struct rte_mbuf *dpdk_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD];
>> /* Intermediate queue where packets can
>>  * be buffered to amortize the cost of 
>> MMIO
>>  * writes. */
>> +struct dp_packet *vhost_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD];
>> +   /* Intermediate queue where packets can
>> +* be buffered for vhost ports. */
>> +};
>>  };
>>  
>>  /* dpdk has no way to remove dpdk ring ethernet devices
>> @@ -1756,6 +1766,88 @@ netdev_dpdk_vhost_update_tx_counters(struct 
>> netdev_stats *stats,
>>  }
>>  }
>>  
>> +static int
>> +netdev_dpdk_vhost_tx_burst(struct netdev_dpdk *dev, int qid)
>> +{
>> +struct dpdk_tx_queue *txq = &dev->tx_q[qid];
>> +struct rte_mbuf **cur_pkts = (struct rte_mbuf **)txq->vhost_burst_pkts;
>> +
>> +int tx_vid = netdev_dpdk_get_vid(dev);
>> +int tx_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
>> +uint32_t sent = 0;
>> +uint32_t retries = 0;
>> +uint32_t sum, total_pkts;
>> +
>> +total_pkts = sum = txq->vhost_pkt_cnt;
>> +do {
>> +uint32_t ret;
>> +ret = rte_vhost_enqueue_burst(tx_vid, tx_qid, &cur_pkts[sent], sum);
>> +if (OVS_UNLIKELY(!ret)) {
>> +/* No packets enqueued - do not retry. */
>> +break;
>> +} else {
>> +/* Packets have been sent. */
>> +sent += ret;
>> +
>> +/* 'sum' packets have to be retransmitted. */
>> +sum -= ret;
>> +}
>> +} while (sum && (retries++ < VHOST_ENQ_RETRY_NUM));
>> +
>> +for (int i = 0; i < total_pkts; i++) {
>> +dp_packet_delete(txq->vhost_burst_pkts[i]);
>> +}
>> +
>> +/* Reset pkt count. */
>> +txq->vhost_pkt_cnt = 0;
>> +
>> +/* 'sum' refers to packets dropped. */
>> +return sum;
>> +}
>> +
>> +/* Flush the txq if there are any packets available. */
>> +static int
>> +netdev_dpdk_vhost_txq_flush(struct netdev *netdev, int qid,
>> +bool concurrent_txq OVS_UNUSED)
>> +{
>> +struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>> +struct dpdk_tx_queue *txq;
>> +
>> +qid = dev->tx_q[qid % netdev->n_txq].map;
>> +
>> +/* The qid may be disabled in the guest and has been set to
>> + * OVS_VHOST_QUEUE_DISABLED.
>> + */
>> +if (OVS_UNLIKELY(qid < 0)) {
>> +return 0;
>> +}
>> +
>> +txq = &dev->tx_q[qid];
>> +/* Increment the drop count and free the memory. */
>> +if (OVS_UNLIKELY(!is_vhost_running(dev) ||
>> + !(dev->flags & NETDEV_UP))) {
>> +
>> +if (txq->vhost_pkt_cnt) {
>> +rte_spinlock_lock(&dev->stats_lock);
>> +dev->stats.tx_dropped+= txq->vhost_pkt_cnt;
>> +rte_spinlock_unlock(&dev->stats_lock);
>> +
>> +for (int i = 0; i < txq->vhost_pkt_cnt; i++) {
>> +dp_packet_delete(txq->vhost_burst_pkts[i]);
> 
> Spinlock (tx_lock) must be held here to avoid queue and mempool breakage.
> 
>> +}
>> +txq->vhost_pkt_cnt = 0;
>> +}
>> +}
>> +
>> +if (OVS_LIKELY(txq->vhost_pkt_cnt)) {
>> +rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
>> +netdev_dpdk_vhost_tx_burst(dev, qid);
>> +rte_spinlock_unlock(&dev->tx_q[qid].tx_lock

Re: [ovs-dev] [PATCH v4 2/5] netdev-dpdk: Add netdev_dpdk_vhost_txq_flush function.

2017-08-09 Thread Ilya Maximets
Not a full review.
One comment inline.

> Add netdev_dpdk_vhost_txq_flush(), that flushes packets on vHost User
> port queues. Also add netdev_dpdk_vhost_tx_burst() function that
> uses rte_vhost_enqueue_burst() to enqueue burst of packets on vHost User
> ports.
> 
> Signed-off-by: Bhanuprakash Bodireddy 
> Signed-off-by: Antonio Fischetti 
> Co-authored-by: Antonio Fischetti 
> Acked-by: Eelco Chaudron 
> ---
>  lib/netdev-dpdk.c | 112 
> --
>  1 file changed, 108 insertions(+), 4 deletions(-)
> 
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 50d6b29..d3892fe 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -327,12 +327,22 @@ struct dpdk_tx_queue {
>  * pmd threads (see 'concurrent_txq'). */
>  int map;   /* Mapping of configured vhost-user queues
>  * to enabled by guest. */
> -int dpdk_pkt_cnt;  /* Number of buffered packets waiting to
> +union {
> +int dpdk_pkt_cnt;  /* Number of buffered packets waiting to
>be sent on DPDK tx queue. */
> -struct rte_mbuf *dpdk_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD];
> +int vhost_pkt_cnt; /* Number of buffered packets waiting to
> +  be sent on vhost port. */
> +};
> +
> +union {
> +struct rte_mbuf *dpdk_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD];
> /* Intermediate queue where packets can
>  * be buffered to amortize the cost of 
> MMIO
>  * writes. */
> +struct dp_packet *vhost_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD];
> +   /* Intermediate queue where packets can
> +* be buffered for vhost ports. */
> +};
>  };
>  
>  /* dpdk has no way to remove dpdk ring ethernet devices
> @@ -1756,6 +1766,88 @@ netdev_dpdk_vhost_update_tx_counters(struct 
> netdev_stats *stats,
>  }
>  }
>  
> +static int
> +netdev_dpdk_vhost_tx_burst(struct netdev_dpdk *dev, int qid)
> +{
> +struct dpdk_tx_queue *txq = &dev->tx_q[qid];
> +struct rte_mbuf **cur_pkts = (struct rte_mbuf **)txq->vhost_burst_pkts;
> +
> +int tx_vid = netdev_dpdk_get_vid(dev);
> +int tx_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
> +uint32_t sent = 0;
> +uint32_t retries = 0;
> +uint32_t sum, total_pkts;
> +
> +total_pkts = sum = txq->vhost_pkt_cnt;
> +do {
> +uint32_t ret;
> +ret = rte_vhost_enqueue_burst(tx_vid, tx_qid, &cur_pkts[sent], sum);
> +if (OVS_UNLIKELY(!ret)) {
> +/* No packets enqueued - do not retry. */
> +break;
> +} else {
> +/* Packets have been sent. */
> +sent += ret;
> +
> +/* 'sum' packets have to be retransmitted. */
> +sum -= ret;
> +}
> +} while (sum && (retries++ < VHOST_ENQ_RETRY_NUM));
> +
> +for (int i = 0; i < total_pkts; i++) {
> +dp_packet_delete(txq->vhost_burst_pkts[i]);
> +}
> +
> +/* Reset pkt count. */
> +txq->vhost_pkt_cnt = 0;
> +
> +/* 'sum' refers to packets dropped. */
> +return sum;
> +}
> +
> +/* Flush the txq if there are any packets available. */
> +static int
> +netdev_dpdk_vhost_txq_flush(struct netdev *netdev, int qid,
> +bool concurrent_txq OVS_UNUSED)
> +{
> +struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +struct dpdk_tx_queue *txq;
> +
> +qid = dev->tx_q[qid % netdev->n_txq].map;
> +
> +/* The qid may be disabled in the guest and has been set to
> + * OVS_VHOST_QUEUE_DISABLED.
> + */
> +if (OVS_UNLIKELY(qid < 0)) {
> +return 0;
> +}
> +
> +txq = &dev->tx_q[qid];
> +/* Increment the drop count and free the memory. */
> +if (OVS_UNLIKELY(!is_vhost_running(dev) ||
> + !(dev->flags & NETDEV_UP))) {
> +
> +if (txq->vhost_pkt_cnt) {
> +rte_spinlock_lock(&dev->stats_lock);
> +dev->stats.tx_dropped+= txq->vhost_pkt_cnt;
> +rte_spinlock_unlock(&dev->stats_lock);
> +
> +for (int i = 0; i < txq->vhost_pkt_cnt; i++) {
> +dp_packet_delete(txq->vhost_burst_pkts[i]);

Spinlock (tx_lock) must be held here to avoid queue and mempool breakage.

> +}
> +txq->vhost_pkt_cnt = 0;
> +}
> +}
> +
> +if (OVS_LIKELY(txq->vhost_pkt_cnt)) {
> +rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> +netdev_dpdk_vhost_tx_burst(dev, qid);
> +rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
> +}
> +
> +return 0;
> +}
> +
>  static void
>  __netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
>   struct dp_packet **pkts, int cnt)
> @@ -2799,6 +2891,17 @@ vrin

[ovs-dev] [PATCH v4 2/5] netdev-dpdk: Add netdev_dpdk_vhost_txq_flush function.

2017-08-08 Thread Bhanuprakash Bodireddy
Add netdev_dpdk_vhost_txq_flush(), that flushes packets on vHost User
port queues. Also add netdev_dpdk_vhost_tx_burst() function that
uses rte_vhost_enqueue_burst() to enqueue burst of packets on vHost User
ports.

Signed-off-by: Bhanuprakash Bodireddy 
Signed-off-by: Antonio Fischetti 
Co-authored-by: Antonio Fischetti 
Acked-by: Eelco Chaudron 
---
 lib/netdev-dpdk.c | 112 --
 1 file changed, 108 insertions(+), 4 deletions(-)

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 50d6b29..d3892fe 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -327,12 +327,22 @@ struct dpdk_tx_queue {
 * pmd threads (see 'concurrent_txq'). */
 int map;   /* Mapping of configured vhost-user queues
 * to enabled by guest. */
-int dpdk_pkt_cnt;  /* Number of buffered packets waiting to
+union {
+int dpdk_pkt_cnt;  /* Number of buffered packets waiting to
   be sent on DPDK tx queue. */
-struct rte_mbuf *dpdk_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD];
+int vhost_pkt_cnt; /* Number of buffered packets waiting to
+  be sent on vhost port. */
+};
+
+union {
+struct rte_mbuf *dpdk_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD];
/* Intermediate queue where packets can
 * be buffered to amortize the cost of MMIO
 * writes. */
+struct dp_packet *vhost_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD];
+   /* Intermediate queue where packets can
+* be buffered for vhost ports. */
+};
 };
 
 /* dpdk has no way to remove dpdk ring ethernet devices
@@ -1756,6 +1766,88 @@ netdev_dpdk_vhost_update_tx_counters(struct netdev_stats 
*stats,
 }
 }
 
+static int
+netdev_dpdk_vhost_tx_burst(struct netdev_dpdk *dev, int qid)
+{
+struct dpdk_tx_queue *txq = &dev->tx_q[qid];
+struct rte_mbuf **cur_pkts = (struct rte_mbuf **)txq->vhost_burst_pkts;
+
+int tx_vid = netdev_dpdk_get_vid(dev);
+int tx_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
+uint32_t sent = 0;
+uint32_t retries = 0;
+uint32_t sum, total_pkts;
+
+total_pkts = sum = txq->vhost_pkt_cnt;
+do {
+uint32_t ret;
+ret = rte_vhost_enqueue_burst(tx_vid, tx_qid, &cur_pkts[sent], sum);
+if (OVS_UNLIKELY(!ret)) {
+/* No packets enqueued - do not retry. */
+break;
+} else {
+/* Packets have been sent. */
+sent += ret;
+
+/* 'sum' packets have to be retransmitted. */
+sum -= ret;
+}
+} while (sum && (retries++ < VHOST_ENQ_RETRY_NUM));
+
+for (int i = 0; i < total_pkts; i++) {
+dp_packet_delete(txq->vhost_burst_pkts[i]);
+}
+
+/* Reset pkt count. */
+txq->vhost_pkt_cnt = 0;
+
+/* 'sum' refers to packets dropped. */
+return sum;
+}
+
+/* Flush the txq if there are any packets available. */
+static int
+netdev_dpdk_vhost_txq_flush(struct netdev *netdev, int qid,
+bool concurrent_txq OVS_UNUSED)
+{
+struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+struct dpdk_tx_queue *txq;
+
+qid = dev->tx_q[qid % netdev->n_txq].map;
+
+/* The qid may be disabled in the guest and has been set to
+ * OVS_VHOST_QUEUE_DISABLED.
+ */
+if (OVS_UNLIKELY(qid < 0)) {
+return 0;
+}
+
+txq = &dev->tx_q[qid];
+/* Increment the drop count and free the memory. */
+if (OVS_UNLIKELY(!is_vhost_running(dev) ||
+ !(dev->flags & NETDEV_UP))) {
+
+if (txq->vhost_pkt_cnt) {
+rte_spinlock_lock(&dev->stats_lock);
+dev->stats.tx_dropped+= txq->vhost_pkt_cnt;
+rte_spinlock_unlock(&dev->stats_lock);
+
+for (int i = 0; i < txq->vhost_pkt_cnt; i++) {
+dp_packet_delete(txq->vhost_burst_pkts[i]);
+}
+txq->vhost_pkt_cnt = 0;
+}
+}
+
+if (OVS_LIKELY(txq->vhost_pkt_cnt)) {
+rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
+netdev_dpdk_vhost_tx_burst(dev, qid);
+rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
+}
+
+return 0;
+}
+
 static void
 __netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
  struct dp_packet **pkts, int cnt)
@@ -2799,6 +2891,17 @@ vring_state_changed(int vid, uint16_t queue_id, int 
enable)
 if (enable) {
 dev->tx_q[qid].map = qid;
 } else {
+/* If the queue is disabled in the guest, the corresponding qid
+ * map shall be set to OVS_VHOST_QUEUE_DISABLED(-2).
+ *
+ * The packets that were queued in 'qid' could be p