Hi Ilya,

Thanks for working on it. This patch adds a call to
dp_packet_batch_clone() on the output path, so I have concerns about
its performance impact. Could you please share some performance
numbers with and without your patch?
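
For reference, dp_packet_batch_clone() deep-copies every packet in the
batch: one allocation plus one data copy per packet. Paraphrasing the
helper in lib/dp-packet.h as I read it (a simplified sketch for
illustration, not necessarily the exact code in the tree):

    static inline void
    dp_packet_batch_clone(struct dp_packet_batch *dst,
                          struct dp_packet_batch *src)
    {
        struct dp_packet *packet;

        dp_packet_batch_init(dst);
        DP_PACKET_BATCH_FOR_EACH (packet, src) {
            /* Allocates a fresh dp_packet and copies the data in. */
            dp_packet_batch_add(dst,
                                dp_packet_clone_with_headroom(packet, 0));
        }
    }

So on the !may_steal output path a full 32-packet batch now pays for
32 allocations and copies before anything is queued for transmit.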
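
To be clear, the per-port collection itself looks like a real win. A
toy model of the call-count math from your bonding example below
(a standalone illustration with made-up numbers, not OVS code):

    #include <stdio.h>

    #define N_PKTS  32   /* one received batch, i.e. NETDEV_MAX_BURST */
    #define N_PORTS  2   /* bond members */

    int main(void)
    {
        int per_port[N_PORTS] = {0};
        int sends_before = 0, sends_after = 0;
        int pkt, port;

        for (pkt = 0; pkt < N_PKTS; pkt++) {
            /* Stand-in for dp_hash: each packet may land in its own
             * per-flow batch, so the old path sends it on its own. */
            port = pkt % N_PORTS;
            sends_before++;    /* old path: netdev_send() per flow batch */
            per_port[port]++;  /* new path: just collect in output_pkts */
        }
        for (port = 0; port < N_PORTS; port++) {
            if (per_port[port]) {
                sends_after++; /* one netdev_send() per output port */
            }
        }
        printf("netdev_send calls: before=%d, after=%d\n",
               sends_before, sends_after);
        return 0;
    }

That is up to a 16x reduction in netdev_send() calls for the worst
case you describe, so the question is whether the extra cloning eats
into that gain. Measured numbers would settle it.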
Thanks,
Zhenyu Gao

2017-08-10 23:38 GMT+08:00 Ilya Maximets <[email protected]>:
> While processing an incoming batch of packets, they are scattered
> across many per-flow batches and sent separately.
>
> This becomes an issue while using more than a few flows.
>
> For example, if we have balanced-tcp OvS bonding with 2 ports,
> there will be 256 datapath internal flows for each dp_hash
> pattern. This will lead to scattering of a single received
> batch across all of those 256 per-flow batches and invoking
> send for each packet separately. This behaviour greatly degrades
> overall performance of netdev_send because it cannot use the
> advantages of vectorized transmit functions.
> But half of the datapath flows (with 2 ports in bonding) will
> have the same output actions. This means that we can collect
> them back in a single place and send them at once using a single
> call to netdev_send. This patch introduces a per-port packet
> batch for output packets for that purpose.
>
> The 'output_pkts' batch is thread-local and located in the send
> port cache.
>
> Signed-off-by: Ilya Maximets <[email protected]>
> ---
>  lib/dpif-netdev.c | 104 ++++++++++++++++++++++++++++++++++++++++++------------
>  1 file changed, 82 insertions(+), 22 deletions(-)
>
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index e2cd931..a2a25be 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -502,6 +502,7 @@ struct tx_port {
>      int qid;
>      long long last_used;
>      struct hmap_node node;
> +    struct dp_packet_batch output_pkts;
>  };
>
>  /* PMD: Poll modes drivers. PMD accesses devices via polling to eliminate
> @@ -633,9 +634,10 @@ static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
>                                        size_t actions_len,
>                                        long long now);
>  static void dp_netdev_input(struct dp_netdev_pmd_thread *,
> -                            struct dp_packet_batch *, odp_port_t port_no);
> +                            struct dp_packet_batch *, odp_port_t port_no,
> +                            long long now);
>  static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
> -                                  struct dp_packet_batch *);
> +                                  struct dp_packet_batch *, long long now);
>
>  static void dp_netdev_disable_upcall(struct dp_netdev *);
>  static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
> @@ -667,6 +669,9 @@ static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
>  static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
>                                         struct rxq_poll *poll)
>      OVS_REQUIRES(pmd->port_mutex);
> +static void
> +dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
> +                                   long long now);
>  static void reconfigure_datapath(struct dp_netdev *dp)
>      OVS_REQUIRES(dp->port_mutex);
>  static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
> @@ -2809,6 +2814,7 @@ dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
>      struct dp_netdev *dp = get_dp_netdev(dpif);
>      struct dp_netdev_pmd_thread *pmd;
>      struct dp_packet_batch pp;
> +    long long now = time_msec();
>
>      if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
>          dp_packet_size(execute->packet) > UINT16_MAX) {
> @@ -2851,8 +2857,8 @@ dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
>
>      dp_packet_batch_init_packet(&pp, execute->packet);
>      dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
> -                              execute->actions, execute->actions_len,
> -                              time_msec());
> +                              execute->actions, execute->actions_len, now);
> +    dp_netdev_pmd_flush_output_packets(pmd, now);
>
>      if (pmd->core_id == NON_PMD_CORE_ID) {
>          ovs_mutex_unlock(&dp->non_pmd_mutex);
> @@ -3101,6 +3107,37 @@ cycles_count_intermediate(struct dp_netdev_pmd_thread *pmd,
>      non_atomic_ullong_add(&pmd->cycles.n[type], interval);
>  }
>
> +static void
> +dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
> +                                   struct tx_port *p, long long now)
> +{
> +    int tx_qid;
> +    bool dynamic_txqs;
> +
> +    dynamic_txqs = p->port->dynamic_txqs;
> +    if (dynamic_txqs) {
> +        tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p, now);
> +    } else {
> +        tx_qid = pmd->static_tx_qid;
> +    }
> +
> +    netdev_send(p->port->netdev, tx_qid, &p->output_pkts, true, dynamic_txqs);
> +    dp_packet_batch_init(&p->output_pkts);
> +}
> +
> +static void
> +dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
> +                                   long long now)
> +{
> +    struct tx_port *p;
> +
> +    HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
> +        if (!dp_packet_batch_is_empty(&p->output_pkts)) {
> +            dp_netdev_pmd_flush_output_on_port(pmd, p, now);
> +        }
> +    }
> +}
> +
>  static int
>  dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>                             struct netdev_rxq *rx,
> @@ -3113,10 +3150,13 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>      dp_packet_batch_init(&batch);
>      error = netdev_rxq_recv(rx, &batch);
>      if (!error) {
> +        long long now = time_msec();
> +
>          *recirc_depth_get() = 0;
>
>          batch_cnt = batch.count;
> -        dp_netdev_input(pmd, &batch, port_no);
> +        dp_netdev_input(pmd, &batch, port_no, now);
> +        dp_netdev_pmd_flush_output_packets(pmd, now);
>      } else if (error != EAGAIN && error != EOPNOTSUPP) {
>          static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>
> @@ -4481,6 +4521,7 @@ dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
>
>      tx->port = port;
>      tx->qid = -1;
> +    dp_packet_batch_init(&tx->output_pkts);
>
>      hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
>      pmd->need_reload = true;
> @@ -4901,7 +4942,8 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
>  static void
>  dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
>                    struct dp_packet_batch *packets,
> -                  bool md_is_valid, odp_port_t port_no)
> +                  bool md_is_valid, odp_port_t port_no,
> +                  long long now)
>  {
>      int cnt = packets->count;
>  #if !defined(__CHECKER__) && !defined(_WIN32)
> @@ -4913,7 +4955,6 @@ dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
>      OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
>          struct netdev_flow_key keys[PKT_ARRAY_SIZE];
>      struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
> -    long long now = time_msec();
>      size_t n_batches;
>      odp_port_t in_port;
>
> @@ -4949,16 +4990,16 @@ dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
>  static void
>  dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
>                  struct dp_packet_batch *packets,
> -                odp_port_t port_no)
> +                odp_port_t port_no, long long now)
>  {
> -    dp_netdev_input__(pmd, packets, false, port_no);
> +    dp_netdev_input__(pmd, packets, false, port_no, now);
>  }
>
>  static void
>  dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
> -                      struct dp_packet_batch *packets)
> +                      struct dp_packet_batch *packets, long long now)
>  {
> -    dp_netdev_input__(pmd, packets, true, 0);
> +    dp_netdev_input__(pmd, packets, true, 0, now);
>  }
>
>  struct dp_netdev_execute_aux {
> @@ -5136,18 +5177,37 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
>      case OVS_ACTION_ATTR_OUTPUT:
>          p = pmd_send_port_cache_lookup(pmd, nl_attr_get_odp_port(a));
>          if (OVS_LIKELY(p)) {
> -            int tx_qid;
> -            bool dynamic_txqs;
> +            struct dp_packet *packet;
> +            struct dp_packet_batch out;
>
> -            dynamic_txqs = p->port->dynamic_txqs;
> -            if (dynamic_txqs) {
> -                tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p, now);
> -            } else {
> -                tx_qid = pmd->static_tx_qid;
> +            if (!may_steal) {
> +                dp_packet_batch_clone(&out, packets_);
> +                dp_packet_batch_reset_cutlen(packets_);
> +                packets_ = &out;
> +            }
> +            dp_packet_batch_apply_cutlen(packets_);
> +
> +#ifdef DPDK_NETDEV
> +            if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
> +                             && packets_->packets[0]->source
> +                                != p->output_pkts.packets[0]->source)) {
> +                /* XXX: netdev-dpdk assumes that all packets in a single
> +                 *      output batch have the same source. Flush here to
> +                 *      avoid memory access issues. */
> +                dp_netdev_pmd_flush_output_on_port(pmd, p, now);
> +            }
> +#endif
> +
> +            if (OVS_UNLIKELY(dp_packet_batch_size(&p->output_pkts)
> +                             + dp_packet_batch_size(packets_)
> +                             > NETDEV_MAX_BURST)) {
> +                /* Some packets were generated while processing the input
> +                 * batch. Flush here to avoid overflow. */
> +                dp_netdev_pmd_flush_output_on_port(pmd, p, now);
>              }
>
> -            netdev_send(p->port->netdev, tx_qid, packets_, may_steal,
> -                        dynamic_txqs);
> +            DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
> +                dp_packet_batch_add(&p->output_pkts, packet);
> +            }
>              return;
>          }
>          break;
> @@ -5188,7 +5248,7 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
>      }
>
>      (*depth)++;
> -    dp_netdev_recirculate(pmd, packets_);
> +    dp_netdev_recirculate(pmd, packets_, now);
>      (*depth)--;
>      return;
>  }
> @@ -5253,7 +5313,7 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
>      }
>
>      (*depth)++;
> -    dp_netdev_recirculate(pmd, packets_);
> +    dp_netdev_recirculate(pmd, packets_, now);
>      (*depth)--;
>
>      return;
> --
> 2.7.4

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev
