[ovs-dev] [PATCH 5/5] lib/netdev-dpdk: copy large packet to multi-segment mbufs
From: Michael Qiu Currently, a packet is copied into only one segment in dpdk_do_tx_copy(). This can be a problem when a jumbo frame arrives, especially one that spans multiple segments. This patch calculates the number of segments needed by the packet and copies the data into separate segments. Signed-off-by: Michael Qiu --- lib/netdev-dpdk.c | 53 - 1 file changed, 48 insertions(+), 5 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 0485872..38ec2ed 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -1776,14 +1776,16 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch) #endif struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); struct rte_mbuf *pkts[PKT_ARRAY_SIZE]; +struct rte_mbuf *temp, *head = NULL; int dropped = 0; int newcnt = 0; -int i; +int i, j, nb_segs; dp_packet_batch_apply_cutlen(batch); for (i = 0; i < batch->count; i++) { int size = dp_packet_size(batch->packets[i]); +int max_data_len, tmp_len; if (OVS_UNLIKELY(size > dev->max_packet_len)) { VLOG_WARN_RL(, "Too big size %d max_packet_len %d", @@ -1793,7 +1795,24 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch) continue; } -pkts[newcnt] = rte_pktmbuf_alloc(dev->dpdk_mp->mp); +temp = pkts[newcnt] = rte_pktmbuf_alloc(dev->dpdk_mp->mp); + +/* all new allocated mbuf's max data len is the same */ +max_data_len = temp->buf_len - temp->data_off; + +nb_segs = size/max_data_len; +if (size % max_data_len) +nb_segs = nb_segs + 1; + +for (j = 1; j < nb_segs; j++) { +temp->next = rte_pktmbuf_alloc(dev->dpdk_mp->mp); +if (!temp->next) { +rte_pktmbuf_free(pkts[newcnt]); +pkts[newcnt] = NULL; +break; +} +temp = temp->next; +} if (!pkts[newcnt]) { dropped += batch->count - i; @@ -1801,10 +1820,34 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch) } /* We have to do a copy for now */ -memcpy(rte_pktmbuf_mtod(pkts[newcnt], void *), - dp_packet_data(batch->packets[i]), size); +rte_pktmbuf_pkt_len(pkts[newcnt]) = 
size; +temp = pkts[newcnt]; +tmp_len = size < max_data_len ? size: max_data_len; +if (batch->packets[i]->source == DPBUF_DPDK) { +head = &(batch->packets[i]->mbuf); +while (temp && head && size > 0) { +rte_memcpy(rte_pktmbuf_mtod(temp, void*), dp_packet_data((struct dp_packet *)head),tmp_len); +rte_pktmbuf_data_len(temp) = tmp_len; +head = head->next; +size = size - tmp_len; +tmp_len = size < max_data_len ? size: max_data_len; +temp = temp->next; +} +} else { +int offset = 0; +while (temp && size > 0) { +memcpy(rte_pktmbuf_mtod(temp, void *), +dp_packet_at(batch->packets[i], offset,tmp_len), tmp_len); +rte_pktmbuf_data_len(temp) = tmp_len; +temp = temp->next; +size = size - tmp_len; +offset +=tmp_len; +tmp_len = size < max_data_len ? size: max_data_len; +} +} + -pkts[newcnt]->nb_segs = batch->packets[i]->mbuf.nb_segs; +pkts[newcnt]->nb_segs = nb_segs; pkts[newcnt]->ol_flags = batch->packets[i]->mbuf.ol_flags; pkts[newcnt]->packet_type = batch->packets[i]->mbuf.packet_type; pkts[newcnt]->tx_offload = batch->packets[i]->mbuf.tx_offload; -- 1.8.3.1 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH 4/5] lib/dp-packet: copy multi-segments data from DPDK mbuf
From: Michael Qiu When cloning a packet whose source is the DPDK driver, multiple segments must be taken into account, and each segment's data must be copied one by one. Signed-off-by: Michael Qiu --- lib/dp-packet.c | 27 --- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/lib/dp-packet.c b/lib/dp-packet.c index 9f872a1..278706e 100644 --- a/lib/dp-packet.c +++ b/lib/dp-packet.c @@ -167,9 +167,30 @@ dp_packet_clone_with_headroom(const struct dp_packet *buffer, size_t headroom) { struct dp_packet *new_buffer; -new_buffer = dp_packet_clone_data_with_headroom(dp_packet_data(buffer), - dp_packet_size(buffer), - headroom); +uint32_t size = dp_packet_size(buffer); + +/* copy multi-seg data */ +#ifdef DPDK_NETDEV +if (buffer->source == DPBUF_DPDK && buffer->mbuf.nb_segs > 1) { +uint32_t off_set = 0; +void *dst = NULL; +struct rte_mbuf *tmbuf = CONST_CAST(struct rte_mbuf *, &(buffer->mbuf)); + +new_buffer = dp_packet_new_with_headroom(size, headroom); +dst = dp_packet_put_uninit(new_buffer, size); + +while (tmbuf) { +rte_memcpy((char *)dst + off_set, + rte_pktmbuf_mtod(tmbuf, void *), tmbuf->data_len); +off_set += tmbuf->data_len; +tmbuf = tmbuf->next; +} +} +else +#endif +new_buffer = dp_packet_clone_data_with_headroom(dp_packet_data(buffer), +size, headroom); + new_buffer->l2_pad_size = buffer->l2_pad_size; new_buffer->l2_5_ofs = buffer->l2_5_ofs; new_buffer->l3_ofs = buffer->l3_ofs; -- 1.8.3.1 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH 2/5] lib/dp-packet: copy additional packet info when do packet copy
From: Michael Qiu Currently, when a packet is copied, much of the DPDK mbuf's info is lost, such as packet type, ol_flags, etc. This information is very important for DPDK packet processing. Signed-off-by: Michael Qiu --- lib/dp-packet.c | 3 +++ lib/netdev-dpdk.c | 4 2 files changed, 7 insertions(+) diff --git a/lib/dp-packet.c b/lib/dp-packet.c index ee2c449..9f872a1 100644 --- a/lib/dp-packet.c +++ b/lib/dp-packet.c @@ -179,6 +179,9 @@ dp_packet_clone_with_headroom(const struct dp_packet *buffer, size_t headroom) new_buffer->packet_type = buffer->packet_type; #ifdef DPDK_NETDEV new_buffer->mbuf.ol_flags = buffer->mbuf.ol_flags; +new_buffer->mbuf.tx_offload = buffer->mbuf.tx_offload; +new_buffer->mbuf.packet_type = buffer->mbuf.packet_type; +new_buffer->mbuf.nb_segs = buffer->mbuf.nb_segs; #else new_buffer->rss_hash_valid = buffer->rss_hash_valid; #endif diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index bba4de3..0485872 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -1804,6 +1804,10 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch) memcpy(rte_pktmbuf_mtod(pkts[newcnt], void *), dp_packet_data(batch->packets[i]), size); +pkts[newcnt]->nb_segs = batch->packets[i]->mbuf.nb_segs; +pkts[newcnt]->ol_flags = batch->packets[i]->mbuf.ol_flags; +pkts[newcnt]->packet_type = batch->packets[i]->mbuf.packet_type; +pkts[newcnt]->tx_offload = batch->packets[i]->mbuf.tx_offload; rte_pktmbuf_data_len(pkts[newcnt]) = size; rte_pktmbuf_pkt_len(pkts[newcnt]) = size; -- 1.8.3.1 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH 0/5 v3] DPDK multi-segment mbuf support
From: Michael Qiu Currently, OVS only supports DPDK single-segment mbufs, which can lead to problems, for example when a large non-DPDK source packet is transmitted to a DPDK port. Also, OVS doesn't copy enough mbuf info when copying a packet. At the same time, DPDK offloads for VLAN and tunnelling packets, for example TSO, need multi-segment mbuf support. This patchset solves all of the above issues. -- v3 --> v2 rebase code to newest upstream. using MIN() to calculate the data_len instead of if/else v2 --> v1 rebase code to newest upstream. fix some typo in commit log. Michael Qiu (4): lib/dp-packet: init the mbuf to zero when build with DPDK lib/dp-packet: Fix data_len issue with multi-segments lib/dp-packet: copy multi-segments data from DPDK mbuf lib/netdev-dpdk: copy large packet to multi-segment mbufs suzhengwei (1): lib/dp-packet: copy additional packet info when do packet copy lib/dp-packet.c | 33 ++--- lib/dp-packet.h | 18 -- lib/netdev-dpdk.c | 55 +++ 3 files changed, 89 insertions(+), 17 deletions(-) -- 1.8.3.1 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH v2 02/12] openvswitch.h: Use odp_port_t for port numbers in userspace-only structs.
Reviewed-by: nickcooper-zhangtonghao> On Jun 19, 2017, at 7:29 AM, Ben Pfaff wrote: > > Using the correct type reduces the need for type conversions. > > Signed-off-by: Ben Pfaff > > --- > datapath/linux/compat/include/linux/openvswitch.h | 4 ++-- > lib/dpif-netdev.c | 2 +- > lib/netdev.c | 2 +- > ofproto/ofproto-dpif-sflow.c | 2 +- > ofproto/ofproto-dpif-xlate.c | 4 ++-- > 5 files changed, 7 insertions(+), 7 deletions(-) > > diff --git a/datapath/linux/compat/include/linux/openvswitch.h > b/datapath/linux/compat/include/linux/openvswitch.h > index 4c88de1d610d..24e51cb311d2 100644 > --- a/datapath/linux/compat/include/linux/openvswitch.h > +++ b/datapath/linux/compat/include/linux/openvswitch.h > @@ -714,8 +714,8 @@ struct ovs_action_hash { > * this header to build final header according to actual packet parameters. > */ > struct ovs_action_push_tnl { > - uint32_t tnl_port; > - uint32_t out_port; > + odp_port_t tnl_port; > + odp_port_t out_port; > uint32_t header_len; > uint32_t tnl_type; /* For logging. 
*/ > uint32_t header[TNL_PUSH_HEADER_SIZE / 4]; > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c > index 2b65dc74a269..f97e97ab2931 100644 > --- a/lib/dpif-netdev.c > +++ b/lib/dpif-netdev.c > @@ -4956,7 +4956,7 @@ push_tnl_action(const struct dp_netdev_pmd_thread *pmd, > > data = nl_attr_get(attr); > > -tun_port = pmd_tnl_port_cache_lookup(pmd, u32_to_odp(data->tnl_port)); > +tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port); > if (!tun_port) { > err = -EINVAL; > goto error; > diff --git a/lib/netdev.c b/lib/netdev.c > index 001b7b37bb57..765bf4b9ccad 100644 > --- a/lib/netdev.c > +++ b/lib/netdev.c > @@ -831,7 +831,7 @@ netdev_push_header(const struct netdev *netdev, > struct dp_packet *packet; > DP_PACKET_BATCH_FOR_EACH (packet, batch) { > netdev->netdev_class->push_header(packet, data); > -pkt_metadata_init(>md, u32_to_odp(data->out_port)); > +pkt_metadata_init(>md, data->out_port); > } > > return 0; > diff --git a/ofproto/ofproto-dpif-sflow.c b/ofproto/ofproto-dpif-sflow.c > index d9fddb1564b5..fc665a636853 100644 > --- a/ofproto/ofproto-dpif-sflow.c > +++ b/ofproto/ofproto-dpif-sflow.c > @@ -901,7 +901,7 @@ sflow_read_tnl_push_action(const struct nlattr *attr, > const struct ip_header *ip > = ALIGNED_CAST(const struct ip_header *, eth + 1); > > -sflow_actions->out_port = u32_to_odp(data->out_port); > +sflow_actions->out_port = data->out_port; > > /* Ethernet. 
*/ > /* TODO: SFlow does not currently define a MAC-in-MAC > diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c > index e15e3dec3f1c..48c4bad4ac0b 100644 > --- a/ofproto/ofproto-dpif-xlate.c > +++ b/ofproto/ofproto-dpif-xlate.c > @@ -3211,8 +3211,8 @@ build_tunnel_send(struct xlate_ctx *ctx, const struct > xport *xport, > if (err) { > return err; > } > -tnl_push_data.tnl_port = odp_to_u32(tunnel_odp_port); > -tnl_push_data.out_port = odp_to_u32(out_dev->odp_port); > +tnl_push_data.tnl_port = tunnel_odp_port; > +tnl_push_data.out_port = out_dev->odp_port; > > /* After tunnel header has been added, packet_type of flow and base_flow > * need to be set to PT_ETH. */ > -- > 2.10.2 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH v2 01/12] ofp-util: Remove prototype for unimplemented function.
Reviewed-by: nickcooper-zhangtonghao> On Jun 19, 2017, at 7:29 AM, Ben Pfaff wrote: > > Signed-off-by: Ben Pfaff > > --- > include/openvswitch/ofp-util.h | 2 -- > 1 file changed, 2 deletions(-) > > diff --git a/include/openvswitch/ofp-util.h b/include/openvswitch/ofp-util.h > index bbf6ffec5dd3..07723b427ce8 100644 > --- a/include/openvswitch/ofp-util.h > +++ b/include/openvswitch/ofp-util.h > @@ -247,8 +247,6 @@ void ofputil_match_to_ofp10_match(const struct match *, > struct ofp10_match *); > enum ofperr ofputil_pull_ofp11_match(struct ofpbuf *, const struct tun_table > *, > const struct vl_mff_map *, struct match > *, > uint16_t *padded_match_len); > -enum ofperr ofputil_pull_ofp11_mask(struct ofpbuf *, struct match *, > -struct mf_bitmap *bm); > enum ofperr ofputil_match_from_ofp11_match(const struct ofp11_match *, >struct match *); > int ofputil_put_ofp11_match(struct ofpbuf *, const struct match *, > -- > 2.10.2 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH v2 00/12] Packet type aware pipeline
On Mon, Jun 19, 2017 at 07:29:29AM +0800, Ben Pfaff wrote: > This series is based on Zoltan Balogh's series here: > https://patchwork.ozlabs.org/patch/770490/ > https://patchwork.ozlabs.org/patch/770487/ > https://patchwork.ozlabs.org/patch/770495/ > https://patchwork.ozlabs.org/patch/770498/ > https://patchwork.ozlabs.org/patch/770488/ > https://patchwork.ozlabs.org/patch/770489/ > > v1->v2: > - Squash fixup patches. > - Apply changes agreed with Jan. > - Not yet done: Figure out whether to really show packet_type in (some) > match_format() output. > - New patch at the end unsuccessfully tries to re-enable packet-aware > test. Either I don't have enough insight yet, or it just reveals a > bug or two. > - 4 new patches at beginning. First one is trivial. Next 3 are intended > to make it easier to debug the packet aware test that is still failing. > Jan, you don't have to feel obligated to review these if you feel they > are off-topic; I will get separate reviews. A new concern came up while thinking about this series. The OVS_ATTR_PACKET_TYPE does not appear to be implemented in the kernel module, and what's more, because of #ifdefs, OVS_ATTR_PACKET_TYPE will actually have a different value in the kernel module than in userspace. What's the plan here? ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH v2 11/12] userspace: Introduce packet_type in OF 1.5 packet-out
From: Zoltán BaloghIntroducing packet_type in OF 1.5 packet-out. Partly based on Jean Tourrilhes's work. Add test cases for OF1.5 packet-out Add negative test case for OF1.5 packet-out Modify wildcarding and packet-out test printout. Signed-off-by: Jean Tourrilhes Signed-off-by: Zoltan Balogh Co-authored-by: Jan Scheurich Signed-off-by: Ben Pfaff --- lib/flow.c | 36 +++- lib/ofp-parse.c | 13 + lib/ofp-print.c | 4 +- lib/ofp-util.c | 2 + ofproto/ofproto.c | 3 + tests/ofproto.at| 85 + tests/system-userspace-packet-type-aware.at | 2 +- utilities/ovs-ofctl.c | 1 + 8 files changed, 128 insertions(+), 18 deletions(-) diff --git a/lib/flow.c b/lib/flow.c index dbca4d03da3d..75a91cc6a2f3 100644 --- a/lib/flow.c +++ b/lib/flow.c @@ -1441,6 +1441,8 @@ void flow_wildcards_init_for_packet(struct flow_wildcards *wc, const struct flow *flow) { +ovs_be16 dl_type = OVS_BE16_MAX; + memset(>masks, 0x0, sizeof wc->masks); /* Update this function whenever struct flow changes. */ @@ -1493,25 +1495,29 @@ flow_wildcards_init_for_packet(struct flow_wildcards *wc, /* actset_output wildcarded. */ WC_MASK_FIELD(wc, packet_type); -WC_MASK_FIELD(wc, dl_dst); -WC_MASK_FIELD(wc, dl_src); -WC_MASK_FIELD(wc, dl_type); - -/* No need to set mask of inner VLANs that don't exist. */ -for (int i = 0; i < FLOW_MAX_VLAN_HEADERS; i++) { -/* Always show the first zero VLAN. */ -WC_MASK_FIELD(wc, vlans[i]); -if (flow->vlans[i].tci == htons(0)) { -break; +if (flow->packet_type == htonl(PT_ETH)) { +WC_MASK_FIELD(wc, dl_dst); +WC_MASK_FIELD(wc, dl_src); +WC_MASK_FIELD(wc, dl_type); +/* No need to set mask of inner VLANs that don't exist. */ +for (int i = 0; i < FLOW_MAX_VLAN_HEADERS; i++) { +/* Always show the first zero VLAN. 
*/ +WC_MASK_FIELD(wc, vlans[i]); +if (flow->vlans[i].tci == htons(0)) { +break; +} } +dl_type = flow->dl_type; +} else { +dl_type = pt_ns_type_be(flow->packet_type); } -if (flow->dl_type == htons(ETH_TYPE_IP)) { +if (dl_type == htons(ETH_TYPE_IP)) { WC_MASK_FIELD(wc, nw_src); WC_MASK_FIELD(wc, nw_dst); WC_MASK_FIELD(wc, ct_nw_src); WC_MASK_FIELD(wc, ct_nw_dst); -} else if (flow->dl_type == htons(ETH_TYPE_IPV6)) { +} else if (dl_type == htons(ETH_TYPE_IPV6)) { WC_MASK_FIELD(wc, ipv6_src); WC_MASK_FIELD(wc, ipv6_dst); WC_MASK_FIELD(wc, ipv6_label); @@ -1523,15 +1529,15 @@ flow_wildcards_init_for_packet(struct flow_wildcards *wc, WC_MASK_FIELD(wc, ct_ipv6_src); WC_MASK_FIELD(wc, ct_ipv6_dst); } -} else if (flow->dl_type == htons(ETH_TYPE_ARP) || - flow->dl_type == htons(ETH_TYPE_RARP)) { +} else if (dl_type == htons(ETH_TYPE_ARP) || + dl_type == htons(ETH_TYPE_RARP)) { WC_MASK_FIELD(wc, nw_src); WC_MASK_FIELD(wc, nw_dst); WC_MASK_FIELD(wc, nw_proto); WC_MASK_FIELD(wc, arp_sha); WC_MASK_FIELD(wc, arp_tha); return; -} else if (eth_type_mpls(flow->dl_type)) { +} else if (eth_type_mpls(dl_type)) { for (int i = 0; i < FLOW_MAX_MPLS_LABELS; i++) { WC_MASK_FIELD(wc, mpls_lse[i]); if (flow->mpls_lse[i] & htonl(MPLS_BOS_MASK)) { diff --git a/lib/ofp-parse.c b/lib/ofp-parse.c index 8e2448b20dbd..528b75b4f4e1 100644 --- a/lib/ofp-parse.c +++ b/lib/ofp-parse.c @@ -667,6 +667,19 @@ parse_ofp_packet_out_str__(struct ofputil_packet_out *po, char *string, goto out; } match_set_in_port(>flow_metadata, in_port); +} else if (!strcmp(name, "packet_type")) { +char *ns = value; +char *ns_type = strstr(value, ","); +if (ns_type) { +ovs_be32 packet_type; +*ns_type = '\0'; +packet_type = PACKET_TYPE_BE(strtoul(ns, NULL, 0), + strtoul(++ns_type, NULL, 0)); +match_set_packet_type(>flow_metadata, packet_type); +} else { +error = xasprintf("%s(%s) can't be interpreted", name, value); +goto out; +} } else if (!strcmp(name, "packet")) { const char *error_msg = eth_from_hex(value, ); if (error_msg) 
{ diff --git a/lib/ofp-print.c b/lib/ofp-print.c index 8a6c54e1da0f..4370cb5221fc 100644 --- a/lib/ofp-print.c +++
[ovs-dev] [PATCH v2 07/12] userspace: Add OXM field MFF_PACKET_TYPE
From: Jan ScheurichAllow packet type namespace OFPHTN_ETHERTYPE as alternative pre-requisite for matching L3 protocols (MPLS, IP, IPv6, ARP etc). Change the meta-flow definition of packet_type field to use the new custom format MFS_PACKET_TYPE representing "(NS,NS_TYPE)". Parsing routine for MFS_PACKET_TYPE added to meta-flow.c. Formatting routine for field packet_type extracted from match_format() and moved to flow.c to be used from meta-flow.c for formatting MFS_PACKET_TYPE. Updated the ovs-fields man page source meta-flow.xml with documentation for packet-type-aware bridges and added documentation for field packet_type. Added packet_type to the matching properties in tests/ofproto.at. Should be removed later, when packet_type_aware bridge attribute will be introduced. Signed-off-by: Jan Scheurich Signed-off-by: Ben Pfaff --- build-aux/extract-ofp-fields| 3 +- include/openvswitch/match.h | 5 + include/openvswitch/meta-flow.h | 20 lib/flow.c | 34 +- lib/flow.h | 27 +++-- lib/learn.c | 1 + lib/match.c | 98 +++-- lib/meta-flow.c | 86 +-- lib/meta-flow.xml | 156 +++ lib/nx-match.c | 34 +- lib/odp-util.c | 38 +++ lib/ofp-parse.c | 12 +++ lib/ofp-util.c | 67 +--- ofproto/ofproto-dpif-xlate.c| 1 + ofproto/tunnel.c| 2 - tests/dpif-netdev.at| 89 tests/odp.at| 1 + tests/ofproto-dpif.at | 230 tests/ofproto.at| 1 + tests/ovs-ofctl.at | 2 +- tests/pmd.at| 8 +- tests/tunnel-push-pop-ipv6.at | 2 +- tests/tunnel-push-pop.at| 2 +- tests/tunnel.at | 18 ++-- 24 files changed, 653 insertions(+), 284 deletions(-) diff --git a/build-aux/extract-ofp-fields b/build-aux/extract-ofp-fields index d5b8a820251e..24dd756ad7d5 100755 --- a/build-aux/extract-ofp-fields +++ b/build-aux/extract-ofp-fields @@ -36,7 +36,8 @@ FORMATTING = {"decimal":("MFS_DECIMAL", 1, 8), "OpenFlow 1.1+ port": ("MFS_OFP_PORT_OXM", 4, 4), "frag": ("MFS_FRAG", 1, 1), "tunnel flags": ("MFS_TNL_FLAGS",2, 2), - "TCP flags": ("MFS_TCP_FLAGS",2, 2)} + "TCP flags": ("MFS_TCP_FLAGS",2, 2), + "packet type":("MFS_PACKET_TYPE", 
4, 4)} PREREQS = {"none": "MFP_NONE", "Ethernet": "MFP_ETHERNET", diff --git a/include/openvswitch/match.h b/include/openvswitch/match.h index 70da928fe47d..aca725265c79 100644 --- a/include/openvswitch/match.h +++ b/include/openvswitch/match.h @@ -23,6 +23,7 @@ struct ds; struct ofputil_port_map; +struct mf_field; /* A flow classification match. * @@ -119,6 +120,10 @@ void match_set_ct_ipv6_dst_masked(struct match *, const struct in6_addr *, const struct in6_addr *); void match_set_packet_type(struct match *, ovs_be32 packet_type); +void match_set_default_packet_type(struct match *); +bool match_has_default_packet_type(const struct match *); +void match_add_ethernet_prereq(struct match *, const struct mf_field *); + void match_set_skb_priority(struct match *, uint32_t skb_priority); void match_set_dl_type(struct match *, ovs_be16); void match_set_dl_src(struct match *, const struct eth_addr ); diff --git a/include/openvswitch/meta-flow.h b/include/openvswitch/meta-flow.h index cbfd3ba65b73..fc109501d869 100644 --- a/include/openvswitch/meta-flow.h +++ b/include/openvswitch/meta-flow.h @@ -133,6 +133,11 @@ struct ofputil_tlv_table_mod; * * TCP flags: See the description of tcp_flags in ovs-ofctl(8). * + * packet type: A pair of packet type namespace NS and NS_TYPE within + * that namespace "(NS,NS_TYPE)". NS and NS_TYPE are formatted in + * decimal or hexadecimal as and accept decimal and hexadecimal (with + * 0x prefix) at parsing. + * * Prerequisites: * * The field's prerequisites. The values should be straightfoward. @@ -248,6 +253,20 @@ enum OVS_PACKED_ENUM mf_field_id { */ MFF_RECIRC_ID, +/* "packet_type". + * + * Define the packet type in OpenFlow 1.5+. + * + * Type: be32. + * Maskable: no. + * Formatting: packet type. + * Prerequisites: none. + * Access: read-only. + * NXM: none. + * OXM: OXM_OF_PACKET_TYPE(44) since OF1.5 and v2.8. + */ +MFF_PACKET_TYPE, + /* "conj_id". * * ID for "conjunction" actions. 
Please refer to ovs-ofctl(8) @@ -1860,6 +1879,7 @@ enum OVS_PACKED_ENUM
[ovs-dev] [PATCH v2 06/12] nx-match: Add context argument to nxm_put__().
An upcoming commit will need to pass an extra piece of data from nx_put_raw() into all of its direct and indirect calls to nxm_put__(). This commit prepares for that by switching from a "struct ofpbuf *" parameter to a context structure that, currently, contains just a struct ofpbuf *. The upcoming commit will add another member to the context struct. This commit has no visible effect on behavior. Signed-off-by: Ben Pfaff--- lib/nx-match.c | 232 + lib/nx-match.h | 6 +- lib/tun-metadata.c | 4 +- 3 files changed, 131 insertions(+), 111 deletions(-) diff --git a/lib/nx-match.c b/lib/nx-match.c index 334ecd4a3f1a..6278b7758783 100644 --- a/lib/nx-match.c +++ b/lib/nx-match.c @@ -772,203 +772,222 @@ oxm_pull_field_array(const void *fields_data, size_t fields_len, * 'put' functions whose names end in 'm' add a field that might be wildcarded. * Other 'put' functions add exact-match fields. */ + +struct nxm_put_ctx { +struct ofpbuf *output; +}; + void -nxm_put__(struct ofpbuf *b, enum mf_field_id field, enum ofp_version version, - const void *value, const void *mask, size_t n_bytes) +nxm_put_entry_raw(struct ofpbuf *b, + enum mf_field_id field, enum ofp_version version, + const void *value, const void *mask, size_t n_bytes) { nx_put_header_len(b, field, version, !!mask, n_bytes); ofpbuf_put(b, value, n_bytes); if (mask) { ofpbuf_put(b, mask, n_bytes); } +} +static void +nxm_put__(struct nxm_put_ctx *ctx, + enum mf_field_id field, enum ofp_version version, + const void *value, const void *mask, size_t n_bytes) +{ +nxm_put_entry_raw(ctx->output, field, version, value, mask, n_bytes); } static void -nxm_put(struct ofpbuf *b, enum mf_field_id field, enum ofp_version version, +nxm_put(struct nxm_put_ctx *ctx, +enum mf_field_id field, enum ofp_version version, const void *value, const void *mask, size_t n_bytes) { if (!is_all_zeros(mask, n_bytes)) { bool masked = !is_all_ones(mask, n_bytes); -nxm_put__(b, field, version, value, masked ? 
mask : NULL, n_bytes); +nxm_put__(ctx, field, version, value, masked ? mask : NULL, n_bytes); } } static void -nxm_put_8m(struct ofpbuf *b, enum mf_field_id field, enum ofp_version version, +nxm_put_8m(struct nxm_put_ctx *ctx, + enum mf_field_id field, enum ofp_version version, uint8_t value, uint8_t mask) { -nxm_put(b, field, version, , , sizeof value); +nxm_put(ctx, field, version, , , sizeof value); } static void -nxm_put_8(struct ofpbuf *b, enum mf_field_id field, enum ofp_version version, - uint8_t value) +nxm_put_8(struct nxm_put_ctx *ctx, + enum mf_field_id field, enum ofp_version version, uint8_t value) { -nxm_put__(b, field, version, , NULL, sizeof value); +nxm_put__(ctx, field, version, , NULL, sizeof value); } static void -nxm_put_16m(struct ofpbuf *b, enum mf_field_id field, enum ofp_version version, +nxm_put_16m(struct nxm_put_ctx *ctx, +enum mf_field_id field, enum ofp_version version, ovs_be16 value, ovs_be16 mask) { -nxm_put(b, field, version, , , sizeof value); +nxm_put(ctx, field, version, , , sizeof value); } static void -nxm_put_16(struct ofpbuf *b, enum mf_field_id field, enum ofp_version version, - ovs_be16 value) +nxm_put_16(struct nxm_put_ctx *ctx, + enum mf_field_id field, enum ofp_version version, ovs_be16 value) { -nxm_put__(b, field, version, , NULL, sizeof value); +nxm_put__(ctx, field, version, , NULL, sizeof value); } static void -nxm_put_32m(struct ofpbuf *b, enum mf_field_id field, enum ofp_version version, +nxm_put_32m(struct nxm_put_ctx *ctx, +enum mf_field_id field, enum ofp_version version, ovs_be32 value, ovs_be32 mask) { -nxm_put(b, field, version, , , sizeof value); +nxm_put(ctx, field, version, , , sizeof value); } static void -nxm_put_32(struct ofpbuf *b, enum mf_field_id field, enum ofp_version version, - ovs_be32 value) +nxm_put_32(struct nxm_put_ctx *ctx, + enum mf_field_id field, enum ofp_version version, ovs_be32 value) { -nxm_put__(b, field, version, , NULL, sizeof value); +nxm_put__(ctx, field, version, , NULL, 
sizeof value); } static void -nxm_put_64m(struct ofpbuf *b, enum mf_field_id field, enum ofp_version version, +nxm_put_64m(struct nxm_put_ctx *ctx, +enum mf_field_id field, enum ofp_version version, ovs_be64 value, ovs_be64 mask) { -nxm_put(b, field, version, , , sizeof value); +nxm_put(ctx, field, version, , , sizeof value); } static void -nxm_put_128m(struct ofpbuf *b, +nxm_put_128m(struct nxm_put_ctx *ctx, enum mf_field_id field, enum ofp_version version, const ovs_be128 value, const ovs_be128 mask) { -
[ovs-dev] [PATCH v2 08/12] userspace: Handling of versatile tunnel ports
In netdev_gre_build_header(), the GRE protocol and the VXLAN next_protocol are set based on the packet_type of the flow. For an Ethernet packet, it is set to ETH_TYPE_TEB. Otherwise, if the namespace is OFPHTN_ETHERTYPE, it is set according to the namespace type. Signed-off-by: Jan Scheurich Signed-off-by: Ben Pfaff --- NEWS | 6 +-- lib/meta-flow.xml | 28 ++- lib/netdev-bsd.c | 1 + lib/netdev-dpdk.c | 1 + lib/netdev-dummy.c| 1 + lib/netdev-linux.c| 1 + lib/netdev-native-tnl.c | 23 ++--- lib/netdev-provider.h | 6 +++ lib/netdev-vport.c| 106 ++ lib/netdev-vport.h| 1 - lib/netdev.c | 8 lib/netdev.h | 29 +++- ofproto/ofproto-dpif-xlate.c | 35 -- ofproto/ofproto-dpif.c| 4 +- ofproto/tunnel.c | 27 --- tests/tunnel-push-pop-ipv6.at | 4 +- tests/tunnel-push-pop.at | 4 +- vswitchd/vswitch.xml | 94 ++--- 18 files changed, 277 insertions(+), 102 deletions(-) diff --git a/NEWS b/NEWS index a2f5a6dc8e54..8b0ad6191325 100644 --- a/NEWS +++ b/NEWS @@ -59,11 +59,9 @@ Post-v2.7.0 * OVN services are no longer restarted automatically after upgrade. - Add --cleanup option to command 'ovs-appctl exit' (see ovs-vswitchd(8)). - L3 tunneling: - * Add "layer3" options for tunnel ports that support non-Ethernet (L3) - payload (GRE, VXLAN-GPE). + * Use new tunnel port option "packet_type" to configure L2 vs. L3. * New vxlan tunnel extension "gpe" to support VXLAN-GPE tunnels. - * Transparently pop and push Ethernet headers at transmit/reception - of packets to/from L3 tunnels. + * New support for non-Ethernet (L3) payloads in GRE and VXLAN-GPE. - The BFD detection multiplier is now user-configurable. - New support for HW offloading * HW offloading is disabled by default. diff --git a/lib/meta-flow.xml b/lib/meta-flow.xml index 856e1ba8cf7b..dc2731e2a260 100644 --- a/lib/meta-flow.xml +++ b/lib/meta-flow.xml @@ -26,19 +26,25 @@ networking technology in use are called called root fields. 
Open vSwitch 2.7 and earlier considered Ethernet fields to be root fields, and this remains the default mode of operation for Open vSwitch bridges. -In this mode, when a packet is received from a non-Ethernet interfaces, -such as a layer-3 LISP or GRE tunnel, Open vSwitch force-fits it to this +When a packet is received from a non-Ethernet interfaces, such as a layer-3 +LISP tunnel, Open vSwitch 2.7 and earlier force-fit the packet to this Ethernet-centric point of view by pretending that an Ethernet header is present whose Ethernet type that indicates the packet's actual type (and whose source and destination addresses are all-zero). -Open vSwitch 2.8 and later supports the ``packet type-aware pipeline'' -concept introduced in OpenFlow 1.5. A bridge configured to be packet -type-aware can handle packets of multiple networking technologies, such as -Ethernet, IP, ARP, MPLS, or NSH in parallel. Such a bridge does not have -any root fields. +Open vSwitch 2.8 and later implement the ``packet type-aware pipeline'' +concept introduced in OpenFlow 1.5. Such a pipeline does not have any root +fields. Instead, a new metadata field, , +indicates the basic type of the packet, which can be Ethernet, IPv4, IPv6, +or another type. For backward compatibility, by default Open vSwitch 2.8 +imitates the behavior of Open vSwitch 2.7 and earlier. Later versions of +Open vSwitch may change the default, and in the meantime controllers can +turn off this legacy behavior by setting +other-config:packet-type to ptap in the +Bridge table. (See ovs-vwitchd.conf.db(5) for +more information.) @@ -332,14 +338,6 @@ tcp,tp_src=0x07c0/0xfff0 mplsm eth_type=0x8848 - -These shorthand notations continue to work in packet type-aware bridges. -The absence of a packet_type match implies -packet_type=ethernet, so that shorthands match on Ethernet -packets with the implied eth_type. Please note that the shorthand -ip does not match packets of packet_type (1,0x800) for IPv4. 
- - Evolution of OpenFlow Fields diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c index f863a189cd5e..6cc83d347795 100644 --- a/lib/netdev-bsd.c +++ b/lib/netdev-bsd.c @@ -1517,6 +1517,7 @@ netdev_bsd_update_flags(struct netdev *netdev_, enum netdev_flags off, \ GET_FEATURES,\ NULL, /* set_advertisement */\ +NULL, /* get_pt_mode */ \ NULL,
[ovs-dev] [PATCH v2 05/12] ofpbuf: New function ofpbuf_insert().
This will receive its first users in an upcoming commit. Signed-off-by: Ben Pfaff--- include/openvswitch/ofpbuf.h | 1 + lib/ofpbuf.c | 18 ++ 2 files changed, 19 insertions(+) diff --git a/include/openvswitch/ofpbuf.h b/include/openvswitch/ofpbuf.h index bc25bb8a1780..6142f4a588e1 100644 --- a/include/openvswitch/ofpbuf.h +++ b/include/openvswitch/ofpbuf.h @@ -141,6 +141,7 @@ void ofpbuf_reserve(struct ofpbuf *, size_t); void *ofpbuf_push_uninit(struct ofpbuf *b, size_t); void *ofpbuf_push_zeros(struct ofpbuf *, size_t); void *ofpbuf_push(struct ofpbuf *b, const void *, size_t); +void ofpbuf_insert(struct ofpbuf *b, size_t offset, const void *data, size_t); static inline size_t ofpbuf_headroom(const struct ofpbuf *); static inline size_t ofpbuf_tailroom(const struct ofpbuf *); diff --git a/lib/ofpbuf.c b/lib/ofpbuf.c index f4a9040646ef..9c0623688f16 100644 --- a/lib/ofpbuf.c +++ b/lib/ofpbuf.c @@ -461,6 +461,24 @@ ofpbuf_push(struct ofpbuf *b, const void *p, size_t size) return dst; } +/* Inserts the 'n' bytes of 'data' into 'b' starting at the given 'offset', + * moving data forward as necessary to make room. + * + * 'data' must not point inside 'b'. */ +void +ofpbuf_insert(struct ofpbuf *b, size_t offset, const void *data, size_t n) +{ +if (offset < b->size) { +ofpbuf_put_uninit(b, n); +memmove((char *) b->data + offset + n, (char *) b->data + offset, +b->size - offset); +memcpy((char *) b->data + offset, data, n); +} else { +ovs_assert(offset == b->size); +ofpbuf_put(b, data, n); +} +} + /* Returns the data in 'b' as a block of malloc()'d memory and frees the buffer * within 'b'. (If 'b' itself was dynamically allocated, e.g. with * ofpbuf_new(), then it should still be freed with, e.g., ofpbuf_delete().) */ -- 2.10.2 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH v2 02/12] openvswitch.h: Use odp_port_t for port numbers in userspace-only structs.
Using the correct type reduces the need for type conversions. Signed-off-by: Ben Pfaff--- datapath/linux/compat/include/linux/openvswitch.h | 4 ++-- lib/dpif-netdev.c | 2 +- lib/netdev.c | 2 +- ofproto/ofproto-dpif-sflow.c | 2 +- ofproto/ofproto-dpif-xlate.c | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/datapath/linux/compat/include/linux/openvswitch.h b/datapath/linux/compat/include/linux/openvswitch.h index 4c88de1d610d..24e51cb311d2 100644 --- a/datapath/linux/compat/include/linux/openvswitch.h +++ b/datapath/linux/compat/include/linux/openvswitch.h @@ -714,8 +714,8 @@ struct ovs_action_hash { * this header to build final header according to actual packet parameters. */ struct ovs_action_push_tnl { - uint32_t tnl_port; - uint32_t out_port; + odp_port_t tnl_port; + odp_port_t out_port; uint32_t header_len; uint32_t tnl_type; /* For logging. */ uint32_t header[TNL_PUSH_HEADER_SIZE / 4]; diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 2b65dc74a269..f97e97ab2931 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -4956,7 +4956,7 @@ push_tnl_action(const struct dp_netdev_pmd_thread *pmd, data = nl_attr_get(attr); -tun_port = pmd_tnl_port_cache_lookup(pmd, u32_to_odp(data->tnl_port)); +tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port); if (!tun_port) { err = -EINVAL; goto error; diff --git a/lib/netdev.c b/lib/netdev.c index 001b7b37bb57..765bf4b9ccad 100644 --- a/lib/netdev.c +++ b/lib/netdev.c @@ -831,7 +831,7 @@ netdev_push_header(const struct netdev *netdev, struct dp_packet *packet; DP_PACKET_BATCH_FOR_EACH (packet, batch) { netdev->netdev_class->push_header(packet, data); -pkt_metadata_init(>md, u32_to_odp(data->out_port)); +pkt_metadata_init(>md, data->out_port); } return 0; diff --git a/ofproto/ofproto-dpif-sflow.c b/ofproto/ofproto-dpif-sflow.c index d9fddb1564b5..fc665a636853 100644 --- a/ofproto/ofproto-dpif-sflow.c +++ b/ofproto/ofproto-dpif-sflow.c @@ -901,7 +901,7 @@ sflow_read_tnl_push_action(const struct 
nlattr *attr, const struct ip_header *ip = ALIGNED_CAST(const struct ip_header *, eth + 1); -sflow_actions->out_port = u32_to_odp(data->out_port); +sflow_actions->out_port = data->out_port; /* Ethernet. */ /* TODO: SFlow does not currently define a MAC-in-MAC diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index e15e3dec3f1c..48c4bad4ac0b 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -3211,8 +3211,8 @@ build_tunnel_send(struct xlate_ctx *ctx, const struct xport *xport, if (err) { return err; } -tnl_push_data.tnl_port = odp_to_u32(tunnel_odp_port); -tnl_push_data.out_port = odp_to_u32(out_dev->odp_port); +tnl_push_data.tnl_port = tunnel_odp_port; +tnl_push_data.out_port = out_dev->odp_port; /* After tunnel header has been added, packet_type of flow and base_flow * need to be set to PT_ETH. */ -- 2.10.2 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH v2 03/12] ovs-dpctl: New --names option to use port names in flow dumps.
Until now, printing names in "ovs-dpctl dump-flows" was tied to the overall output verbosity, which in practice meant that to see port names a user had to see a distracting amount of verbosity. This decouples names from verbosity. I'd like to make showing names the default for interactive usage, but so far names aren't accepted in input so that would frustrate cut-and-paste, which is an important use of "ovs-dpctl dump-flows" output. Signed-off-by: Ben Pfaff--- lib/dpctl.c | 76 ++-- lib/dpctl.h | 3 ++ lib/dpctl.man| 7 +++-- lib/odp-util.c | 4 +-- ofproto/ofproto-dpif.c | 48 +- utilities/ovs-dpctl.8.in | 9 +- utilities/ovs-dpctl.c| 20 + 7 files changed, 125 insertions(+), 42 deletions(-) diff --git a/lib/dpctl.c b/lib/dpctl.c index 7f44d025dcf1..1e3bf0a517db 100644 --- a/lib/dpctl.c +++ b/lib/dpctl.c @@ -49,6 +49,8 @@ #include "unixctl.h" #include "util.h" #include "openvswitch/ofp-parse.h" +#include "openvswitch/vlog.h" +VLOG_DEFINE_THIS_MODULE(dpctl); typedef int dpctl_command_handler(int argc, const char *argv[], struct dpctl_params *); @@ -762,6 +764,36 @@ static char *supported_dump_types[] = { "ovs", }; +static struct hmap * +dpctl_get_portno_names(struct dpif *dpif, const struct dpctl_params *dpctl_p) +{ +if (dpctl_p->names) { +struct hmap *portno_names = xmalloc(sizeof *portno_names); +hmap_init(portno_names); + +struct dpif_port_dump port_dump; +struct dpif_port dpif_port; +DPIF_PORT_FOR_EACH (_port, _dump, dpif) { +odp_portno_names_set(portno_names, dpif_port.port_no, + dpif_port.name); +} + +return portno_names; +} else { +return NULL; +} +} + +static void +dpctl_free_portno_names(struct hmap *portno_names) +{ +if (portno_names) { +odp_portno_names_destroy(portno_names); +hmap_destroy(portno_names); +free(portno_names); +} +} + static int dpctl_dump_flows(int argc, const char *argv[], struct dpctl_params *dpctl_p) { @@ -774,10 +806,6 @@ dpctl_dump_flows(int argc, const char *argv[], struct dpctl_params *dpctl_p) struct flow flow_filter; struct 
flow_wildcards wc_filter; -struct dpif_port_dump port_dump; -struct dpif_port dpif_port; -struct hmap portno_names; - struct dpif_flow_dump_thread *flow_dump_thread; struct dpif_flow_dump *flow_dump; struct dpif_flow f; @@ -807,15 +835,14 @@ dpctl_dump_flows(int argc, const char *argv[], struct dpctl_params *dpctl_p) goto out_free; } - -hmap_init(_names); -DPIF_PORT_FOR_EACH (_port, _dump, dpif) { -odp_portno_names_set(_names, dpif_port.port_no, dpif_port.name); -} +struct hmap *portno_names = dpctl_get_portno_names(dpif, dpctl_p); if (filter) { struct ofputil_port_map port_map; ofputil_port_map_init(_map); + +struct dpif_port_dump port_dump; +struct dpif_port dpif_port; DPIF_PORT_FOR_EACH (_port, _dump, dpif) { ofputil_port_map_put(_map, u16_to_ofp(odp_to_u32(dpif_port.port_no)), @@ -890,7 +917,7 @@ dpctl_dump_flows(int argc, const char *argv[], struct dpctl_params *dpctl_p) } pmd_id = f.pmd_id; } -format_dpif_flow(, , _names, type, dpctl_p); +format_dpif_flow(, , portno_names, type, dpctl_p); dpctl_print(dpctl_p, "%s\n", ds_cstr()); } @@ -903,8 +930,7 @@ dpctl_dump_flows(int argc, const char *argv[], struct dpctl_params *dpctl_p) ds_destroy(); out_dpifclose: -odp_portno_names_destroy(_names); -hmap_destroy(_names); +dpctl_free_portno_names(portno_names); dpif_close(dpif); out_free: free(filter); @@ -1032,11 +1058,8 @@ dpctl_get_flow(int argc, const char *argv[], struct dpctl_params *dpctl_p) { const char *key_s = argv[argc - 1]; struct dpif_flow flow; -struct dpif_port dpif_port; -struct dpif_port_dump port_dump; struct dpif *dpif; char *dp_name; -struct hmap portno_names; ovs_u128 ufid; struct ofpbuf buf; uint64_t stub[DPIF_FLOW_BUFSIZE / 8]; @@ -1055,10 +1078,8 @@ dpctl_get_flow(int argc, const char *argv[], struct dpctl_params *dpctl_p) } ofpbuf_use_stub(, , sizeof stub); -hmap_init(_names); -DPIF_PORT_FOR_EACH (_port, _dump, dpif) { -odp_portno_names_set(_names, dpif_port.port_no, dpif_port.name); -} + +struct hmap *portno_names = dpctl_get_portno_names(dpif, 
dpctl_p); n = odp_ufid_from_string(key_s, ); if (n <= 0) { @@ -1074,13 +1095,12 @@ dpctl_get_flow(int argc, const char *argv[], struct dpctl_params *dpctl_p) } ds_init(); -format_dpif_flow(, , _names, NULL, dpctl_p); +
[ovs-dev] [PATCH v2 01/12] ofp-util: Remove prototype for unimplemented function.
Signed-off-by: Ben Pfaff--- include/openvswitch/ofp-util.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/openvswitch/ofp-util.h b/include/openvswitch/ofp-util.h index bbf6ffec5dd3..07723b427ce8 100644 --- a/include/openvswitch/ofp-util.h +++ b/include/openvswitch/ofp-util.h @@ -247,8 +247,6 @@ void ofputil_match_to_ofp10_match(const struct match *, struct ofp10_match *); enum ofperr ofputil_pull_ofp11_match(struct ofpbuf *, const struct tun_table *, const struct vl_mff_map *, struct match *, uint16_t *padded_match_len); -enum ofperr ofputil_pull_ofp11_mask(struct ofpbuf *, struct match *, -struct mf_bitmap *bm); enum ofperr ofputil_match_from_ofp11_match(const struct ofp11_match *, struct match *); int ofputil_put_ofp11_match(struct ofpbuf *, const struct match *, -- 2.10.2 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH v2 00/12] Packet type aware pipeline
This series is based on Zoltan Balogh's series here: https://patchwork.ozlabs.org/patch/770490/ https://patchwork.ozlabs.org/patch/770487/ https://patchwork.ozlabs.org/patch/770495/ https://patchwork.ozlabs.org/patch/770498/ https://patchwork.ozlabs.org/patch/770488/ https://patchwork.ozlabs.org/patch/770489/ v1->v2: - Squash fixup patches. - Apply changes agreed with Jan. - Not yet done: Figure out whether to really show packet_type in (some) match_format() output. - New patch at the end unsuccessfully tries to re-enable packet-aware test. Either I don't have enough insight yet, or it just reveals a bug or two. - 4 new patches at beginning. First one is trivial. Next 3 are intended to make it easier to debug the packet aware test that is still failing. Jan, you don't have to feel obligated to review these if you feel they are off-topic; I will get separate reviews. Ben Pfaff (8): ofp-util: Remove prototype for unimplemented function. openvswitch.h: Use odp_port_t for port numbers in userspace-only structs. ovs-dpctl: New --names option to use port names in flow dumps. odp-util: Use port names in output in more places. ofpbuf: New function ofpbuf_insert(). nx-match: Add context argument to nxm_put__(). 
userspace: Handling of versatile tunnel ports work on packet aware test Jan Scheurich (3): userspace: Add OXM field MFF_PACKET_TYPE tests: Added unit tests in packet-type-aware.at userspace: Complete Packet In handling Zoltán Balogh (1): userspace: Introduce packet_type in OF 1.5 packet-out NEWS | 6 +- build-aux/extract-ofp-fields | 3 +- datapath/linux/compat/include/linux/openvswitch.h | 4 +- include/openvswitch/match.h | 5 + include/openvswitch/meta-flow.h | 20 + include/openvswitch/ofp-util.h| 2 - include/openvswitch/ofpbuf.h | 1 + lib/dpctl.c | 84 ++-- lib/dpctl.h | 3 + lib/dpctl.man | 7 +- lib/dpif-netdev.c | 4 +- lib/dpif.c| 4 +- lib/flow.c| 74 +++- lib/flow.h| 27 +- lib/learn.c | 1 + lib/match.c | 98 +++-- lib/meta-flow.c | 86 +++- lib/meta-flow.xml | 154 ++- lib/netdev-bsd.c | 1 + lib/netdev-dpdk.c | 1 + lib/netdev-dummy.c| 1 + lib/netdev-linux.c| 1 + lib/netdev-native-tnl.c | 23 +- lib/netdev-provider.h | 6 + lib/netdev-vport.c| 106 +++-- lib/netdev-vport.h| 1 - lib/netdev.c | 10 +- lib/netdev.h | 29 +- lib/nx-match.c| 264 +++- lib/nx-match.h| 6 +- lib/odp-util.c| 138 +++--- lib/odp-util.h| 5 +- lib/ofp-parse.c | 25 ++ lib/ofp-print.c | 11 +- lib/ofp-util.c| 69 ++- lib/ofpbuf.c | 18 + lib/tun-metadata.c| 4 +- ofproto/ofproto-dpif-sflow.c | 2 +- ofproto/ofproto-dpif-trace.c | 2 +- ofproto/ofproto-dpif-xlate.c | 45 +- ofproto/ofproto-dpif.c| 54 ++- ofproto/ofproto.c | 3 + ofproto/tunnel.c | 29 +- tests/automake.mk | 6 +- tests/dpif-netdev.at | 89 ++-- tests/odp.at | 1 + tests/ofproto-dpif.at | 230 +- tests/ofproto.at | 86 tests/ovs-ofctl.at| 2 +- tests/packet-type-aware.at| 484 ++ tests/pmd.at | 8 +- tests/system-userspace-packet-type-aware.at | 422 +++ tests/system-userspace-testsuite.at | 1 + tests/test-odp.c | 2 +- tests/testsuite.at| 1 + tests/tunnel-push-pop-ipv6.at
[ovs-dev] Darlehen Angebot bei 2% pro Jahr
Wir bieten Darlehensfonds zu einem Jahreszins von 2% an. Es gibt keine Bonitätsprüfung Geschichte. Vertrauen Sie uns mit allen Arten von Darlehen umzugehen. Interessierte Kandidaten sollten uns heute für ihre schnelle Online-und leichte Darlehen und keine Sicherheiten Einlagen kontaktieren. E-mail: williaminvestmentllc...@gmail.com Lender: Roush W. James Handy: +1 (516)341-6554 --- This email has been checked for viruses by Avast antivirus software. https://www.avast.com/antivirus ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH 1/2] checkpatch: Suggest ovs_assert() to author.
Suggest that the author use ovs_assert(), the OVS wrapper for assert(). Signed-off-by: Bhanuprakash Bodireddy --- utilities/checkpatch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utilities/checkpatch.py b/utilities/checkpatch.py index b45a255..304d2fd 100755 --- a/utilities/checkpatch.py +++ b/utilities/checkpatch.py @@ -256,6 +256,7 @@ std_functions = [ ('strerror', 'Use ovs_strerror() in place of strerror()'), ('sleep', 'Use xsleep() in place of sleep()'), ('abort', 'Use ovs_abort() in place of abort()'), +('assert', 'Use ovs_assert() in place of assert()'), ('error', 'Use ovs_error() in place of error()'), ] checks += [ -- 2.4.11 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH 5/6] netdev-dpdk: Add netdev_dpdk_vhost_txq_flush function.
Add netdev_dpdk_vhost_txq_flush(), that flushes packets on vHost User port queues. Also add netdev_dpdk_vhost_tx_burst() function that uses rte_vhost_enqueue_burst() to enqueue burst of packets on vHost User ports. Signed-off-by: Bhanuprakash BodireddySigned-off-by: Antonio Fischetti Co-authored-by: Antonio Fischetti Acked-by: Eelco Chaudron --- lib/netdev-dpdk.c | 76 --- 1 file changed, 72 insertions(+), 4 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 50a9a2c..47343e8 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -307,12 +307,22 @@ struct dpdk_tx_queue { * pmd threads (see 'concurrent_txq'). */ int map; /* Mapping of configured vhost-user queues * to enabled by guest. */ -int dpdk_pkt_cnt; /* Number of buffered packets waiting to +union { +int dpdk_pkt_cnt; /* Number of buffered packets waiting to be sent on DPDK tx queue. */ -struct rte_mbuf *dpdk_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD]; +int vhost_pkt_cnt; /* Number of buffered packets waiting to + be sent on vhost port. */ +}; + +union { +struct rte_mbuf *dpdk_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD]; /* Intermediate queue where packets can * be buffered to amortize the cost of MMIO * writes. */ +struct dp_packet *vhost_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD]; + /* Intermediate queue where packets can +* be buffered for vhost ports. 
*/ +}; }; /* dpdk has no way to remove dpdk ring ethernet devices @@ -1719,6 +1729,63 @@ netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats, } } +static int +netdev_dpdk_vhost_tx_burst(struct netdev_dpdk *dev, int qid) +{ +struct dpdk_tx_queue *txq = >tx_q[qid]; +struct rte_mbuf **cur_pkts = (struct rte_mbuf **)txq->vhost_burst_pkts; + +int tx_vid = netdev_dpdk_get_vid(dev); +int tx_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ; +uint32_t sent = 0; +uint32_t retries = 0; +uint32_t sum, total_pkts; + +total_pkts = sum = txq->vhost_pkt_cnt; +do { +uint32_t ret; +ret = rte_vhost_enqueue_burst(tx_vid, tx_qid, _pkts[sent], sum); +if (OVS_UNLIKELY(!ret)) { +/* No packets enqueued - do not retry. */ +break; +} else { +/* Packet have been sent */ +sent += ret; + +/* 'sum' packet have to be retransmitted */ +sum -= ret; +} +} while (sum && (retries++ < VHOST_ENQ_RETRY_NUM)); + +for (int i = 0; i < total_pkts; i++) { +dp_packet_delete(txq->vhost_burst_pkts[i]); +} + +/* Reset pkt count */ +txq->vhost_pkt_cnt = 0; + +/* 'sum' refers to packets dropped */ +return sum; +} + +/* Flush the txq if there are any packets available. + * dynamic_txqs/concurrent_txq is disabled for vHost User ports as + * 'OVS_VHOST_MAX_QUEUE_NUM' txqs are preallocated. 
+ */ +static int +netdev_dpdk_vhost_txq_flush(struct netdev *netdev, int qid, +bool concurrent_txq OVS_UNUSED) +{ +struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); +struct dpdk_tx_queue *txq = >tx_q[qid]; + +if (OVS_LIKELY(txq->vhost_pkt_cnt)) { +netdev_dpdk_vhost_tx_burst(dev, qid); +} + +return 0; +} + static void __netdev_dpdk_vhost_send(struct netdev *netdev, int qid, struct dp_packet **pkts, int cnt) @@ -3432,7 +3499,8 @@ static const struct netdev_class dpdk_vhost_class = NULL, netdev_dpdk_vhost_reconfigure, netdev_dpdk_vhost_rxq_recv, -NULL); +netdev_dpdk_vhost_txq_flush); + static const struct netdev_class dpdk_vhost_client_class = NETDEV_DPDK_CLASS( "dpdkvhostuserclient", @@ -3448,7 +3516,7 @@ static const struct netdev_class dpdk_vhost_client_class = NULL, netdev_dpdk_vhost_client_reconfigure, netdev_dpdk_vhost_rxq_recv, -NULL); +netdev_dpdk_vhost_txq_flush); void netdev_dpdk_register(void) -- 2.4.11 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH 6/6] netdev-dpdk: Enable intermediate queue for vHost User port.
This commit refactors __netdev_dpdk_vhost_send() and enables an intermediate queue in which packets are buffered until the threshold 'INTERIM_QUEUE_BURST_THRESHOLD' (32) is hit, at which point they are transmitted. This commit improves the throughput, as reported below, in a simple physical-to-virtual test case with a higher number of flows at 10G line rate. Num Flow Master Commit = = 10 5945899 7833914 32 3872211 6530133 50 3283713 6618711 100 3132540 5857226 500 2964499 5273006 1000 2931952 5178038 Latency stats: MASTER --- Pkt size min(ns) avg(ns) max(ns) 512 10,011 12,100 281,915 1024 7,870 9,313 193,116 1280 7,862 9,036 194,439 1518 8,215 9,417 204,782 MASTER + COMMIT --- Pkt size min(ns) avg(ns) max(ns) 512 10,492 13,655 281,538 1024 8,407 9,784 205,095 1280 8,399 9,750 194,888 1518 8,367 9,722 196,973 Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2017-May/332271.html [By Eelco Chaudron] Signed-off-by: Bhanuprakash Bodireddy Signed-off-by: Antonio Fischetti Co-authored-by: Antonio Fischetti Acked-by: Eelco Chaudron --- lib/netdev-dpdk.c | 38 +++--- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 47343e8..69cc5ff 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -1794,16 +1794,21 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int qid, struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts; unsigned int total_pkts = cnt; unsigned int dropped = 0; -int i, retries = 0; +int i; qid = dev->tx_q[qid % netdev->n_txq].map; +struct dpdk_tx_queue *txq = >tx_q[qid]; if (OVS_UNLIKELY(!is_vhost_running(dev) || qid < 0 || !(dev->flags & NETDEV_UP))) { rte_spinlock_lock(>stats_lock); dev->stats.tx_dropped+= cnt; rte_spinlock_unlock(>stats_lock); -goto out; + +for (i = 0; i < total_pkts; i++) { +dp_packet_delete(pkts[i]); +} +return; } rte_spinlock_lock(>tx_q[qid].tx_lock); @@ -1813,34 +1818,21 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int qid, cnt = netdev_dpdk_qos_run(dev, cur_pkts, cnt); dropped = total_pkts - 
cnt; -do { -int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ; -unsigned int tx_pkts; - -tx_pkts = rte_vhost_enqueue_burst(netdev_dpdk_get_vid(dev), - vhost_qid, cur_pkts, cnt); -if (OVS_LIKELY(tx_pkts)) { -/* Packets have been sent.*/ -cnt -= tx_pkts; -/* Prepare for possible retry.*/ -cur_pkts = _pkts[tx_pkts]; -} else { -/* No packets sent - do not retry.*/ -break; +int idx = 0; +while (idx < cnt) { +txq->vhost_burst_pkts[txq->vhost_pkt_cnt++] = pkts[idx++]; + +if (txq->vhost_pkt_cnt >= INTERIM_QUEUE_BURST_THRESHOLD) { +dropped += netdev_dpdk_vhost_tx_burst(dev, qid); } -} while (cnt && (retries++ <= VHOST_ENQ_RETRY_NUM)); +} rte_spinlock_unlock(>tx_q[qid].tx_lock); rte_spinlock_lock(>stats_lock); netdev_dpdk_vhost_update_tx_counters(>stats, pkts, total_pkts, - cnt + dropped); + dropped); rte_spinlock_unlock(>stats_lock); - -out: -for (i = 0; i < total_pkts - dropped; i++) { -dp_packet_delete(pkts[i]); -} } /* Tx function. Transmit packets indefinitely */ -- 2.4.11 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH 4/6] dpif-netdev: Flush the packets in intermediate queue.
Under low-rate traffic conditions, there can be two issues. (1) Packets can potentially get stuck in the intermediate queue. (2) Latency of the packets can increase significantly due to buffering in the intermediate queue. This commit handles issue (1) by flushing the tx port queues from the PMD processing loop. This commit also addresses issue (2) by flushing the tx queues after every rxq port is processed. This reduces the latency without impacting the forwarding throughput. MASTER Pkt size min(ns) avg(ns) max(ns) 512 4,631 5,022 309,914 1024 5,545 5,749 104,294 1280 5,978 6,159 45,306 1518 6,419 6,774 946,850 MASTER + COMMIT - Pkt size min(ns) avg(ns) max(ns) 512 4,711 5,064 182,477 1024 5,601 5,888 701,654 1280 6,018 6,491 533,037 1518 6,467 6,734 312,471 PMDs can be torn down and spawned at runtime, so the rxq and txq mapping of the PMD threads can change. In a few cases packets can get stuck in the queue due to reconfiguration, and this commit helps flush the queues. Suggested-by: Eelco Chaudron Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2017-April/331039.html Signed-off-by: Bhanuprakash Bodireddy Signed-off-by: Antonio Fischetti Co-authored-by: Antonio Fischetti Signed-off-by: Markus Magnusson Co-authored-by: Markus Magnusson Acked-by: Eelco Chaudron --- lib/dpif-netdev.c | 5 + 1 file changed, 5 insertions(+) diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index d59208e..dfd88aa 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -3761,6 +3761,8 @@ reload: for (i = 0; i < poll_cnt; i++) { dp_netdev_process_rxq_port(pmd, poll_list[i].rx, poll_list[i].port_no); + +dp_netdev_flush_txq_ports(pmd); } if (lc++ > 1024) { @@ -3781,6 +3783,9 @@ reload: } } +/* Flush the queues as part of reconfiguration logic. 
*/ +dp_netdev_flush_txq_ports(pmd); + poll_cnt = pmd_load_queues_and_ports(pmd, _list); exiting = latch_is_set(>exit_latch); /* Signal here to make sure the pmd finishes -- 2.4.11 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH 3/6] netdev-dpdk: Add intermediate queue support.
This commit introduces netdev_dpdk_eth_tx_queue() function that implements intermediate queue and packet buffering. The packets get buffered till the threshold 'INTERIM_QUEUE_BURST_THRESHOLD[32] is reached and eventually gets transmitted. To handle the case(eg: ping) where packets are sent at low rate and can potentially get stuck in the queue, flush logic is implemented that gets invoked from dp_netdev_flush_txq_ports() as part of PMD packet processing loop. Signed-off-by: Bhanuprakash BodireddySigned-off-by: Antonio Fischetti Co-authored-by: Antonio Fischetti Signed-off-by: Markus Magnusson Co-authored-by: Markus Magnusson Acked-by: Eelco Chaudron --- lib/dpif-netdev.c | 44 +++- lib/netdev-dpdk.c | 35 ++- 2 files changed, 77 insertions(+), 2 deletions(-) diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 2b65dc7..d59208e 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -332,6 +332,7 @@ enum pmd_cycles_counter_type { }; #define XPS_TIMEOUT_MS 500LL +#define LAST_USED_QID_NONE -1 /* Contained by struct dp_netdev_port's 'rxqs' member. */ struct dp_netdev_rxq { @@ -492,7 +493,13 @@ struct rxq_poll { struct tx_port { struct dp_netdev_port *port; int qid; -long long last_used; +int last_used_qid;/* Last queue id where packets got + enqueued. */ +long long last_used; /* In case XPS is enabled, it contains the + * timestamp of the last time the port was + * used by the thread to send data. After + * XPS_TIMEOUT_MS elapses the qid will be + * marked as -1. */ struct hmap_node node; }; @@ -3081,6 +3088,25 @@ cycles_count_end(struct dp_netdev_pmd_thread *pmd, } static void +dp_netdev_flush_txq_ports(struct dp_netdev_pmd_thread *pmd) +{ +struct tx_port *cached_tx_port; +int tx_qid; + +HMAP_FOR_EACH (cached_tx_port, node, >send_port_cache) { +tx_qid = cached_tx_port->last_used_qid; + +if (tx_qid != LAST_USED_QID_NONE) { +netdev_txq_flush(cached_tx_port->port->netdev, tx_qid, + cached_tx_port->port->dynamic_txqs); + +/* Queue flushed and mark it empty. 
*/ +cached_tx_port->last_used_qid = LAST_USED_QID_NONE; +} +} +} + +static void dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd, struct netdev_rxq *rx, odp_port_t port_no) @@ -4356,6 +4382,7 @@ dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd, tx->port = port; tx->qid = -1; +tx->last_used_qid = LAST_USED_QID_NONE; hmap_insert(>tx_ports, >node, hash_port_no(tx->port->port_no)); pmd->need_reload = true; @@ -4926,6 +4953,14 @@ dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd, dpif_netdev_xps_revalidate_pmd(pmd, now, false); +/* The tx queue can change in XPS case, make sure packets in previous + * queue is flushed properly. */ +if (tx->last_used_qid != LAST_USED_QID_NONE && + tx->qid != tx->last_used_qid) { +netdev_txq_flush(port->netdev, tx->last_used_qid, port->dynamic_txqs); +tx->last_used_qid = LAST_USED_QID_NONE; +} + VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.", pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev)); return min_qid; @@ -5021,6 +5056,13 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_, tx_qid = pmd->static_tx_qid; } +/* In case these packets gets buffered into an intermediate + * queue and XPS is enabled the flush function could find a + * different tx qid assigned to its thread. We keep track + * of the qid we're now using, that will trigger the flush + * function and will select the right queue to flush. 
*/ +p->last_used_qid = tx_qid; + netdev_send(p->port->netdev, tx_qid, packets_, may_steal, dynamic_txqs); return; diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 1e83116..50a9a2c 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -1434,6 +1434,7 @@ static inline int netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid, struct rte_mbuf **pkts, int cnt) { +struct dpdk_tx_queue *txq = >tx_q[qid]; uint32_t nb_tx = 0; while (nb_tx != cnt) { @@ -1457,6 +1458,7 @@ netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid, } } +txq->dpdk_pkt_cnt = 0; return cnt - nb_tx; } @@ -1841,6 +1843,37 @@
[ovs-dev] [PATCH 2/6] netdev-dpdk: Add netdev_dpdk_txq_flush function.
This commit adds netdev_dpdk_txq_flush() function. If there are any packets waiting in the queue, they are transmitted instantly using the rte_eth_tx_burst function. In XPS enabled case, lock is taken on the tx queue before flushing the queue. Signed-off-by: Bhanuprakash BodireddySigned-off-by: Antonio Fischetti Co-authored-by: Antonio Fischetti Signed-off-by: Markus Magnusson Co-authored-by: Markus Magnusson Acked-by: Eelco Chaudron --- lib/netdev-dpdk.c | 31 +-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index cc84539..1e83116 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -293,6 +293,11 @@ struct dpdk_mp { struct ovs_list list_node OVS_GUARDED_BY(dpdk_mp_mutex); }; +/* Queue 'INTERIM_QUEUE_BURST_THRESHOLD' packets before transmitting. + * Defaults to 'NETDEV_MAX_BURST'(32) packets. + */ +#define INTERIM_QUEUE_BURST_THRESHOLD NETDEV_MAX_BURST + /* There should be one 'struct dpdk_tx_queue' created for * each cpu core. */ struct dpdk_tx_queue { @@ -302,6 +307,12 @@ struct dpdk_tx_queue { * pmd threads (see 'concurrent_txq'). */ int map; /* Mapping of configured vhost-user queues * to enabled by guest. */ +int dpdk_pkt_cnt; /* Number of buffered packets waiting to + be sent on DPDK tx queue. */ +struct rte_mbuf *dpdk_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD]; + /* Intermediate queue where packets can +* be buffered to amortize the cost of MMIO +* writes. */ }; /* dpdk has no way to remove dpdk ring ethernet devices @@ -1897,9 +1908,25 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid, * few packets (< INTERIM_QUEUE_BURST_THRESHOLD) buffered in the queue. 
*/ static int -netdev_dpdk_txq_flush(struct netdev *netdev OVS_UNUSED, - int qid OVS_UNUSED, bool concurrent_txq OVS_UNUSED) +netdev_dpdk_txq_flush(struct netdev *netdev, + int qid, bool concurrent_txq) { +struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); +struct dpdk_tx_queue *txq = >tx_q[qid]; + +if (OVS_LIKELY(txq->dpdk_pkt_cnt)) { +if (OVS_UNLIKELY(concurrent_txq)) { +qid = qid % dev->up.n_txq; +rte_spinlock_lock(>tx_q[qid].tx_lock); +} + +netdev_dpdk_eth_tx_burst(dev, qid, txq->dpdk_burst_pkts, + txq->dpdk_pkt_cnt); + +if (OVS_UNLIKELY(concurrent_txq)) { +rte_spinlock_unlock(>tx_q[qid].tx_lock); +} +} return 0; } -- 2.4.11 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [PATCH 1/6] netdev: Add netdev_txq_flush function.
Add netdev_txq_flush(), that flush packets on a queue. This is needed to transmit packets on the intermediate queue. Signed-off-by: Bhanuprakash BodireddySigned-off-by: Antonio Fischetti Co-authored-by: Antonio Fischetti Signed-off-by: Markus Magnusson Co-authored-by: Markus Magnusson Acked-by: Eelco Chaudron --- lib/netdev-bsd.c | 1 + lib/netdev-dpdk.c | 26 +- lib/netdev-dummy.c| 1 + lib/netdev-linux.c| 1 + lib/netdev-provider.h | 8 lib/netdev-vport.c| 2 +- lib/netdev.c | 9 + lib/netdev.h | 1 + 8 files changed, 43 insertions(+), 6 deletions(-) diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c index f863a18..cb0edd6 100644 --- a/lib/netdev-bsd.c +++ b/lib/netdev-bsd.c @@ -1548,6 +1548,7 @@ netdev_bsd_update_flags(struct netdev *netdev_, enum netdev_flags off, netdev_bsd_rxq_recv, \ netdev_bsd_rxq_wait, \ netdev_bsd_rxq_drain,\ +NULL,\ \ NO_OFFLOAD_API \ } diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index bba4de3..cc84539 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -1892,6 +1892,17 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid, } } +/* Flush tx queues + * This is done periodically to empty the intermediate queue in case of + * few packets (< INTERIM_QUEUE_BURST_THRESHOLD) buffered in the queue. 
+ */ +static int +netdev_dpdk_txq_flush(struct netdev *netdev OVS_UNUSED, + int qid OVS_UNUSED, bool concurrent_txq OVS_UNUSED) +{ +return 0; +} + static int netdev_dpdk_eth_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch, bool may_steal, @@ -3241,7 +3252,7 @@ unlock: SET_CONFIG, SET_TX_MULTIQ, SEND,\ GET_CARRIER, GET_STATS, \ GET_FEATURES, GET_STATUS, \ - RECONFIGURE, RXQ_RECV) \ + RECONFIGURE, RXQ_RECV, TXQ_FLUSH) \ { \ NAME, \ true, /* is_pmd */ \ @@ -3308,6 +3319,7 @@ unlock: RXQ_RECV, \ NULL, /* rx_wait */ \ NULL, /* rxq_drain */ \ +TXQ_FLUSH, /* txq_flush */ \ NO_OFFLOAD_API\ } @@ -3325,7 +3337,8 @@ static const struct netdev_class dpdk_class = netdev_dpdk_get_features, netdev_dpdk_get_status, netdev_dpdk_reconfigure, -netdev_dpdk_rxq_recv); +netdev_dpdk_rxq_recv, +netdev_dpdk_txq_flush); static const struct netdev_class dpdk_ring_class = NETDEV_DPDK_CLASS( @@ -3341,7 +3354,8 @@ static const struct netdev_class dpdk_ring_class = netdev_dpdk_get_features, netdev_dpdk_get_status, netdev_dpdk_reconfigure, -netdev_dpdk_rxq_recv); +netdev_dpdk_rxq_recv, +NULL); static const struct netdev_class dpdk_vhost_class = NETDEV_DPDK_CLASS( @@ -3357,7 +3371,8 @@ static const struct netdev_class dpdk_vhost_class = NULL, NULL, netdev_dpdk_vhost_reconfigure, -netdev_dpdk_vhost_rxq_recv); +netdev_dpdk_vhost_rxq_recv, +NULL); static const struct netdev_class dpdk_vhost_client_class = NETDEV_DPDK_CLASS( "dpdkvhostuserclient", @@ -3372,7 +3387,8 @@ static const struct netdev_class dpdk_vhost_client_class = NULL, NULL, netdev_dpdk_vhost_client_reconfigure, -netdev_dpdk_vhost_rxq_recv); +netdev_dpdk_vhost_rxq_recv, +NULL); void netdev_dpdk_register(void) diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c index d189a86..216c98e 100644 --- a/lib/netdev-dummy.c +++ b/lib/netdev-dummy.c @@ -1414,6 +1414,7 @@ netdev_dummy_update_flags(struct netdev *netdev_, netdev_dummy_rxq_recv, \ netdev_dummy_rxq_wait, \ netdev_dummy_rxq_drain, \ +NULL, \ \ NO_OFFLOAD_API \ } diff 
--git a/lib/netdev-linux.c b/lib/netdev-linux.c index f5dc30f..a2775da 100644 ---
[ovs-dev] [PATCH 0/6 V2] netdev-dpdk: Use intermediate queue during packet transmission.
After packet classification, packets are queued into batches depending on the matching netdev flow. Thereafter each batch is processed to execute the related actions. This becomes particularly inefficient if there are few packets in each batch, as rte_eth_tx_burst() incurs expensive MMIO writes. This patch series implements an intermediate queue for DPDK and vHost User ports. Packets are queued and sent as a burst when the packet count reaches the threshold. Drain logic is also implemented to handle cases where packets can get stuck in the tx queues under low-rate traffic conditions. Care has been taken to see that latency is well within the acceptable limits. Testing shows significant performance gains with this implementation. This patch series combines the two earlier patches posted below. DPDK patch: https://mail.openvswitch.org/pipermail/ovs-dev/2017-April/331039.html vHost User patch: https://mail.openvswitch.org/pipermail/ovs-dev/2017-May/332271.html This series also proposes to disable the retries on vHost User ports and make them configurable via ovsdb (controversial?). Performance Numbers with intermediate queue: DPDK ports === Throughput for P2P scenario, for two 82599ES 10G ports with 64 byte packets Number flows MASTER With PATCH ==== 10 10727283 13393844 32 7042253 11228799 50 7515491 9607791 100 5838699 9430730 500 5285066 7845807 1000 5226477 7135601 Latency test MASTER === Pkt size min(ns) avg(ns) max(ns) 512 4,631 5,022 309,914 1024 5,545 5,749 104,294 1280 5,978 6,159 45,306 1518 6,419 6,774 946,850 PATCH = Pkt size min(ns) avg(ns) max(ns) 512 4,711 5,064 182,477 1024 5,601 5,888 701,654 1280 6,018 6,491 533,037 1518 6,467 6,734 312,471 vHost User ports == Throughput for PV scenario, with 64 byte packets Number flows MASTER With PATCH = = 10 5945899 7833914 32 3872211 6530133 50 3283713 6618711 100 3132540 5857226 500 2964499 5273006 1000 2931952 5178038 Latency test. 
MASTER === Pkt size min(ns) avg(ns) max(ns) 512 10,011 12,100 281,915 1024 7,870 9,313 193,116 1280 7,862 9,036 194,439 1518 8,215 9,417 204,782 PATCH === Pkt size min(ns) avg(ns) max(ns) 512 10,492 13,655 281,538 1024 8,407 9,784 205,095 1280 8,399 9,750 194,888 1518 8,367 9,722 196,973 Performance numbers reported by Eelco Chaudron at https://mail.openvswitch.org/pipermail/ovs-dev/2017-June/333949.html https://mail.openvswitch.org/pipermail/ovs-dev/2017-May/332271.html https://mail.openvswitch.org/pipermail/ovs-dev/2017-April/331039.html --- v1->v2 * Rebased on master due to HW offload changes. * Introduced union for packet count and buffers and changed the variable names appropriately. * No functional changes. Bhanuprakash Bodireddy (6): netdev: Add netdev_txq_flush function. netdev-dpdk: Add netdev_dpdk_txq_flush function. netdev-dpdk: Add intermediate queue support. dpif-netdev: Flush the packets in intermediate queue. netdev-dpdk: Add netdev_dpdk_vhost_txq_flush function. netdev-dpdk: Enable intermediate queue for vHost User port. lib/dpif-netdev.c | 49 - lib/netdev-bsd.c | 1 + lib/netdev-dpdk.c | 194 ++ lib/netdev-dummy.c| 1 + lib/netdev-linux.c| 1 + lib/netdev-provider.h | 8 +++ lib/netdev-vport.c| 2 +- lib/netdev.c | 9 +++ lib/netdev.h | 1 + 9 files changed, 235 insertions(+), 31 deletions(-) -- 2.4.11 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [RFC PATCH v3 17/18] keepalive: Display extended Keepalive status.
This commit adds support to display the extended keepalive status. The status can be displayed as follows. $ ovs-appctl keepalive/pmd-xstats-show keepAlive Status : Enabled keepAlive Interval: 1000 ms pmd64 PMD core_id : 0 PMD thread id : 1269 [ACTIVE] PMD heartbeats : enabled PMD state : ALIVE Last seen timestamp : 9123706507798853 PMD failure count : 0 pmd65 PMD core_id : 1 PMD thread id : 1270 [ACTIVE] PMD heartbeats : enabled PMD state : ALIVE Last seen timestamp : 9123706507801627 PMD failure count : 0 pmd64 PMD core_id : 2 PMD thread id : 1271 [ACTIVE] PMD heartbeats : enabled PMD state : ALIVE Last seen timestamp : 9125112827794550 PMD failure count : 0 PMD health check: enabled Packet Stats Port dpdk0, Queue: 1, Link status: up rx_packets : 1801284454 tx_packets : 0 Cycle Stats Polling cycles : 35426111637 Processing cycles : 10123697085 For PMD on core 2, on a heartbeat failure, health checks are enabled and additional stats(pkt stats, cpu cycles) are displayed as above. Signed-off-by: Bhanuprakash Bodireddy--- lib/keepalive.c | 112 1 file changed, 112 insertions(+) diff --git a/lib/keepalive.c b/lib/keepalive.c index d475ace..38bff91 100644 --- a/lib/keepalive.c +++ b/lib/keepalive.c @@ -730,6 +730,116 @@ ka_unixctl_status(struct unixctl_conn *conn, int argc OVS_UNUSED, ds_destroy(); } +static void +ka_unixctl_pmd_xstats_show(struct unixctl_conn *conn, int argc OVS_UNUSED, + const char *argv[] OVS_UNUSED, void *ka_info_) +{ +struct ds ds = DS_EMPTY_INITIALIZER; +ds_put_format(, + "\n\t\tKeepalive xstats\n\n"); + +ds_put_format(, "keepalive status : %s\n", + ka_is_enabled() ? 
"Enabled" : "Disabled"); + +if (!ka_is_enabled()) { +goto out; +} + +ds_put_format(, "keepalive interval: %"PRIu32" ms\n", + get_ka_interval()); + +struct keepalive_info *ka_info = (struct keepalive_info *)ka_info_; +if (!ka_info) { +goto out; +} + +ds_put_format(, "PMD threads : %"PRIu32" \n", ka_info->pmd_cnt); + +struct ka_process_info *pinfo, *pinfo_next; +ovs_mutex_lock(_info->proclist_mutex); +HMAP_FOR_EACH_SAFE (pinfo, pinfo_next, node, _info->process_list) { +char *state = NULL; +if (pinfo->core_state == KA_STATE_UNUSED || + pinfo->core_state == KA_STATE_SLEEP) +continue; + +switch (pinfo->core_state) { +case KA_STATE_ALIVE: +state = "ALIVE"; +break; +case KA_STATE_MISSING: +state = "MISSING"; +break; +case KA_STATE_DEAD: +state = "DEAD"; +break; +case KA_STATE_GONE: +state = "GONE"; +break; +case KA_STATE_DOZING: +state = "DOZING"; +break; +case KA_STATE_SLEEP: +state = "SLEEP"; +break; +case KA_STATE_CHECK: +state = "HEALTH_CHECK_RUNNING"; +break; +case KA_STATE_UNUSED: +break; +} + +ds_put_format(, "\n"); +ds_put_format(, " %s\n", pinfo->name); +ds_put_format(, "\tPMD core_id : %d\n", pinfo->core_id); +ds_put_format(, "\tPMD thread-id : %d [%s]\n", + pinfo->tid, process_is_active(pinfo->tid) ? + "ACTIVE" : "INACTIVE"); +ds_put_format(, "\tPMD heartbeats : %s\n", + ka_is_enabled() ? "enabled" : "disabled"); +ds_put_format(, "\tPMD state : %s\n", state); +ds_put_format(, "\tLast seen timestamp : %"PRIu64"\n", + pinfo->core_last_seen_times); + +ds_put_format(, "\tPMD failure count : %d\n", + pinfo->failures); + +int health_check = pinfo->healthcheck; +if (health_check) { +ds_put_format(, "\tPMD health check: %s\n", + health_check ? 
"enabled" : "disabled"); +ds_put_format(, "\tPacket Stats\n"); + +int cid = pinfo->core_id; +int n = ka_info->ext_stats[cid].num_poll_ports; +for (int idx = 0; idx < n; idx++) { +ds_put_format(, "\t\tPort %s, Link status: %s\n", +ka_info->ext_stats[cid].port_stats[idx].port, +ka_info->ext_stats[cid].port_stats[idx].link_state); +ds_put_format(, "\t\trx_packets : %"PRIu64"\n", + ka_info->ext_stats[cid]. \ +
[ovs-dev] [RFC PATCH v3 16/18] netdev-dpdk: Enable PMD health checks on heartbeat failure.
The keepalive thread sends heartbeats to PMD thread and when PMD fails to respond to successive heartbeats the PMD is potentially stalled. The PMD state transition is as below: ALIVE -> MISSING -> DEAD -> GONE This commit enables PMD healthchecks when PMD doesn't respond to heartbeats. This is needed to handle false negatives. With this commit the new state transition is as below: ALIVE -> MISSING -> DEAD -> CHECK -> GONE PMD Health checking state is introduced and will immediately kickin when the PMD gets in to DEAD state. As part of this below are considered. - Link status of the ports polled by PMD thread. - Statistics of the ports polled by PMD thread. - PMD polling and processing cycles. Signed-off-by: Bhanuprakash Bodireddy--- lib/keepalive.c | 16 ++ lib/keepalive.h | 2 ++ lib/netdev-dpdk.c | 62 +-- 3 files changed, 78 insertions(+), 2 deletions(-) diff --git a/lib/keepalive.c b/lib/keepalive.c index 3690b70..d475ace 100644 --- a/lib/keepalive.c +++ b/lib/keepalive.c @@ -108,6 +108,7 @@ ka_register_thread(int tid, bool thread_is_pmd) pinfo->heartbeats = true; pinfo->core_id = core_id; pinfo->healthcheck = PMD_HC_DISABLE; +pinfo->failures = 0; char *pname = get_process_name(tid); if (pname) { @@ -269,6 +270,21 @@ ka_set_pmd_state_ts(unsigned core_id, enum keepalive_state state, } void +ka_inc_pmd_failures(unsigned core_id) +{ +struct ka_process_info *pinfo; +int tid = ka_get_pmd_tid(core_id); +ovs_mutex_lock(_info->proclist_mutex); +HMAP_FOR_EACH_WITH_HASH (pinfo, node, hash_int(tid, 0), + _info->process_list) { +if (pinfo->core_id == core_id) { +pinfo->failures++; +} +} +ovs_mutex_unlock(_info->proclist_mutex); +} + +void ka_load_process_list(struct hmap **process_list) { if (ka_is_enabled()) { diff --git a/lib/keepalive.h b/lib/keepalive.h index 1bd639b..4f30f36 100644 --- a/lib/keepalive.h +++ b/lib/keepalive.h @@ -64,6 +64,7 @@ struct ka_process_info { enum pmdhealth_check healthcheck; enum keepalive_state core_state; uint64_t core_last_seen_times; +int 
failures; struct hmap_node node; }; @@ -127,6 +128,7 @@ void ka_disable_pmd_health_check(unsigned); bool ka_is_pmdhealth_check_enabled(unsigned); enum pmdhealth_check ka_get_pmd_health_check_state(unsigned); void ka_set_pmd_health_check_state(unsigned, enum pmdhealth_check); +void ka_inc_pmd_failures(unsigned); void ka_store_pmd_id(unsigned core); uint32_t get_ka_interval(void); diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index f33eeff..f71b017 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -602,6 +602,52 @@ dpdk_failcore_cb(void *ptr_data OVS_UNUSED, const int core_id) } } +static void +dpdk_ka_handle_failure(enum keepalive_state fail_state, const int core_id, + const enum rte_keepalive_state core_state, + uint64_t last_alive) +{ +if (fail_state == KA_STATE_DEAD) { +/* If process is in DEFUNC/UNINTERRUPTIBLE/TRACED state it is inactive + * and no additional health checks are needed. */ +uint32_t tid = ka_get_pmd_tid(core_id); +if (process_is_active(tid)) { + /* Enable PMD health check only when PMD is in 'RUNNING' state and +* still doesn't respond to heartbeats. Health checks are needed to +* analyze other stats as we are in penultimate state of declaring +* PMD as failed. 
*/ +ka_enable_pmd_health_check(core_id); +} +ka_set_pmd_state_ts(core_id, KA_STATE_DEAD, last_alive); +} + +if (fail_state == KA_STATE_GONE) { +int pmd_hc_state = ka_get_pmd_health_check_state(core_id); + +switch (pmd_hc_state) { +case PMD_HC_ENABLE: +break; +case PMD_HC_DISABLE: +VLOG_DBG_RL(, "PMD thread [%d] died, health check disabled", +core_id); +break; +case PMD_HC_PROGRESS: +ka_set_pmd_state_ts(core_id, KA_STATE_CHECK, last_alive); +break; + +case PMD_HC_COMPLETE: +ka_inc_pmd_failures(core_id); +ka_set_pmd_state_ts(core_id, core_state, last_alive); +ka_disable_pmd_health_check(core_id); +break; + +default: +VLOG_DBG_RL(, "Unknown health check state %d", pmd_hc_state); +OVS_NOT_REACHED(); +} +} +} + /* * This function shall be invoked periodically to write the core status and * last seen timestamp of the cores in to keepalive info structure. @@ -614,11 +660,23 @@ dpdk_ka_update_core_state(void *ptr_data OVS_UNUSED, const int core_id, case RTE_KA_STATE_ALIVE: case RTE_KA_STATE_MISSING: ka_set_pmd_state_ts(core_id, KA_STATE_ALIVE, last_alive); + +/*
[ovs-dev] [RFC PATCH v3 14/18] keepalive: Check the packet statistics as part of PMD health checks.
This commit adds the support to check the packet statistics on the port polled by PMD thread. If the packets aren't processed due to PMD thread stall/deadlock the statistics wont update and this can be used by monitoring framework to confirm PMD failure. This mechanism has limitation with MQ enabled. In some cases queues of the DPDK port can be polled by different PMD threads. Even if one PMD thread stalls the port statistics will be incremented due to an other queue processed by different PMD. The function can return active state considering the packets processed in this case. Signed-off-by: Bhanuprakash Bodireddy--- lib/dpif-netdev.c | 25 +++--- lib/keepalive.c | 97 +++ lib/keepalive.h | 5 +++ 3 files changed, 122 insertions(+), 5 deletions(-) diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 4d8d3e7..ad48ee5 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -975,8 +975,9 @@ sorted_poll_thread_list(struct dp_netdev *dp, static void pmd_health_check(struct dp_netdev_pmd_thread *pmd) { -int port_link_status = 0; struct rxq_poll *poll; +int port_link_status = 0; +int port_stats = 0; struct svec pmd_poll_list; svec_init(_poll_list); @@ -991,22 +992,36 @@ pmd_health_check(struct dp_netdev_pmd_thread *pmd) int i = 0; SVEC_FOR_EACH (i, port_name, _poll_list) { struct netdev *dev = netdev_from_name(port_name); +VLOG_DBG("Keepalive: Checking port %s", port_name); if (dev) { char *link_state = netdev_get_carrier(dev) ? 
"up" : "down"; ka_info_update_port_status(port_name, 0, link_state, pmd->core_id, i); +if (!strcmp(link_state, "up")) { +ka_info_update_port_statistics(dev, pmd->core_id, i); +} netdev_close(dev); } } svec_destroy(_poll_list); -port_link_status = ka_get_polled_ports_status(pmd->core_id); - int pmd_hc_state = ka_get_pmd_health_check_state(pmd->core_id); -if (PMD_HC_COMPLETE == pmd_hc_state) { -if (port_link_status == ACTIVE_RUN_STATE) { +switch (pmd_hc_state) { +case PMD_HC_ENABLE: +ka_set_pmd_health_check_state(pmd->core_id, PMD_HC_PROGRESS); +break; +case PMD_HC_PROGRESS: +ka_set_pmd_health_check_state(pmd->core_id, PMD_HC_COMPLETE); +break; +case PMD_HC_COMPLETE: +port_link_status = ka_get_polled_ports_status(pmd->core_id); +port_stats = ka_get_polled_ports_stats(pmd->core_id); + +if (port_link_status == ACTIVE_RUN_STATE && + port_stats == ACTIVE_RUN_STATE ) { ka_set_pmd_state_ts(pmd->core_id, KA_STATE_ALIVE, 0); } +break; } } diff --git a/lib/keepalive.c b/lib/keepalive.c index 9251849..4234912 100644 --- a/lib/keepalive.c +++ b/lib/keepalive.c @@ -24,6 +24,7 @@ #include "dpdk.h" #include "keepalive.h" #include "lib/vswitch-idl.h" +#include "netdev-dpdk.h" #include "openvswitch/dynamic-string.h" #include "openvswitch/vlog.h" #include "ovs-thread.h" @@ -31,6 +32,7 @@ #include "unixctl.h" VLOG_DEFINE_THIS_MODULE(keepalive); +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); static bool keepalive_enable = false;/* Keepalive disabled by default */ static bool ka_init_status = ka_init_failure; /* Keepalive initialization */ @@ -453,6 +455,31 @@ enum pmdhealth_status ka_get_polled_ports_status(unsigned core_id) } } +enum pmdhealth_status ka_get_polled_ports_stats(unsigned core_id) +{ +if (!ka_info) { +return FAILURE_STATE; +} + +int failed = 0; +int n_ports = ka_info->ext_stats[core_id].num_poll_ports; +for (int i = 0; i < n_ports; i++) { +int state; +state = + ka_info->ext_stats[core_id].port_stats[i].state[PORT_STATS_CHECK]; +if (state == 
FAILURE_STATE) { +failed = 1; +break; +} +} + +if (!failed) { +return ACTIVE_RUN_STATE; +} else { +return FAILURE_STATE; +} +} + void ka_info_update_port_status(const char *port, int qid OVS_UNUSED, char *link_state, int core_id, int idx) @@ -480,6 +507,76 @@ ka_info_update_port_status(const char *port, int qid OVS_UNUSED, state; } +void +ka_info_update_port_statistics(const struct netdev *netdev, + int core_id, int idx) +{ +int error; +int state = FAILURE_STATE; + +if (!ka_info) { +VLOG_ERR_RL(, "Keepalive disabled"); +return; +} +ka_info->ext_stats[core_id].num_poll_ports = idx; + +int pmd_hc_state = ka_get_pmd_health_check_state(core_id); +if (PMD_HC_ENABLE == pmd_hc_state) { +struct netdev_stats *stats; +stats =
[ovs-dev] [RFC PATCH v3 15/18] keepalive: Check the PMD cycle stats as part of PMD health checks.
This commit adds the support to check the PMD cycle stats. If the cycles aren't changing for a duration of time this can be flagged as possible PMD stall. Signed-off-by: Bhanuprakash Bodireddy--- lib/dpif-netdev.c | 16 +--- lib/dpif-netdev.h | 6 ++ lib/keepalive.c | 51 +++ lib/keepalive.h | 3 +++ 4 files changed, 69 insertions(+), 7 deletions(-) diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index ad48ee5..b1a9fc4 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -328,12 +328,6 @@ enum dp_stat_type { DP_N_STATS }; -enum pmd_cycles_counter_type { -PMD_CYCLES_POLLING, /* Cycles spent polling NICs. */ -PMD_CYCLES_PROCESSING, /* Cycles spent processing packets */ -PMD_N_CYCLES -}; - #define XPS_TIMEOUT_MS 500LL /* Contained by struct dp_netdev_port's 'rxqs' member. */ @@ -978,6 +972,8 @@ pmd_health_check(struct dp_netdev_pmd_thread *pmd) struct rxq_poll *poll; int port_link_status = 0; int port_stats = 0; +int pmd_polling = 0; +uint64_t cycles[PMD_N_CYCLES]; struct svec pmd_poll_list; svec_init(_poll_list); @@ -1005,6 +1001,11 @@ pmd_health_check(struct dp_netdev_pmd_thread *pmd) } svec_destroy(_poll_list); +for (int idx = 0; idx < ARRAY_SIZE(cycles); idx++) { +atomic_read_relaxed(>cycles.n[idx], [idx]); +} +pmd_polling = ka_info_update_pmd_cycles(pmd->core_id, cycles); + int pmd_hc_state = ka_get_pmd_health_check_state(pmd->core_id); switch (pmd_hc_state) { case PMD_HC_ENABLE: @@ -1018,7 +1019,8 @@ pmd_health_check(struct dp_netdev_pmd_thread *pmd) port_stats = ka_get_polled_ports_stats(pmd->core_id); if (port_link_status == ACTIVE_RUN_STATE && - port_stats == ACTIVE_RUN_STATE ) { + port_stats == ACTIVE_RUN_STATE && +pmd_polling == ACTIVE_RUN_STATE) { ka_set_pmd_state_ts(pmd->core_id, KA_STATE_ALIVE, 0); } break; diff --git a/lib/dpif-netdev.h b/lib/dpif-netdev.h index 6db6ed2..e7c2400 100644 --- a/lib/dpif-netdev.h +++ b/lib/dpif-netdev.h @@ -33,6 +33,12 @@ extern "C" { * headers to be aligned on a 4-byte boundary. 
*/ enum { DP_NETDEV_HEADROOM = 2 + VLAN_HEADER_LEN }; +enum pmd_cycles_counter_type { +PMD_CYCLES_POLLING, /* Cycles spent polling NICs. */ +PMD_CYCLES_PROCESSING, /* Cycles spent processing packets */ +PMD_N_CYCLES +}; + bool dpif_is_netdev(const struct dpif *); #define NR_QUEUE 1 diff --git a/lib/keepalive.c b/lib/keepalive.c index 4234912..3690b70 100644 --- a/lib/keepalive.c +++ b/lib/keepalive.c @@ -577,6 +577,57 @@ ka_info_update_port_statistics(const struct netdev *netdev, state; } +int +ka_info_update_pmd_cycles(int core_id, uint64_t cycles[PMD_N_CYCLES]) +{ +int pmd_state = ACTIVE_RUN_STATE; +if (!ka_info) { +return FAILURE_STATE; +} + +uint64_t total_cycles = 0; +for (int i = 0; i < PMD_N_CYCLES; i++) { +if (cycles[i] > 0) { +total_cycles += cycles[i]; +} +} + +if (!total_cycles) { +return -1; +} + +int pmd_hc_state = ka_get_pmd_health_check_state(core_id); +if (PMD_HC_ENABLE == pmd_hc_state) { +ka_info->ext_stats[core_id].cycles[PMD_CYCLES_POLLING] = + cycles[PMD_CYCLES_POLLING]; + +ka_info->ext_stats[core_id].cycles[PMD_CYCLES_PROCESSING] = + cycles[PMD_CYCLES_PROCESSING]; +} + +if (PMD_HC_PROGRESS == pmd_hc_state) { +uint64_t polling_cycles_cnt = 0, proc_cycles_cnt = 0; +uint64_t prev_poll_cycles = +ka_info->ext_stats[core_id].cycles[PMD_CYCLES_POLLING]; +uint64_t prev_proc_cycles = +ka_info->ext_stats[core_id].cycles[PMD_CYCLES_PROCESSING]; + +VLOG_DBG_RL(, "Keepalive: Going to check the PMD thresholds now."); + +polling_cycles_cnt = cycles[PMD_CYCLES_POLLING] - prev_poll_cycles; + +proc_cycles_cnt = cycles[PMD_CYCLES_PROCESSING] + - prev_proc_cycles; + +if (!polling_cycles_cnt && !proc_cycles_cnt) { +VLOG_DBG("PMD FAILURE!"); +pmd_state = FAILURE_STATE; +} +} + +return pmd_state; +} + static void ka_unixctl_pmd_health_show(struct unixctl_conn *conn, int argc OVS_UNUSED, const char *argv[] OVS_UNUSED, void *ka_info_) diff --git a/lib/keepalive.h b/lib/keepalive.h index a132d74..1bd639b 100644 --- a/lib/keepalive.h +++ b/lib/keepalive.h @@ -27,6 +27,7 
@@ #define KA_DP_MAXCORES 128 #endif /* DPDK_NETDEV */ +#include "dpif-netdev.h" #include "netdev.h" struct smap; @@ -76,6 +77,7 @@ struct poll_port_stats { struct pmd_extended_stats { char *health_status; struct poll_port_stats *port_stats; +uint64_t
[ovs-dev] [RFC PATCH v3 13/18] keepalive: Check the link status as part of PMD health checks.
This commit adds the initial support in to performing PMD health checks. The ports handled by the PMD threads are checked for the link status and the same is updated in to keepalive info structure. Signed-off-by: Bhanuprakash Bodireddy--- lib/dpif-netdev.c | 37 +++-- lib/keepalive.c | 52 lib/keepalive.h | 18 +- 3 files changed, 104 insertions(+), 3 deletions(-) diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 06d2e23..4d8d3e7 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -51,6 +51,7 @@ #include "keepalive.h" #include "latch.h" #include "netdev.h" +#include "netdev-provider.h" #include "netdev-vport.h" #include "netlink.h" #include "odp-execute.h" @@ -972,9 +973,41 @@ sorted_poll_thread_list(struct dp_netdev *dp, } static void -pmd_health_check(struct dp_netdev_pmd_thread *pmd OVS_UNUSED) +pmd_health_check(struct dp_netdev_pmd_thread *pmd) { -/* Nothing */ +int port_link_status = 0; +struct rxq_poll *poll; + +struct svec pmd_poll_list; +svec_init(_poll_list); +HMAP_FOR_EACH (poll, node, >poll_list) { +svec_add(_poll_list, netdev_rxq_get_name(poll->rxq->rx)); +} + +/* With MQ enabled, remove the duplicates. */ +svec_sort_unique(_poll_list); + +const char *port_name; +int i = 0; +SVEC_FOR_EACH (i, port_name, _poll_list) { +struct netdev *dev = netdev_from_name(port_name); +if (dev) { +char *link_state = netdev_get_carrier(dev) ? 
"up" : "down"; +ka_info_update_port_status(port_name, 0, link_state, +pmd->core_id, i); +netdev_close(dev); +} +} +svec_destroy(_poll_list); + +port_link_status = ka_get_polled_ports_status(pmd->core_id); + +int pmd_hc_state = ka_get_pmd_health_check_state(pmd->core_id); +if (PMD_HC_COMPLETE == pmd_hc_state) { +if (port_link_status == ACTIVE_RUN_STATE) { +ka_set_pmd_state_ts(pmd->core_id, KA_STATE_ALIVE, 0); +} +} } static void diff --git a/lib/keepalive.c b/lib/keepalive.c index 997bebf..9251849 100644 --- a/lib/keepalive.c +++ b/lib/keepalive.c @@ -428,6 +428,58 @@ ka_stats_run(void) return ka_stats; } +enum pmdhealth_status ka_get_polled_ports_status(unsigned core_id) +{ +if (OVS_UNLIKELY(!ka_info)) { +return FAILURE_STATE; +} + +int failed = 0; +int n_ports = ka_info->ext_stats[core_id].num_poll_ports; +for (int i = 0; i < n_ports; i++) { +int state; +state = + ka_info->ext_stats[core_id].port_stats[i].state[PORT_LINK_CHECK]; +if (state == FAILURE_STATE) { +failed = 1; +break; +} +} + +if (!failed) { +return ACTIVE_RUN_STATE; +} else { +return FAILURE_STATE; +} +} + +void +ka_info_update_port_status(const char *port, int qid OVS_UNUSED, + char *link_state, int core_id, int idx) +{ +if (OVS_UNLIKELY(!ka_info)) { +return; +} + +ka_info->ext_stats[core_id].num_poll_ports = idx; + +if (OVS_LIKELY(core_id != NON_PMD_CORE_ID)) { +ka_info->ext_stats[core_id].port_stats[idx].port = port; +ka_info->ext_stats[core_id].port_stats[idx].link_state = + link_state; +} + +int state; +if (!strcmp(link_state, "down")) { +state = FAILURE_STATE; +} else { +state = ACTIVE_RUN_STATE; +} + +ka_info->ext_stats[core_id].port_stats[idx].state[PORT_LINK_CHECK] = + state; +} + static void ka_unixctl_pmd_health_show(struct unixctl_conn *conn, int argc OVS_UNUSED, const char *argv[] OVS_UNUSED, void *ka_info_) diff --git a/lib/keepalive.h b/lib/keepalive.h index 8877ca6..69697bd 100644 --- a/lib/keepalive.h +++ b/lib/keepalive.h @@ -47,6 +47,12 @@ enum pmdhealth_check { PMD_HC_COMPLETE }; 
+enum port_health_check { +PORT_LINK_CHECK = 0, +PORT_STATS_CHECK, +PORT_NUM_CHECKS +}; + struct ka_process_info { char name[16]; int tid; @@ -60,10 +66,12 @@ struct ka_process_info { struct poll_port_stats { const char *port; -int qid; +char *link_state; +int state[PORT_NUM_CHECKS]; }; struct pmd_extended_stats { +char *health_status; struct poll_port_stats *port_stats; int num_poll_ports; }; @@ -92,6 +100,11 @@ enum keepalive_status { ka_init_success }; +enum pmdhealth_status { +FAILURE_STATE = 0, +ACTIVE_RUN_STATE +}; + void ka_init(const struct smap *); void ka_destroy(void); void ka_set_pmd_state_ts(unsigned, enum keepalive_state, uint64_t); @@ -121,4 +134,7 @@ struct smap *ka_stats_run(void); void ka_load_process_list(struct hmap **); void
[ovs-dev] [RFC PATCH v3 12/18] dpif-netdev: Add additional datapath health checks.
This commit enables additional datapath health checks. The checks are enabled only on a PMD heartbeat failure. On missing three successive heartbeats additional health checks needs to be performed on respective PMD thread to confirm the failure. The datapath health is monitored periodically from keepalive thread. It should be noted that the PMD health checks are only performed on the PMD threads whose health check is enabled. Signed-off-by: Bhanuprakash Bodireddy--- lib/dpif-netdev.c | 30 + lib/keepalive.c | 81 +++ lib/keepalive.h | 16 +++ 3 files changed, 127 insertions(+) diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 93bda20..06d2e23 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -971,6 +971,35 @@ sorted_poll_thread_list(struct dp_netdev *dp, *n = k; } +static void +pmd_health_check(struct dp_netdev_pmd_thread *pmd OVS_UNUSED) +{ +/* Nothing */ +} + +static void +get_datapath_health(struct dp_netdev *dp) +{ +static struct hmap *process_list = NULL; +if (!process_list) { +ka_load_process_list(_list); +} + +struct ka_process_info *pinfo; +HMAP_FOR_EACH (pinfo, node, process_list) { +int core_id = pinfo->core_id; +struct dp_netdev_pmd_thread *pmd; + +/* Check only PMD threads whose health check is enabled. 
*/ +if (OVS_LIKELY(pinfo->healthcheck == PMD_HC_DISABLE)) { +continue; +} + +pmd = dp_netdev_get_pmd(dp, core_id); +pmd_health_check(pmd); +} +} + static void * ovs_keepalive(void *f_) { @@ -982,6 +1011,7 @@ ovs_keepalive(void *f_) int n_pmds = cmap_count(>poll_threads) - 1; if (n_pmds > 0) { dispatch_heartbeats(); +get_datapath_health(dp); get_ka_stats(); } diff --git a/lib/keepalive.c b/lib/keepalive.c index 6edb440..997bebf 100644 --- a/lib/keepalive.c +++ b/lib/keepalive.c @@ -105,6 +105,7 @@ ka_register_thread(int tid, bool thread_is_pmd) pinfo->tid = tid; pinfo->heartbeats = true; pinfo->core_id = core_id; +pinfo->healthcheck = PMD_HC_DISABLE; char *pname = get_process_name(tid); if (pname) { @@ -176,6 +177,78 @@ ka_mark_pmd_thread_sleep(void) } void +ka_enable_pmd_health_check(unsigned core_id) +{ +if (ka_is_enabled()) { +struct ka_process_info *pinfo; +int tid = ka_get_pmd_tid(core_id); +ovs_mutex_lock(_info->proclist_mutex); +HMAP_FOR_EACH_WITH_HASH (pinfo, node, hash_int(tid, 0), + _info->process_list) { +if ((pinfo->core_id == core_id) && (pinfo->tid == tid)) { +pinfo->healthcheck = PMD_HC_ENABLE; +} +} +ovs_mutex_unlock(_info->proclist_mutex); +} +} + +void +ka_disable_pmd_health_check(unsigned core_id) +{ +if (ka_is_enabled()) { +struct ka_process_info *pinfo; +int tid = ka_get_pmd_tid(core_id); +ovs_mutex_lock(_info->proclist_mutex); +HMAP_FOR_EACH_WITH_HASH (pinfo, node, hash_int(tid, 0), + _info->process_list) { +if ((pinfo->core_id == core_id) && (pinfo->tid == tid)) { +pinfo->healthcheck = PMD_HC_DISABLE; +} +} +ovs_mutex_unlock(_info->proclist_mutex); +} +} + +enum pmdhealth_check +ka_get_pmd_health_check_state(unsigned core_id) +OVS_REQUIRES(ka_info->proclist_mutex) +{ +int hc = PMD_HC_DISABLE; +if (ka_is_enabled()) { +struct ka_process_info *pinfo; +int tid = ka_get_pmd_tid(core_id); +ovs_mutex_lock(_info->proclist_mutex); +HMAP_FOR_EACH_WITH_HASH (pinfo, node, hash_int(tid, 0), + _info->process_list) { +if ((pinfo->core_id == core_id) && 
(pinfo->tid == tid)) { +hc = pinfo->healthcheck; +} +} +ovs_mutex_unlock(_info->proclist_mutex); +} + +return hc; +} + +void +ka_set_pmd_health_check_state(unsigned core_id, enum pmdhealth_check state) +{ +if (ka_is_enabled()) { +struct ka_process_info *pinfo; +int tid = ka_get_pmd_tid(core_id); +ovs_mutex_lock(_info->proclist_mutex); +HMAP_FOR_EACH_WITH_HASH (pinfo, node, hash_int(tid, 0), + _info->process_list) { +if ((pinfo->core_id == core_id) && (pinfo->tid == tid)) { +pinfo->healthcheck = state; +} +} +ovs_mutex_unlock(_info->proclist_mutex); +} +} + +void ka_set_pmd_state_ts(unsigned core_id, enum keepalive_state state, uint64_t last_alive) { @@ -193,6 +266,14 @@ ka_set_pmd_state_ts(unsigned core_id, enum keepalive_state state, ovs_mutex_unlock(_info->proclist_mutex); } +void +ka_load_process_list(struct hmap
[ovs-dev] [RFC PATCH v3 11/18] keepalive: Add support to query keepalive status.
This commit adds support to query if keepalive status is enabled/disabled. $ ovs-appctl keepalive/status keepAlive Status: Enabled Signed-off-by: Bhanuprakash Bodireddy--- lib/keepalive.c | 15 +++ 1 file changed, 15 insertions(+) diff --git a/lib/keepalive.c b/lib/keepalive.c index a4b8d01..6edb440 100644 --- a/lib/keepalive.c +++ b/lib/keepalive.c @@ -420,6 +420,19 @@ out: ds_destroy(); } +static void +ka_unixctl_status(struct unixctl_conn *conn, int argc OVS_UNUSED, + const char *argv[] OVS_UNUSED, void *aux OVS_UNUSED) +{ +struct ds ds = DS_EMPTY_INITIALIZER; + +ds_put_format(, "keepAlive Status: %s", + ka_is_enabled() ? "Enabled" : "Disabled"); + +unixctl_command_reply(conn, ds_cstr()); +ds_destroy(); +} + static int ka_init__(void) { @@ -466,6 +479,8 @@ ka_init(const struct smap *ovs_other_config) unixctl_command_register("keepalive/pmd-health-show", "", 0, 0, ka_unixctl_pmd_health_show, ka_info); +unixctl_command_register("keepalive/status", "", 0, 0, + ka_unixctl_status, NULL); ovsthread_once_done(_enable); } -- 2.4.11 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [RFC PATCH v3 10/18] keepalive: Add support to query keepalive statistics.
This commit adds support to query keepalive statistics. Datapath health status can be retrieved as follows: $ ovs-appctl keepalive/pmd-health-show Keepalive status keepalive status : Enabled keepalive interval: 1000 ms PMD threads : 8 PMDCORESTATE LAST SEEN TIMESTAMP pmd620 ALIVE 8632183482028293 pmd631 ALIVE 8632183482028425 pmd642 ALIVE 8632190191004294 pmd653 ALIVE 8632183482028525 pmd664 GONE8612183482028117 pmd675 ALIVE 8632190191004984 pmd686 ALIVE 8632190191005713 pmd697 ALIVE 8632190191006555 Signed-off-by: Bhanuprakash Bodireddy--- lib/keepalive.c | 78 + 1 file changed, 78 insertions(+) diff --git a/lib/keepalive.c b/lib/keepalive.c index f0b75f0..a4b8d01 100644 --- a/lib/keepalive.c +++ b/lib/keepalive.c @@ -24,9 +24,11 @@ #include "dpdk.h" #include "keepalive.h" #include "lib/vswitch-idl.h" +#include "openvswitch/dynamic-string.h" #include "openvswitch/vlog.h" #include "ovs-thread.h" #include "process.h" +#include "unixctl.h" VLOG_DEFINE_THIS_MODULE(keepalive); @@ -345,6 +347,79 @@ ka_stats_run(void) return ka_stats; } +static void +ka_unixctl_pmd_health_show(struct unixctl_conn *conn, int argc OVS_UNUSED, + const char *argv[] OVS_UNUSED, void *ka_info_) +{ +struct ds ds = DS_EMPTY_INITIALIZER; +ds_put_format(, + "\n\t\tKeepalive status\n\n"); + +ds_put_format(, "keepalive status : %s\n", + ka_is_enabled() ? 
"Enabled" : "Disabled"); + +if (!ka_is_enabled()) { +goto out; +} + +ds_put_format(, "keepalive interval: %"PRIu32" ms\n", + get_ka_interval()); + +struct keepalive_info *ka_info = (struct keepalive_info *)ka_info_; +if (OVS_UNLIKELY(!ka_info)) { +goto out; +} + +ds_put_format(, "PMD threads : %"PRIu32" \n", ka_info->pmd_cnt); +ds_put_format(, + "\n PMD\tCORE\tSTATE\tLAST SEEN TIMESTAMP\n"); + +struct ka_process_info *pinfo, *pinfo_next; + +ovs_mutex_lock(_info->proclist_mutex); +HMAP_FOR_EACH_SAFE (pinfo, pinfo_next, node, _info->process_list) { +char *state = NULL; +if (pinfo->core_state == KA_STATE_UNUSED || + pinfo->core_state == KA_STATE_SLEEP) +continue; + +switch (pinfo->core_state) { +case KA_STATE_ALIVE: +state = "ALIVE"; +break; +case KA_STATE_MISSING: +state = "MISSING"; +break; +case KA_STATE_DEAD: +state = "DEAD"; +break; +case KA_STATE_GONE: +state = "GONE"; +break; +case KA_STATE_DOZING: +state = "DOZING"; +break; +case KA_STATE_SLEEP: +state = "SLEEP"; +break; +case KA_STATE_CHECK: +state = "HEALTH_CHECK_RUNNING"; +break; +case KA_STATE_UNUSED: +break; +} +ds_put_format(, "%s\t%2d\t%s\t%"PRIu64"\n", + pinfo->name, pinfo->core_id, state, + pinfo->core_last_seen_times); +} +ovs_mutex_unlock(_info->proclist_mutex); + +ds_put_format(, "\n"); +out: +unixctl_command_reply(conn, ds_cstr()); +ds_destroy(); +} + static int ka_init__(void) { @@ -389,6 +464,9 @@ ka_init(const struct smap *ovs_other_config) VLOG_INFO("OvS Keepalive disabled."); } +unixctl_command_register("keepalive/pmd-health-show", "", 0, 0, + ka_unixctl_pmd_health_show, ka_info); + ovsthread_once_done(_enable); } } -- 2.4.11 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [RFC PATCH v3 09/18] bridge: Update keepalive status in OVSDB
This commit allows vswitchd thread to update the OVSDB with the status of all registered PMD threads. The status can be monitored using ovsdb-client and the sample output is below. $ ovsdb-client monitor Open_vSwitch Open_vSwitch keepalive rowaction keepalive 7b746190-ee71-4dcc-becf-f8cb9c7cb909 old { "PMD62"="ALIVE,0,9226457935188922" "PMD63"="ALIVE,1,9226457935189628" "PMD64"="ALIVE,2,9226457935189897" "PMD65"="ALIVE,3,9226457935190127"} new { "PMD62"="ALIVE,0,9226460230167364" "PMD63"="ALIVE,1,9226460230168100" "PMD64"="ALIVE,2,9226460230168905" "PMD65"="ALIVE,3,9226460230169632"} Signed-off-by: Bhanuprakash Bodireddy--- lib/keepalive.c | 15 +++ lib/keepalive.h | 1 + vswitchd/bridge.c | 26 ++ 3 files changed, 42 insertions(+) diff --git a/lib/keepalive.c b/lib/keepalive.c index b437bef..f0b75f0 100644 --- a/lib/keepalive.c +++ b/lib/keepalive.c @@ -330,6 +330,21 @@ get_ka_stats(void) ovs_mutex_unlock(); } +struct smap * +ka_stats_run(void) +{ +struct smap *ka_stats = NULL; + +ovs_mutex_lock(); +if (keepalive_stats) { +ka_stats = keepalive_stats; +keepalive_stats = NULL; +} +ovs_mutex_unlock(); + +return ka_stats; +} + static int ka_init__(void) { diff --git a/lib/keepalive.h b/lib/keepalive.h index bdec34f..356e761 100644 --- a/lib/keepalive.h +++ b/lib/keepalive.h @@ -102,6 +102,7 @@ int ka_get_pmd_tid(unsigned core); int ka_alloc_portstats(unsigned, int); void ka_destroy_portstats(void); void get_ka_stats(void); +struct smap *ka_stats_run(void); void dispatch_heartbeats(void); #endif /* keepalive.h */ diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index 3927b9f..4b6b528 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -286,6 +286,7 @@ static bool port_is_synthetic(const struct port *); static void reconfigure_system_stats(const struct ovsrec_open_vswitch *); static void run_system_stats(void); +static void run_keepalive_stats(void); static void bridge_configure_mirrors(struct bridge *); static struct mirror *mirror_create(struct bridge *, @@ 
-403,6 +404,7 @@ bridge_init(const char *remote) ovsdb_idl_omit_alert(idl, _open_vswitch_col_cur_cfg); ovsdb_idl_omit_alert(idl, _open_vswitch_col_statistics); +ovsdb_idl_omit_alert(idl, _open_vswitch_col_keepalive); ovsdb_idl_omit_alert(idl, _open_vswitch_col_datapath_types); ovsdb_idl_omit_alert(idl, _open_vswitch_col_iface_types); ovsdb_idl_omit(idl, _open_vswitch_col_external_ids); @@ -2690,6 +2692,29 @@ run_system_stats(void) } } +void +run_keepalive_stats(void) +{ +struct smap *ka_stats; +const struct ovsrec_open_vswitch *cfg = ovsrec_open_vswitch_first(idl); + +ka_stats = ka_stats_run(); +if (ka_stats && cfg) { +struct ovsdb_idl_txn *txn; +struct ovsdb_datum datum; + +txn = ovsdb_idl_txn_create(idl); +ovsdb_datum_from_smap(, ka_stats); +smap_destroy(ka_stats); +ovsdb_idl_txn_write(>header_, _open_vswitch_col_keepalive, +); +ovsdb_idl_txn_commit(txn); +ovsdb_idl_txn_destroy(txn); + +free(ka_stats); +} +} + static const char * ofp12_controller_role_to_str(enum ofp12_controller_role role) { @@ -3039,6 +3064,7 @@ bridge_run(void) run_stats_update(); run_status_update(); run_system_stats(); +run_keepalive_stats(); } void -- 2.4.11 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [RFC PATCH v3 08/18] keepalive: Retrieve PMD status periodically.
This commit implements APIs to retrieve the PMD thread status and return the status in the below format for each PMD thread. Format: PMDID="STATUS,core id,last_seen_timestamp" eg: PMD62="ALIVE,2,9220698256784207" PMD63="GONE,3,9220698256786231" The status is periodically retrieved by keepalive thread and stored in keepalive_stats struc which later shall be retrieved by vswitchd thread. In case of four PMD threads the status is as below: "PMD62"="ALIVE,0,9220698256784207" "PMD63"="ALIVE,1,9220698256784913" "PMD64"="ALIVE,2,9220698256785902" "PMD65"="ALIVE,3,9220698256786231" Signed-off-by: Bhanuprakash Bodireddy--- lib/dpif-netdev.c | 1 + lib/keepalive.c | 73 +++ lib/keepalive.h | 1 + 3 files changed, 75 insertions(+) diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index cf4d68c..93bda20 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -982,6 +982,7 @@ ovs_keepalive(void *f_) int n_pmds = cmap_count(>poll_threads) - 1; if (n_pmds > 0) { dispatch_heartbeats(); +get_ka_stats(); } ovsrcu_quiesce_start(); diff --git a/lib/keepalive.c b/lib/keepalive.c index 353f1d1..b437bef 100644 --- a/lib/keepalive.c +++ b/lib/keepalive.c @@ -25,6 +25,7 @@ #include "keepalive.h" #include "lib/vswitch-idl.h" #include "openvswitch/vlog.h" +#include "ovs-thread.h" #include "process.h" VLOG_DEFINE_THIS_MODULE(keepalive); @@ -34,6 +35,9 @@ static bool ka_init_status = ka_init_failure; /* Keepalive initialization */ static uint32_t keepalive_timer_interval; /* keepalive timer interval */ static struct keepalive_info *ka_info = NULL; +static struct ovs_mutex mutex = OVS_MUTEX_INITIALIZER; +static struct smap *keepalive_stats OVS_GUARDED_BY(mutex); + inline bool ka_is_enabled(void) { @@ -257,6 +261,75 @@ keepalive_info_create(void) return ka_info; } +static void +get_pmd_status(struct smap *ka_pmd_stats) +OVS_REQUIRES(ka_info->proclist_mutex) +{ +if (OVS_UNLIKELY(!ka_info)) { +return; +} + +struct ka_process_info *pinfo, *pinfo_next; +HMAP_FOR_EACH_SAFE (pinfo, pinfo_next, node, 
_info->process_list) { +int core_id = pinfo->core_id; +char *state = NULL; +if (pinfo->core_state == KA_STATE_UNUSED || + pinfo->core_state == KA_STATE_SLEEP ) { +continue; +} + +switch (pinfo->core_state) { +case KA_STATE_ALIVE: +state = "ALIVE"; +break; +case KA_STATE_MISSING: +state = "MISSING"; +break; +case KA_STATE_DEAD: +state = "DEAD"; +break; +case KA_STATE_GONE: +state = "GONE"; +break; +case KA_STATE_DOZING: +state = "DOZING"; +break; +case KA_STATE_SLEEP: +state = "SLEEP"; +break; +case KA_STATE_CHECK: +state = "HEALTH_CHECK_RUNNING"; +break; +case KA_STATE_UNUSED: +break; +} + +smap_add_format(ka_pmd_stats, pinfo->name, "%s,%d,%ld", +state, core_id, pinfo->core_last_seen_times); +} +} + +void +get_ka_stats(void) +{ +struct smap *ka_pmd_stats; +ka_pmd_stats = xmalloc(sizeof *ka_pmd_stats); +smap_init(ka_pmd_stats); + +ovs_mutex_lock(_info->proclist_mutex); +get_pmd_status(ka_pmd_stats); +ovs_mutex_unlock(_info->proclist_mutex); + +ovs_mutex_lock(); +if (keepalive_stats) { +smap_destroy(keepalive_stats); +free(keepalive_stats); +keepalive_stats = NULL; +} +keepalive_stats = ka_pmd_stats; +ovs_mutex_unlock(); +} + static int ka_init__(void) { diff --git a/lib/keepalive.h b/lib/keepalive.h index cfe02e5..bdec34f 100644 --- a/lib/keepalive.h +++ b/lib/keepalive.h @@ -101,6 +101,7 @@ int get_ka_init_status(void); int ka_get_pmd_tid(unsigned core); int ka_alloc_portstats(unsigned, int); void ka_destroy_portstats(void); +void get_ka_stats(void); void dispatch_heartbeats(void); #endif /* keepalive.h */ -- 2.4.11 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [RFC PATCH v3 06/18] dpif-netdev: Register packet processing cores to KA framework.
This commit registers the packet processing PMD cores to keepalive framework. Only PMDs that have rxqs mapped will be registered and actively monitored by KA framework. This commit spawns a keepalive thread that will dispatch heartbeats to PMD cores. The pmd threads respond to heartbeats by marking themselves alive. As long as PMD responds to heartbeats it is considered 'healthy'. Signed-off-by: Bhanuprakash Bodireddy--- lib/dpif-netdev.c | 100 + lib/keepalive.c | 130 +++--- lib/keepalive.h | 25 ++- 3 files changed, 236 insertions(+), 19 deletions(-) diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index ce141e8..4b7c835 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -72,6 +72,7 @@ #include "seq.h" #include "smap.h" #include "sset.h" +#include "svec.h" #include "timeval.h" #include "tnl-neigh-cache.h" #include "tnl-ports.h" @@ -970,6 +971,96 @@ sorted_poll_thread_list(struct dp_netdev *dp, *n = k; } +static void * +ovs_keepalive(void *f_ OVS_UNUSED) +{ +pthread_detach(pthread_self()); + +for (;;) { +ovsrcu_quiesce_start(); +usleep(get_ka_interval() * 1000); +ovsrcu_quiesce_end(); +} + +return NULL; +} + +static void +ka_thread_start(struct dp_netdev *dp) +{ +static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; + +if (ovsthread_once_start()) { +ovs_thread_create("ovs_keepalive", ovs_keepalive, dp); + +ovsthread_once_done(); +} +} + +static void +pmd_num_poll_ports(struct dp_netdev_pmd_thread *pmd, int *num_poll_ports) +{ +struct svec pmd_port_poll_list; +svec_init(_port_poll_list); + +struct rxq_poll *poll; +const char *port_name; +int i = 0; + +HMAP_FOR_EACH (poll, node, >poll_list) { +svec_add(_port_poll_list, netdev_rxq_get_name(poll->rxq->rx)); +} +/* With MQ enabled, remove the duplicates. 
*/ +svec_sort_unique(_port_poll_list); +SVEC_FOR_EACH (i, port_name, _port_poll_list) { +VLOG_DBG("%d Port:%s", i, port_name); +} +svec_destroy(_port_poll_list); + +*num_poll_ports = i; +VLOG_DBG("PMD thread [%d] polling [%d] ports", + pmd->core_id, *num_poll_ports); +} + +static void +ka_register_datapath_threads(struct dp_netdev *dp) +{ +int ka_init = get_ka_init_status(); +VLOG_DBG("Keepalive: Was initialization successful? [%s]", +ka_init ? "Success" : "Failure"); +if (!ka_init) { +return; +} + +ka_thread_start(dp); + +struct dp_netdev_pmd_thread *pmd; +CMAP_FOR_EACH (pmd, node, >poll_threads) { +/* Skip PMD thread with no rxqs mapping. */ +if (OVS_UNLIKELY(!hmap_count(>poll_list))) { +continue; +} + +/* Register only PMD threads. */ +if (pmd->core_id != NON_PMD_CORE_ID) { +int err; +int nports; +pmd_num_poll_ports(pmd, ); +err = ka_alloc_portstats(pmd->core_id, nports); +if (err) { +VLOG_FATAL("Unable to allocate memory for PMD core %d", +pmd->core_id); +return; +} + +int tid = ka_get_pmd_tid(pmd->core_id); +ka_register_thread(tid, true); +VLOG_DBG("Registered PMD thread [%d] on Core [%d] to KA framework", + tid, pmd->core_id); +} +} +} + static void dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[], void *aux) @@ -3541,6 +3632,9 @@ reconfigure_datapath(struct dp_netdev *dp) /* Reload affected pmd threads. */ reload_affected_pmds(dp); + +/* Register datapath threads to KA monitoring. */ +ka_register_datapath_threads(dp); } /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */ @@ -3740,6 +3834,9 @@ reload: poll_list[i].port_no); } +/* Mark PMD thread alive. 
*/ +ka_mark_pmd_thread_alive(); + if (lc++ > 1024) { bool reload; @@ -3770,6 +3867,9 @@ reload: goto reload; } +int tid = ka_get_pmd_tid(pmd->core_id); +ka_unregister_thread(tid, true); + free(poll_list); pmd_free_cached_ports(pmd); return NULL; diff --git a/lib/keepalive.c b/lib/keepalive.c index 54faf49..64ab117 100644 --- a/lib/keepalive.c +++ b/lib/keepalive.c @@ -25,6 +25,7 @@ #include "keepalive.h" #include "lib/vswitch-idl.h" #include "openvswitch/vlog.h" +#include "process.h" VLOG_DEFINE_THIS_MODULE(keepalive); @@ -76,21 +77,77 @@ ka_store_pmd_id(unsigned core_idx) } } -/* Register packet processing PMD thread to KA framework. */ +/* Register thread to KA framework. */ void -ka_register_pmd_thread(int tid OVS_UNUSED, unsigned core_id) +ka_register_thread(int tid, bool
[ovs-dev] [RFC PATCH v3 07/18] dpif-netdev: Enable heartbeats for DPDK datapath.
This commit adds heartbeat mechanism support for DPDK datapath. Heartbeats are sent to registered PMD threads at predefined intervals (as set in ovsdb with 'keepalive-interval'). The heartbeats are only enabled when there is atleast one port added to the bridge and with active PMD thread polling the port. Signed-off-by: Bhanuprakash Bodireddy--- lib/dpdk-stub.c | 6 ++ lib/dpdk.c| 7 +++ lib/dpdk.h| 2 ++ lib/dpif-netdev.c | 9 - lib/keepalive.c | 9 + lib/keepalive.h | 1 + 6 files changed, 33 insertions(+), 1 deletion(-) diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c index d7fb19b..bf7b891 100644 --- a/lib/dpdk-stub.c +++ b/lib/dpdk-stub.c @@ -72,3 +72,9 @@ dpdk_mark_pmd_core_sleep(void) { /* Nothing */ } + +void +dpdk_dispatch_pmd_hb(void) +{ +/* Nothing */ +} diff --git a/lib/dpdk.c b/lib/dpdk.c index 917ef58..231d045 100644 --- a/lib/dpdk.c +++ b/lib/dpdk.c @@ -537,3 +537,10 @@ dpdk_mark_pmd_core_sleep(void) { rte_keepalive_mark_sleep(rte_global_keepalive_info); } + +/* Dispatch pings */ +void +dpdk_dispatch_pmd_hb(void) +{ +rte_keepalive_dispatch_pings(NULL, rte_global_keepalive_info); +} diff --git a/lib/dpdk.h b/lib/dpdk.h index 177624d..9fb438d 100644 --- a/lib/dpdk.h +++ b/lib/dpdk.h @@ -48,4 +48,6 @@ void dpdk_unregister_pmd_core(unsigned core_id); void dpdk_mark_pmd_core_alive(void); void dpdk_mark_pmd_core_sleep(void); +void dpdk_dispatch_pmd_hb(void); + #endif /* dpdk.h */ diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 4b7c835..cf4d68c 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -972,11 +972,18 @@ sorted_poll_thread_list(struct dp_netdev *dp, } static void * -ovs_keepalive(void *f_ OVS_UNUSED) +ovs_keepalive(void *f_) { +struct dp_netdev *dp = f_; + pthread_detach(pthread_self()); for (;;) { +int n_pmds = cmap_count(>poll_threads) - 1; +if (n_pmds > 0) { +dispatch_heartbeats(); +} + ovsrcu_quiesce_start(); usleep(get_ka_interval() * 1000); ovsrcu_quiesce_end(); diff --git a/lib/keepalive.c b/lib/keepalive.c index 64ab117..353f1d1 
100644 --- a/lib/keepalive.c +++ b/lib/keepalive.c @@ -239,6 +239,15 @@ ka_destroy_portstats(void) } } +/* Dispatch pings */ +void +dispatch_heartbeats(void) +{ +#ifdef DPDK_NETDEV +dpdk_dispatch_pmd_hb(); +#endif +} + static struct keepalive_info * keepalive_info_create(void) { diff --git a/lib/keepalive.h b/lib/keepalive.h index f1e232d..cfe02e5 100644 --- a/lib/keepalive.h +++ b/lib/keepalive.h @@ -102,4 +102,5 @@ int ka_get_pmd_tid(unsigned core); int ka_alloc_portstats(unsigned, int); void ka_destroy_portstats(void); +void dispatch_heartbeats(void); #endif /* keepalive.h */ -- 2.4.11 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [RFC PATCH v3 05/18] dpif-netdev: Add helper function to store datapath tids.
This commit adds an API to store the PMD thread ids in to KA info struct. The thread ids shall be used to check false positives and for status and statistics reporting. Signed-off-by: Bhanuprakash Bodireddy--- lib/dpif-netdev.c | 3 +++ lib/keepalive.c | 13 + lib/keepalive.h | 1 + 3 files changed, 17 insertions(+) diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 2b65dc7..ce141e8 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -48,6 +48,7 @@ #include "fat-rwlock.h" #include "flow.h" #include "hmapx.h" +#include "keepalive.h" #include "latch.h" #include "netdev.h" #include "netdev-vport.h" @@ -3708,6 +3709,8 @@ pmd_thread_main(void *f_) poll_list = NULL; +ka_store_pmd_id(pmd->core_id); + /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */ ovsthread_setspecific(pmd->dp->per_pmd_key, pmd); ovs_numa_thread_setaffinity_core(pmd->core_id); diff --git a/lib/keepalive.c b/lib/keepalive.c index 747d947..54faf49 100644 --- a/lib/keepalive.c +++ b/lib/keepalive.c @@ -63,6 +63,19 @@ get_ka_init_status(void) return ka_init_status; } +void +ka_store_pmd_id(unsigned core_idx) +{ +int tid = -1; +#ifdef DPDK_NETDEV +tid = rte_sys_gettid(); +#endif + +if (ka_is_enabled()) { +ka_info->thread_id[core_idx] = tid; +} +} + /* Register packet processing PMD thread to KA framework. */ void ka_register_pmd_thread(int tid OVS_UNUSED, unsigned core_id) diff --git a/lib/keepalive.h b/lib/keepalive.h index a35b309..67f89da 100644 --- a/lib/keepalive.h +++ b/lib/keepalive.h @@ -76,6 +76,7 @@ void ka_unregister_pmd_thread(int, unsigned); void ka_mark_pmd_thread_alive(void); void ka_mark_pmd_thread_sleep(void); +void ka_store_pmd_id(unsigned core); uint32_t get_ka_interval(void); int get_ka_init_status(void); int ka_get_pmd_tid(unsigned core); -- 2.4.11 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [RFC PATCH v3 04/18] keepalive: Add more helper functions to KA framework.
This commit introduces helper functions in 'keepalive' module that are needed to register/unregister PMD threads to KA framework. Also introduce APIs to mark the PMD core states. Signed-off-by: Bhanuprakash Bodireddy--- lib/keepalive.c | 49 + lib/keepalive.h | 9 + 2 files changed, 58 insertions(+) diff --git a/lib/keepalive.c b/lib/keepalive.c index 7d1c01c..747d947 100644 --- a/lib/keepalive.c +++ b/lib/keepalive.c @@ -50,6 +50,55 @@ ka_get_pmd_tid(unsigned core_idx) return tid; } +/* Return the Keepalive timer interval. */ +inline uint32_t +get_ka_interval(void) +{ +return keepalive_timer_interval; +} + +inline int +get_ka_init_status(void) +{ +return ka_init_status; +} + +/* Register packet processing PMD thread to KA framework. */ +void +ka_register_pmd_thread(int tid OVS_UNUSED, unsigned core_id) +{ +if (ka_is_enabled()) { +dpdk_register_pmd_core(core_id); +} +} + +/* Unregister packet processing PMD thread from KA framework. */ +void +ka_unregister_pmd_thread(int tid OVS_UNUSED, unsigned core_id) +{ +if (ka_is_enabled()) { +dpdk_unregister_pmd_core(core_id); +} +} + +/* Mark packet processing core alive. */ +inline void +ka_mark_pmd_thread_alive(void) +{ +if (ka_is_enabled()) { +dpdk_mark_pmd_core_alive(); +} +} + +/* Mark packet processing core as idle. 
*/ +inline void +ka_mark_pmd_thread_sleep(void) +{ +if (ka_is_enabled()) { +dpdk_mark_pmd_core_sleep(); +} +} + void ka_set_pmd_state_ts(unsigned core_id, enum keepalive_state state, uint64_t last_alive) diff --git a/lib/keepalive.h b/lib/keepalive.h index b87b66f..a35b309 100644 --- a/lib/keepalive.h +++ b/lib/keepalive.h @@ -71,4 +71,13 @@ void ka_set_pmd_state_ts(unsigned, enum keepalive_state, uint64_t); int ka_get_pmd_tid(unsigned core); bool ka_is_enabled(void); +void ka_register_pmd_thread(int, unsigned); +void ka_unregister_pmd_thread(int, unsigned); +void ka_mark_pmd_thread_alive(void); +void ka_mark_pmd_thread_sleep(void); + +uint32_t get_ka_interval(void); +int get_ka_init_status(void); +int ka_get_pmd_tid(unsigned core); + #endif /* keepalive.h */ -- 2.4.11 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [RFC PATCH v3 03/18] Keepalive: Add initial keepalive support.
This commit introduces the initial keepalive support by adding 'keepalive' module and also helper and initialization functions that will be invoked by later commits. This commit adds new ovsdb column "keepalive" that shows the status of the datapath threads. This is implemented for DPDK datapath and only status of PMD threads is reported. For eg: To enable keepalive feature. 'ovs-vsctl --no-wait set Open_vSwitch . other_config:enable-keepalive=true' To set timer interval of 5000ms for monitoring packet processing cores. 'ovs-vsctl --no-wait set Open_vSwitch . \ other_config:keepalive-interval="5000" Signed-off-by: Bhanuprakash Bodireddy--- lib/automake.mk| 2 + lib/dpdk.c | 17 + lib/dpdk.h | 2 + lib/keepalive.c| 160 + lib/keepalive.h| 74 + lib/netdev-dpdk.c | 61 - lib/netdev-dpdk.h | 5 ++ vswitchd/bridge.c | 5 ++ vswitchd/vswitch.ovsschema | 8 ++- vswitchd/vswitch.xml | 49 ++ 10 files changed, 380 insertions(+), 3 deletions(-) create mode 100644 lib/keepalive.c create mode 100644 lib/keepalive.h diff --git a/lib/automake.mk b/lib/automake.mk index 54a1032..8f6e146 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -110,6 +110,8 @@ lib_libopenvswitch_la_SOURCES = \ lib/json.c \ lib/jsonrpc.c \ lib/jsonrpc.h \ + lib/keepalive.c \ + lib/keepalive.h \ lib/lacp.c \ lib/lacp.h \ lib/latch.h \ diff --git a/lib/dpdk.c b/lib/dpdk.c index 8db63bf..917ef58 100644 --- a/lib/dpdk.c +++ b/lib/dpdk.c @@ -32,6 +32,7 @@ #include "dirs.h" #include "fatal-signal.h" +#include "keepalive.h" #include "netdev-dpdk.h" #include "openvswitch/dynamic-string.h" #include "openvswitch/vlog.h" @@ -477,6 +478,22 @@ dpdk_init(const struct smap *ovs_other_config) } } +int +dpdk_ka_init(struct keepalive_info *ka_info) +{ +/* Initialize keepalive subsystem */ +if ((rte_global_keepalive_info = +rte_keepalive_create(_failcore_cb, ka_info)) == NULL) { +VLOG_ERR("Keepalive initialization failed."); +return -1; +} else { +rte_keepalive_register_relay_callback(rte_global_keepalive_info, 
+dpdk_ka_update_core_state, ka_info); +} + +return 0; +} + const char * dpdk_get_vhost_sock_dir(void) { diff --git a/lib/dpdk.h b/lib/dpdk.h index bdbb51b..177624d 100644 --- a/lib/dpdk.h +++ b/lib/dpdk.h @@ -34,9 +34,11 @@ #endif /* DPDK_NETDEV */ struct smap; +struct keepalive_info; struct rte_keepalive *rte_global_keepalive_info; void dpdk_init(const struct smap *ovs_other_config); +int dpdk_ka_init(struct keepalive_info *ka_info); void dpdk_set_lcore_id(unsigned cpu); const char *dpdk_get_vhost_sock_dir(void); diff --git a/lib/keepalive.c b/lib/keepalive.c new file mode 100644 index 000..7d1c01c --- /dev/null +++ b/lib/keepalive.c @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2014, 2015, 2016, 2017 Nicira, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "dpdk.h" +#include "keepalive.h" +#include "lib/vswitch-idl.h" +#include "openvswitch/vlog.h" + +VLOG_DEFINE_THIS_MODULE(keepalive); + +static bool keepalive_enable = false;/* Keepalive disabled by default */ +static bool ka_init_status = ka_init_failure; /* Keepalive initialization */ +static uint32_t keepalive_timer_interval; /* keepalive timer interval */ +static struct keepalive_info *ka_info = NULL; + +inline bool +ka_is_enabled(void) +{ +return keepalive_enable; +} + +inline int +ka_get_pmd_tid(unsigned core_idx) +{ +int tid = -1; +if (ka_is_enabled()) { +tid = ka_info->thread_id[core_idx]; +} +ovs_assert(tid > 0); +return tid; +} + +void +ka_set_pmd_state_ts(unsigned core_id, enum keepalive_state state, +uint64_t last_alive) +{ +struct ka_process_info *pinfo; +int tid = ka_get_pmd_tid(core_id); + +ovs_mutex_lock(_info->proclist_mutex); +HMAP_FOR_EACH_WITH_HASH (pinfo, node, hash_int(tid, 0), + _info->process_list) { +if ((pinfo->core_id == core_id) && (pinfo->tid == tid)) { +pinfo->core_state = state; +pinfo->core_last_seen_times =
[ovs-dev] [RFC PATCH v3 01/18] dpdk: Add helper functions for DPDK datapath keepalive.
Introduce helper functions in 'dpdk' module that are needed for DPDK keepalive functionality. Also add dummy functions in 'dpdk-stub' module that are needed when DPDK datapath is not available. Signed-off-by: Bhanuprakash Bodireddy--- lib/dpdk-stub.c | 24 lib/dpdk.c | 31 +++ lib/dpdk.h | 10 ++ 3 files changed, 65 insertions(+) diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c index daef729..d7fb19b 100644 --- a/lib/dpdk-stub.c +++ b/lib/dpdk-stub.c @@ -48,3 +48,27 @@ dpdk_get_vhost_sock_dir(void) { return NULL; } + +void +dpdk_register_pmd_core(unsigned core_id OVS_UNUSED) +{ +/* Nothing */ +} + +void +dpdk_unregister_pmd_core(unsigned core_id OVS_UNUSED) +{ +/* Nothing */ +} + +void +dpdk_mark_pmd_core_alive(void) +{ +/* Nothing */ +} + +void +dpdk_mark_pmd_core_sleep(void) +{ +/* Nothing */ +} diff --git a/lib/dpdk.c b/lib/dpdk.c index 8da6c32..8db63bf 100644 --- a/lib/dpdk.c +++ b/lib/dpdk.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #ifdef DPDK_PDUMP @@ -489,3 +490,33 @@ dpdk_set_lcore_id(unsigned cpu) ovs_assert(cpu != NON_PMD_CORE_ID); RTE_PER_LCORE(_lcore_id) = cpu; } + +/* Register packet processing core 'core_id' for liveness checks. */ +void +dpdk_register_pmd_core(unsigned core) +{ +rte_keepalive_register_core(rte_global_keepalive_info, core); +} + +void +dpdk_unregister_pmd_core(unsigned core OVS_UNUSED) +{ +/* XXX: DPDK unfortunately hasn't implemented unregister API + * This will be fixed later, instead use sleep API now. + */ +rte_keepalive_mark_sleep(rte_global_keepalive_info); +} + +/* Mark packet processing core alive. */ +void +dpdk_mark_pmd_core_alive(void) +{ +rte_keepalive_mark_alive(rte_global_keepalive_info); +} + +/* Mark packet processing core as idle. 
*/ +void +dpdk_mark_pmd_core_sleep(void) +{ +rte_keepalive_mark_sleep(rte_global_keepalive_info); +} diff --git a/lib/dpdk.h b/lib/dpdk.h index 673a1f1..bdbb51b 100644 --- a/lib/dpdk.h +++ b/lib/dpdk.h @@ -17,6 +17,7 @@ #ifndef DPDK_H #define DPDK_H +#include #ifdef DPDK_NETDEV #include @@ -26,14 +27,23 @@ #else +#include + #define NON_PMD_CORE_ID UINT32_MAX #endif /* DPDK_NETDEV */ struct smap; +struct rte_keepalive *rte_global_keepalive_info; void dpdk_init(const struct smap *ovs_other_config); void dpdk_set_lcore_id(unsigned cpu); const char *dpdk_get_vhost_sock_dir(void); +/* Keepalive APIs */ +void dpdk_register_pmd_core(unsigned core_id); +void dpdk_unregister_pmd_core(unsigned core_id); +void dpdk_mark_pmd_core_alive(void); +void dpdk_mark_pmd_core_sleep(void); + #endif /* dpdk.h */ -- 2.4.11 ___ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev
[ovs-dev] [RFC PATCH v3 02/18] process: Add helper functions to retrieve process related info.
Implement helper functions to retrieve the process status, name and last core the process was scheduled. The APIs will be used by keepalive monitoring framework in future commits. Signed-off-by: Bhanuprakash Bodireddy--- lib/process.c | 152 ++ lib/process.h | 13 + 2 files changed, 165 insertions(+) diff --git a/lib/process.c b/lib/process.c index e9d0ba9..4c029c1 100644 --- a/lib/process.c +++ b/lib/process.c @@ -50,6 +50,20 @@ struct process { int status; }; +struct pstate2Num { +char *pidstate; +int num; +}; + +const struct pstate2Num pstate_map[] = { +{ "S", STOPPED_STATE }, +{ "R", ACTIVE_STATE }, +{ "t", TRACED_STATE }, +{ "Z", DEFUNC_STATE }, +{ "D", UNINTERRUPTIBLE_SLEEP_STATE }, +{ "NULL", UNUSED_STATE }, +}; + /* Pipe used to signal child termination. */ static int fds[2]; @@ -390,6 +404,144 @@ process_run(void) #endif } +int +get_process_status(int pid, int *pstate) +{ +#ifdef __linux__ +static char process_name[20]; +FILE *stream; +char line[75]; +char Name[15], value[5], status[20]; +int i, ln; + +snprintf(process_name, sizeof(process_name), + "/proc/%d/status", pid); +stream = fopen(process_name, "r"); +if (stream == NULL) { +VLOG_WARN_ONCE("%s: open failed: %s", process_name, +ovs_strerror(errno)); +return errno; +} + +ln=0; +while (fgets(line, sizeof line, stream)) { +if (!ovs_scan(line, + "%6s %2s %14s\n", + Name, value, status)) { +VLOG_WARN_ONCE("%s: could not parse line %d: %s", +process_name, ln, line); +continue; +} +if (!strcmp(Name, "State:")) { +for (i=0; pstate_map[i].pidstate != NULL; i++) { +if (strcmp(pstate_map[i].pidstate, value) == 0) { +VLOG_WARN_ONCE("The state is %s, status is %d\n", +pstate_map[i].pidstate, pstate_map[i].num); +*pstate = pstate_map[i].num; +break; +} +} +break; +} +ln++; + } + return 0; +#else + return ENOSYS; +#endif +} + +bool +process_is_active(int pid) +{ +#ifdef __linux__ +int pstate; +int err = get_process_status(pid, ); +if (!err) { +if (pstate == ACTIVE_STATE) { +return true; +} +} +return false; +#else + 
return false; +#endif +} + +char * +get_process_name(int pid) +{ +#ifdef __linux__ +static char proc_path[PATH_MAX]; +FILE *stream; +char line[20]; +char *pname = xmalloc(20); + +if (pid == -1) { + VLOG_ERR("Invalid process id : %d", pid); + return NULL; +} + +snprintf(proc_path, sizeof(proc_path), + "/proc/%d/task/%d/comm", pid, pid); +stream = fopen(proc_path, "r"); +if (!stream) { +VLOG_WARN("%s: open failed: %s", proc_path, ovs_strerror(errno)); +return NULL; +} + +if (fgets(line, sizeof line, stream) != NULL) { +if (ovs_scan(line, "%s", pname)) { +return pname; +} +} +return NULL; +#else +return NULL; +#endif +} + +/* Retrieve the last core id that executed the process. + * + * Refer http://man7.org/linux/man-pages/man5/proc.5.html + * and the processor field in /proc/[pid]/stat. + */ +int +get_cpu_num(int pid) +{ +#ifdef __linux__ +static char proc_path[PATH_MAX]; +FILE *stream; +char line[500]; + +snprintf(proc_path, sizeof(proc_path), + "/proc/%d/stat", pid); +stream = fopen(proc_path, "r"); +if (!stream) { +VLOG_WARN_ONCE("%s: open failed: %s", proc_path, ovs_strerror(errno)); +return errno; +} + +int i; +int cpu_id = -1; +if (fgets(line, sizeof line, stream) != NULL) { +char *tok, *endptr = NULL; +for (tok = strtok_r(line, " ", ), i = 1; tok != NULL; +tok = strtok_r(NULL, " ", ), i++) { +VLOG_DBG("token :%s", tok); +if (i == 39) { +cpu_id = atoi(tok); +break; +} +} +} + +ovs_assert(cpu_id >= 0) +return cpu_id; +#else +return ENOSYS; +#endif +} /* Causes the next call to poll_block() to wake up when process 'p' has * exited. */ diff --git a/lib/process.h b/lib/process.h index 3feac7e..041767d 100644 --- a/lib/process.h +++ b/lib/process.h @@ -20,6 +20,15 @@ #include #include +enum process_states { +UNUSED_STATE, +STOPPED_STATE, +ACTIVE_STATE, +TRACED_STATE, +DEFUNC_STATE, +UNINTERRUPTIBLE_SLEEP_STATE +}; + struct process; /* Starting and monitoring subprocesses. 
@@ -38,6 +47,10 @@ bool process_exited(struct process *); int process_status(const struct process *); void process_run(void); void process_wait(struct process *); +int
[ovs-dev] [RFC PATCH v3 00/18] Add OVS DPDK keep-alive functionality
The keepalive feature is aimed at achieving Fastpath Service Assurance in OVS-DPDK deployments. It adds support for monitoring the packet processing cores (PMD thread cores) by dispatching heartbeats at regular intervals. In case of heartbeat misses, additional health checks are enabled on the PMD thread to detect the failure, and the failure shall be reported to higher level fault management systems/frameworks. The implementation uses OVSDB for reporting the datapath status and the health of the PMD threads. Any external monitoring application can read the status from OVSDB at regular intervals or subscribe to the updates in OVSDB so that it gets notified when changes happen in OVSDB. A keepalive info struct is created and initialized for storing the status of the PMD threads. It is initialized by the main thread (vswitchd) as part of the init process and will be periodically updated by the 'keepalive' thread. The keepalive feature can be enabled through the OVSDB settings below. enable-keepalive=true - Keepalive feature is disabled by default. keepalive-interval="5000" - Timer interval in milliseconds for monitoring the packet processing cores. When KA is enabled, an 'ovs-keepalive' thread shall be spawned that wakes up at regular intervals to update the timestamp and status of the PMD cores in the keepalive info struct. This information shall be read by the vswitchd thread, which writes the status into the 'keepalive' column of the Open_vSwitch table in OVSDB. An external monitoring framework like collectd with ovs events support can read or subscribe to the datapath status changes in OVSDB. When the state is updated, collectd shall be notified and will eventually relay the status to the ceilometer service running in the controller. Below is the high level overview of the deployment model.
Compute NodeControllerCompute Node Collectd <--> Ceilometer <> Collectd OvS DPDK OvS DPDK +-+ | VM | +--+--+ \---+---/ | +--+---+ ++--+ +--+---+ | OVS |-> | ovsevents plugin| --> | collectd | +--+---+ ++--+ +--+---+ +--+-+ +---++ | | Ceilometer | <-- | collectd ceilometer plugin | <--- +--+-+ +---++ Performance impact: No noticeable performance or latency impact is observed with KA feature enabled. - v2-> v3 * Remove POSIX shared memory block implementation (suggested by Aaron). * Rework the logic to register and track threads instead of cores. This way in the future any thread can be registered to KA framework. For now only PMD threads are tracked (suggested by Aaron). * Refactor few APIs and further clean up the code. v1-> v2 * Merged the xml and schema commits to later commit where the actual implementation is done(suggested by Ben). * Fix ovs-appctl keepalive/* hang issue when KA disabled. * Fixed memory leaks with appctl commands for keepalive/pmd-health-show, pmd-xstats-show. * Refactored code and fixed APIs dealing with PMD health monitoring. Bhanuprakash Bodireddy (18): [9] patches help update OVSDB with keepalive status dpdk: Add helper functions for DPDK datapath keepalive. process: Add helper functions to retrieve process related info. Keepalive: Add initial keepalive support. keepalive: Add more helper functions to KA framework. dpif-netdev: Add helper function to store datapath tids. dpif-netdev: Register packet processing cores to KA framework. dpif-netdev: Enable heartbeats for DPDK datapath. keepalive: Retrieve PMD status periodically. bridge: Update keepalive status in OVSDB keepalive: Add support to query keepalive statistics. keepalive: Add support to query keepalive status. [5] Patches add additional health checks in case of heartbeat failure. dpif-netdev: Add additional datapath health checks. keepalive: Check the link status as part of PMD health checks. keepalive: Check the packet statistics as part of PMD health checks. 
keepalive: Check the PMD cycle stats as part of PMD health checks. netdev-dpdk: Enable PMD health checks on heartbeat failure. keepalive: Display extended Keepalive status. Documentation: Update DPDK doc with Keepalive feature. Documentation/howto/dpdk.rst | 90 + lib/automake.mk | 2 + lib/dpdk-stub.c | 30 ++ lib/dpdk.c | 55 +++ lib/dpdk.h | 14 + lib/dpif-netdev.c| 203 +- lib/dpif-netdev.h| 6 + lib/keepalive.c | 917 +++ lib/keepalive.h | 150 +++ lib/netdev-dpdk.c| 119 +- lib/netdev-dpdk.h| 5 + lib/process.c| 152 +++