>From: O Mahony, Billy >Sent: Thursday, August 17, 2017 2:36 PM >To: Kavanagh, Mark B <[email protected]>; [email protected] >Subject: RE: [ovs-dev] [PATCH 2/4] netdev-dpdk: Apply ingress_sched config to >dpdk phy ports > >Hi Mark, > >Thanks for the very useful review comments.
Hi Billy, Apologies for the delayed response - some comments inline. Thanks, Mark > >I'll send a rev'd patch set shortly. > >/Billy > >> -----Original Message----- >> From: Kavanagh, Mark B >> Sent: Friday, August 4, 2017 4:14 PM >> To: O Mahony, Billy <[email protected]>; [email protected] >> Subject: RE: [ovs-dev] [PATCH 2/4] netdev-dpdk: Apply ingress_sched config >> to dpdk phy ports >> >> >From: [email protected] >> >[mailto:[email protected]] >> >On Behalf Of Billy O'Mahony >> >Sent: Thursday, July 20, 2017 5:11 PM >> >To: [email protected] >> >Subject: [ovs-dev] [PATCH 2/4] netdev-dpdk: Apply ingress_sched config >> >to dpdk phy ports >> > >> >Ingress scheduling configuration is given effect by way of Flow >> >Director filters. A small subset of the ingress scheduling possible is >> >implemented in this patch. >> > >> >Signed-off-by: Billy O'Mahony <[email protected]> >> >> Hi Billy, >> >> As a general comment, this patch doesn't apply to HEAD of master, so please >> rebase as part of rework. >> >> Review comments inline. >> >> Thanks, >> Mark >> >> >> >--- >> > include/openvswitch/ofp-parse.h | 3 + >> > lib/dpif-netdev.c | 1 + >> > lib/netdev-dpdk.c | 167 >> ++++++++++++++++++++++++++++++++++++++- >> >- >> > vswitchd/bridge.c | 2 + >> > 4 files changed, 166 insertions(+), 7 deletions(-) >> > >> >diff --git a/include/openvswitch/ofp-parse.h >> >b/include/openvswitch/ofp-parse.h index fc5784e..08d6086 100644 >> >--- a/include/openvswitch/ofp-parse.h >> >+++ b/include/openvswitch/ofp-parse.h >> >@@ -37,6 +37,9 @@ struct ofputil_table_mod; struct ofputil_bundle_msg; >> >struct ofputil_tlv_table_mod; struct simap; >> >+struct tun_table; >> >+struct flow_wildcards; >> >+struct ofputil_port_map; >> > enum ofputil_protocol; >> > >> > char *parse_ofp_str(struct ofputil_flow_mod *, int command, const char >> >*str_, diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index >> >47a9fa0..d35566f 100644 >> >--- a/lib/dpif-netdev.c >> >+++ b/lib/dpif-netdev.c >> >@@ -44,6 +44,7 @@ >> > #include "dp-packet.h" >> > #include "dpif.h" >> > #include "dpif-provider.h" >> >+#include "netdev-provider.h" >> >> If a setter function for modifying netdev->ingress_sched_str is >> implemented, there is no need to include netdev-provider.h >> >> > #include "dummy.h" >> > #include "fat-rwlock.h" >> > #include "flow.h" >> >diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index >> >e74c50f..e393abf 100644 >> >--- a/lib/netdev-dpdk.c >> >+++ b/lib/netdev-dpdk.c >> >@@ -33,6 +33,8 @@ >> > #include <rte_meter.h> >> > #include <rte_virtio_net.h> >> > >> >+#include <openvswitch/ofp-parse.h> >> >+#include <openvswitch/ofp-util.h> >> >> Move these include below, with the other openvswitch include file. >> >> > #include "dirs.h" >> > #include "dp-packet.h" >> > #include "dpdk.h" >> >@@ -169,6 +171,10 @@ static const struct rte_eth_conf port_conf = { >> > .txmode = { >> > .mq_mode = ETH_MQ_TX_NONE, >> > }, >> >+ .fdir_conf = { >> >+ .mode = RTE_FDIR_MODE_PERFECT, >> >> As you mentioned in your cover letter, you've only tested on a Fortville >NIC. >> How widely supported are the Flow Director features across DPDK- >> supported NICs? >[[BO'M]] I'm not sure. Probably many NICs have the capabilities required - but >I don't know if the dpdk drivers expose it. Okay - this would obviously limit the viability of this patchset. I believe that you're taking a different approach going forward though... >> >> >+ }, >> >+ >> > }; >> > >> > enum { DPDK_RING_SIZE = 256 }; >> >@@ -330,6 +336,11 @@ enum dpdk_hw_ol_features { >> > NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0, }; >> > >> >+union ingress_filter { >> >+ struct rte_eth_ethertype_filter ethertype; >> >+ struct rte_eth_fdir_filter fdir; >> >+}; >> >+ >> > struct netdev_dpdk { >> > struct netdev up; >> > dpdk_port_t port_id; >> >@@ -369,8 +380,11 @@ struct netdev_dpdk { >> > /* If true, device was attached by rte_eth_dev_attach(). */ >> > bool attached; >> > >> >- /* Ingress Scheduling config */ >> >+ /* Ingress Scheduling config & state. */ >> > char *ingress_sched_str; >> >+ bool ingress_sched_changed; >> >+ enum rte_filter_type ingress_filter_type; >> >+ union ingress_filter ingress_filter; >> > >> > /* In dpdk_list. */ >> > struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex); @@ -653,6 >> >+667,15 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int >> n_rxq, >> >int n_txq) >> > int i; >> > struct rte_eth_conf conf = port_conf; >> > >> >+ /* Ingress scheduling requires ETH_MQ_RX_NONE so limit it to when >> exactly >> >+ * two rxqs are defined. Otherwise MQ will not work as expected. >> >+ */ >> >> So if a user later enables multiq (i.e. > 2 rxqs), that implicitly disables >ingress >> scheduling? >[[BO'M]] Yes. RSS and Ingress Scheduling are incompatible - at least I >couldn't figure out how to make them compatible. It could be setup so that RSS >would be disabled but that would be favoring the existing common use case of >the less common new use case. Okay. >> >> In any event, the comment here needs to be much clearer, as this is the >first >> mention within the code that ingress scheduling is limited to just 2 rxqs. >Is >> that limitation just for the PoC? >[[BO'M]] Unless I can figure out how to do stop RSS polluting the priority >queue. Sure. >> >> It may also be useful to briefly explain why RSS prevents ingress >scheduling. >[[BO'M]] I've expanded the comments to include an explanation of the issue. Great - thanks! >> >> >+ if (dev->ingress_sched_str && n_rxq == 2) { >> >+ conf.rxmode.mq_mode = ETH_MQ_RX_NONE; >> >+ } >> >+ else { >> >+ conf.rxmode.mq_mode = ETH_MQ_RX_RSS; >> >+ } >> >+ >> > if (dev->mtu > ETHER_MTU) { >> > conf.rxmode.jumbo_frame = 1; >> > conf.rxmode.max_rx_pkt_len = dev->max_packet_len; @@ -730,6 >> >+753,121 @@ dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) >> >OVS_REQUIRES(dev->mutex) >> > } >> > } >> > >> >+static void >> >> Why no return value? >[[BO'M]] The caller has nothing to do whether the config works or fails. Which >makes sense as Ingress scheduling is best-effort. So if the config fails to be >applied for any reason the rest of dpdk_eth_dev_init should just continue. Ah okay, since it's best-effort, that's fine. >> >> >+dpdk_apply_ingress_scheduling(struct netdev_dpdk *dev, int n_rxq) { >> >+ if (!dev->ingress_sched_str) { >> >+ return; >> >+ } >> >+ >> >+ if (n_rxq != 2) { >> >> If n_rxq is always 2, then remove that parameter from this function, and >> perform the check (i.e. n_rxq ==2) in dpdk_eth_dev_init instead. >> >> >+ VLOG_ERR("Interface %s: Ingress scheduling config ignored; " \ >> >+ "Requires n_rxq==2.", dev->up.name); >> >+ } >> >+ >> >+ int priority_q_id = n_rxq-1; >> >> So, this is always 1, right? >[[BO'M]] Yes. Your right that it's pointless to carry around n_rxq if the >value must always be two. But I'm going to leave it as is for now in case I >figure out how to get past that limitation. Hmmm, I'm not sure that I'd agree with shaping the API on the basis that a solution _might_ exist at some point in the future. I'd prefer to see n_rxq removed now, and the code subsequently refactored if/when the RSS limitation is overcome. >> >> >+ char *key, *val, *str, *iter; >> >+ >> >+ ovs_be32 ip_src, ip_dst; >> >+ ip_src = ip_dst = 0; >> >+ >> >+ uint16_t eth_type, port_src, port_dst; >> >+ eth_type = port_src = port_dst = 0; >> >+ uint8_t ip_proto = 0; >> >+ int diag = 0; >> >+ >> >+ /* delete any existing filter */ >> >+ if (dev->ingress_filter_type == RTE_ETH_FILTER_FDIR) { >> >+ diag = rte_eth_dev_filter_ctrl(dev->port_id, RTE_ETH_FILTER_FDIR, >> >+ RTE_ETH_FILTER_DELETE, &dev->ingress_filter.fdir); >> >+ } else if (dev->ingress_filter_type == RTE_ETH_FILTER_ETHERTYPE) { >> >+ diag = rte_eth_dev_filter_ctrl(dev->port_id, >> >RTE_ETH_FILTER_ETHERTYPE, >> >+ RTE_ETH_FILTER_DELETE, &dev->ingress_filter.ethertype); >> >+ } >> >+ >> >+ char *mallocd_str; /* str_to_x returns malloc'd str we'll need to >> >+ free */ >> >> Move inline comment above the line. Also, malloc'd string is only returned >in >> the event of error (otherwise NULL), so you might want to clarify the >> comment. >[[BO'M]] Done. Thanks. >> >> >+ /* Parse the configuration into local vars */ >> >+ iter = str = xstrdup(dev->ingress_sched_str); >> >> Why are two strings (i.e. iter and str) needed here? >[[BO'M]] str remembers the malloc'd str for freeing. Iter is modified by each >call in the loop to ofputil_parse_key_value. >> >> >+ while (ofputil_parse_key_value (&iter, &key, &val)) { >> >+ if (strcmp(key, "nw_src") == 0 || strcmp(key, "ip_src") == 0) { >> >+ mallocd_str = str_to_ip(val, &ip_src); >> >+ } else if (strcmp(key, "nw_dst") == 0 || strcmp(key, "ip_dst") >> >+ == 0) >> >{ >> >+ mallocd_str = str_to_ip(val, &ip_dst); >> >+ } else if (strcmp(key, "dl_type") == 0 || >> >+ strcmp(key, "eth_type") == 0) { >> >+ mallocd_str = str_to_u16(val, "eth_type/dl_type", ð_type); >> >+ } else if (strcmp(key, "tcp_src") == 0 || >> >+ strcmp(key, "tp_src") == 0 || >> >+ strcmp(key, "udp_src") == 0) { >> >+ mallocd_str = str_to_u16(val, "tcp/udp_src", &port_src); >> >+ } else if (strcmp(key, "tcp_dst") == 0 || >> >+ strcmp(key, "tp_dst") == 0 || >> >+ strcmp(key, "udp_dst") == 0) { >> >+ mallocd_str = str_to_u16(val, "tcp/udp_dst", &port_dst); >> >+ } else if (strcmp(key, "ip") == 0) { >> >> Use defined macros for well-known values - examples below. >> >> >+ eth_type = 0x0800; >> >> ETH_P_IP <linux/if_ether.h> >[[BO'M]] Thanks for #include tips Np. >> >> >+ } else if (strcmp(key, "udp") == 0) { >> >+ eth_type = 0x0800; >> >+ ip_proto = 17; >> >> IPPROTO_UDP <netinet/in.h> >> >> >+ } else if (strcmp(key, "tcp") == 0) { >> >+ eth_type = 0x0800; >> >+ ip_proto = 6; >> >> IPPROTO_TCP <netinet/in.h> >> >> >+ } else { >> >+ VLOG_WARN("Ignoring unsupported ingress scheduling field >'%s'", \ >> >+ key); >> >+ } >> >+ if (mallocd_str) { >> >+ VLOG_ERR ("%s", mallocd_str); >> >+ free(mallocd_str); >> >+ mallocd_str = NULL; >> >+ } >> >+ } >> >+ free (str); >> >> Breaking out the parsing into a separate function would make this function >> more readable. >[[BO'M]] I'm not sure. It would make this function shorter sure. But a reader >has to jump back and forth between fn bodies to follow what is a sequential >procedure. Sure, but the reader may not care _how_ the string is parsed, just that it _is_ parsed (e.g. parse_sched_str(dev->ingress_sched_str)). This function is already quite lengthy, and some brevity could be advantageous. >> >> >+ >> >+ /* Set the filters */ >> >+ if (eth_type && ip_src && ip_dst && port_src && port_dst && ip_proto) >> { >> >+ struct rte_eth_fdir_filter entry = { 0 }; >> >> This causes a compiler error for me. >[[BO'M]] What is the error, compiler version and build/config line? >I'm using gcc version 5.4.0 20160609 and ./boot.sh && ./configure --enable- >Werror --with-dpdk=...dpdk/x86_64-native-linuxapp-gcc && make -j >> >> >+ entry.input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_UDP; >> >> Is only UDP flow supported for the POC? >[[BO'M]] Good catch - that's a bug actually! Okay. >> >> >+ entry.input.flow.udp4_flow.ip.src_ip = ip_src; >> >+ entry.input.flow.udp4_flow.ip.dst_ip = ip_dst; >> >+ entry.input.flow.udp4_flow.src_port = htons(port_src); >> >+ entry.input.flow.udp4_flow.dst_port = htons(port_dst); >> >+ entry.action.rx_queue = priority_q_id; >> >+ entry.action.behavior = RTE_ETH_FDIR_ACCEPT; >> >+ entry.action.report_status = RTE_ETH_FDIR_REPORT_ID; >> >+ diag = rte_eth_dev_filter_ctrl(dev->port_id, >> >+ RTE_ETH_FILTER_FDIR, RTE_ETH_FILTER_ADD, &entry); >> >+ dev->ingress_filter_type = RTE_ETH_FILTER_FDIR; >> >+ dev->ingress_filter.fdir = entry; >> >+ } >> >+ else if (eth_type && !ip_src && !ip_dst && !port_src >> >+ && !port_dst && !ip_proto) { >> >+ struct rte_eth_ethertype_filter entry = {0}; >> >+ memset (&entry, 0, sizeof entry); >> >+ entry.ether_type = eth_type; >> >+ entry.flags = 0; >> >+ entry.queue = priority_q_id; >> >+ diag = rte_eth_dev_filter_ctrl(dev->port_id, >> >+ RTE_ETH_FILTER_ETHERTYPE, RTE_ETH_FILTER_ADD, &entry); >> >+ dev->ingress_filter.ethertype = entry; >> >+ dev->ingress_filter_type = RTE_ETH_FILTER_ETHERTYPE; >> >+ } >> >+ else { >> >+ VLOG_ERR("Unsupported ingress scheduling match-field >> combination."); >> >+ dev->ingress_filter_type = RTE_ETH_FILTER_NONE; >> >+ return; >> >+ } >> >+ >> >+ if (diag) { >> >+ dev->ingress_filter_type = RTE_ETH_FILTER_NONE; >> >+ VLOG_ERR("Failed to add ingress scheduling filter."); >> >+ } >> >+ else { >> >+ /* Mark the appropriate q as prioritized */ >> >+ dev->up.priority_rxq = priority_q_id; >> >+ } >> >+} >> >+ >> > static int >> > dpdk_eth_dev_init(struct netdev_dpdk *dev) >> > OVS_REQUIRES(dev->mutex) >> >@@ -764,6 +902,8 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev) >> > return -diag; >> > } >> > >> >+ dpdk_apply_ingress_scheduling(dev, n_rxq); >> >+ >> > diag = rte_eth_dev_start(dev->port_id); >> > if (diag) { >> > VLOG_ERR("Interface %s start error: %s", dev->up.name, @@ >> >-870,6 +1010,9 @@ common_construct(struct netdev *netdev, >> dpdk_port_t >> >port_no, >> > dev->requested_rxq_size = NIC_PORT_DEFAULT_RXQ_SIZE; >> > dev->requested_txq_size = NIC_PORT_DEFAULT_TXQ_SIZE; >> > >> >+ dev->ingress_sched_str = NULL; >> >+ dev->ingress_sched_changed = false; >> >+ dev->ingress_filter_type = RTE_ETH_FILTER_NONE; >> > /* Initialize the flow control to NULL */ >> > memset(&dev->fc_conf, 0, sizeof dev->fc_conf); >> > >> >@@ -1950,11 +2093,20 @@ netdev_dpdk_set_ingress_sched(struct netdev >> >*netdev, { >> > struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); >> > >> >- free(dev->ingress_sched_str); >> >- if (ingress_sched_str) { >> >- dev->ingress_sched_str = xstrdup(ingress_sched_str); >> >+ if ((ingress_sched_str && dev->ingress_sched_str && >> >+ strcmp(ingress_sched_str, dev->ingress_sched_str) == 0) || >> >+ (!ingress_sched_str && !dev->ingress_sched_str)) { >> >+ /* no-op; new cfg == old cfg or else both are NULL */ >> >> Just one may be NULL, not necessarily both. >[[BO'M]] If just one is null it's treated as an update being required ie the >else section Oh, I was actually referring to the comment that says 'or else both are NULL'. What I meant was that it should be updated to say 'or else one (or both) are NULL. Sorry for the confusion! >> >> >+ return 0; >> >+ } else { >> >+ /* free the old, copy in the new */ >> >+ free(dev->ingress_sched_str); >> >+ if (ingress_sched_str) { >> >+ dev->ingress_sched_str = xstrdup(ingress_sched_str); >> >+ } >> >+ dev->ingress_sched_changed = true; >> >+ netdev_request_reconfigure(netdev); >> >> Why trigger a netdev_request_reconfigure here? The ingress_sched_str has >> already been set, so it doesn't make sense. > >[[BO'M]] Short answer is that the reconfiguration won't work without this. >When updated configuration from ovsdb is percolated across the various objects >(bridges, ports, interfaces...) they each look at the new configuration and >decide if the change to the config requires them to be stopped reconfigured >and restarted. It's the same for the requested_txq_size and friends in >netdev_dpdk_set_config which also ends up calling netdev_request_reconfigure >if the configuration has changed. > >The ingress scheduling config is held in 'other_config' section of Interface >table - the request_txq_size and friends come from the main config section >which is passed to netdev_dpdk_set_config but the other_config section is not >passed down to that function. So it's not available there. > >It would've been easier if the ingress scheduling was placed in the main >Interface configuration section but it was proposed to be put in other_config >section initially and I stuck with that. > >> A solution may be to add another netdev member >> 'requested_ingress_sched_str', which is assigned the value of >> 'ingress_sched_str' in this function. >> Then, in netdev_dpdk_reconfigure, dev->ingress_sched_str = dev- >> >requested_ingress_sched_str. Any comment on this suggestion? >> >> > } >> >- >> > return 0; >> > } >> > >> >@@ -3112,12 +3264,13 @@ netdev_dpdk_reconfigure(struct netdev >> *netdev) >> > && dev->mtu == dev->requested_mtu >> > && dev->rxq_size == dev->requested_rxq_size >> > && dev->txq_size == dev->requested_txq_size >> >- && dev->socket_id == dev->requested_socket_id) { >> >+ && dev->socket_id == dev->requested_socket_id >> >+ && !dev->ingress_sched_changed) { >> >> If you make the changes I've suggested above, the last line above should >> read 'dev->requested_ingress_sched_str == dev->ingress_sched_str'. >> >> >> > /* Reconfiguration is unnecessary */ >> >- >> > goto out; >> > } >> > >> >+ dev->ingress_sched_changed = false; >> > rte_eth_dev_stop(dev->port_id); >> > >> > if (dev->mtu != dev->requested_mtu diff --git a/vswitchd/bridge.c >> >b/vswitchd/bridge.c index 9113195..2c5dfd3 100644 >> >--- a/vswitchd/bridge.c >> >+++ b/vswitchd/bridge.c >> >@@ -831,6 +831,8 @@ bridge_delete_or_reconfigure_ports(struct bridge >> *br) >> > } >> > >> > iface_set_netdev_mtu(iface->cfg, iface->netdev); >> >+ netdev_set_ingress_sched(iface->netdev, >> >+ smap_get(&iface->cfg->other_config, "ingress_sched")); >> > >> > /* If the requested OpenFlow port for 'iface' changed, and it's >not >> > * already the correct port, then we might want to temporarily >> >delete >> >-- >> >2.7.4 >> > >> >_______________________________________________ >> >dev mailing list >> >[email protected] >> >https://mail.openvswitch.org/mailman/listinfo/ovs-dev _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
