From: Finn Christensen <[email protected]> The basic yet the major part of this patch is to translate the "match" to rte flow patterns. And then, we create a rte flow with a MARK action. Afterwards, all pkts matches the flow will have the mark id in the mbuf.
For any unsupported flows, such as MPLS, -1 is returned, meaning the flow offload is failed and then skipped. Signed-off-by: Finn Christensen <[email protected]> Signed-off-by: Yuanhan Liu <[email protected]> --- lib/netdev-dpdk.c | 385 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 384 insertions(+), 1 deletion(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 15fef1e..8089da8 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -58,6 +58,7 @@ #include "smap.h" #include "sset.h" #include "unaligned.h" +#include "uuid.h" #include "timeval.h" #include "unixctl.h" @@ -3398,6 +3399,388 @@ get_rte_flow_by_ufid(const ovs_u128 *ufid) } +#define MAX_RTE_FLOW_ITEMS 100 +#define MAX_RTE_FLOW_ACTIONS 100 + +static int +netdev_dpdk_add_rte_flow_offload(struct netdev *netdev, + const struct match *match, + struct nlattr *nl_actions OVS_UNUSED, + size_t actions_len, + const ovs_u128 *ufid, + struct offload_info *info) +{ + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); + const struct rte_flow_attr flow_attr = { + .group = 0, + .priority = 0, + .ingress = 1, + .egress = 0 + }; + struct flow_patterns { + struct rte_flow_item items[MAX_RTE_FLOW_ITEMS]; + uint32_t cnt; + } patterns; + struct flow_actions { + struct rte_flow_action actions[MAX_RTE_FLOW_ACTIONS]; + uint32_t cnt; + } actions; + struct rte_flow *flow; + struct rte_flow_error error; + + uint8_t *ipv4_next_proto_mask = NULL; + +#define ADD_FLOW_PATTERN(type_, spec_, mask_) do { \ + patterns.items[patterns.cnt].type = type_; \ + patterns.items[patterns.cnt].spec = spec_; \ + patterns.items[patterns.cnt].mask = mask_; \ + patterns.items[patterns.cnt].last = NULL; \ + if (patterns.cnt < MAX_RTE_FLOW_ITEMS) { \ + patterns.cnt++; \ + } else { \ + VLOG_ERR("too many rte flow patterns (> %d)", MAX_RTE_FLOW_ITEMS); \ + goto err; \ + } \ +} while (0) + +#define ADD_FLOW_ACTION(type_, conf_) do { \ + actions.actions[actions.cnt].type = type_; \ + actions.actions[actions.cnt].conf = conf_; \ + actions.cnt++; \ + ovs_assert(actions.cnt < MAX_RTE_FLOW_ACTIONS); \ +} while (0) + + patterns.cnt = 0; + actions.cnt = 0; + + /* Eth */ + struct rte_flow_item_eth eth_spec; + struct rte_flow_item_eth eth_mask; + memset(ð_mask, 0, sizeof(eth_mask)); + if (match->wc.masks.dl_src.be16[0] || + match->wc.masks.dl_src.be16[1] || + match->wc.masks.dl_src.be16[2] || + match->wc.masks.dl_dst.be16[0] || + match->wc.masks.dl_dst.be16[1] || + match->wc.masks.dl_dst.be16[2]) { + rte_memcpy(ð_spec.dst, &match->flow.dl_dst, sizeof(eth_spec.dst)); + rte_memcpy(ð_spec.src, &match->flow.dl_src, sizeof(eth_spec.src)); + eth_spec.type = match->flow.dl_type; + + rte_memcpy(ð_mask.dst, &match->wc.masks.dl_dst, sizeof(eth_mask.dst)); + rte_memcpy(ð_mask.src, &match->wc.masks.dl_src, sizeof(eth_mask.src)); + eth_mask.type = match->wc.masks.dl_type; + + ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_ETH, ð_spec, ð_mask); + } else { + /* + * If user specifies a flow (like UDP flow) without L2 patterns, + * OVS will at least set the dl_type. Normally, it's enough to + * create an eth pattern just with it. Unluckily, some Intel's + * NIC (such as XL710) doesn't support that. Below is a workaround, + * which simply matches any L2 pkts. + */ + ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_ETH, ð_spec, ð_mask); + } + + /* VLAN */ + struct rte_flow_item_vlan vlan_spec; + struct rte_flow_item_vlan vlan_mask; + memset(&vlan_mask, 0, sizeof(vlan_mask)); + if (match->wc.masks.vlans[0].tci && match->flow.vlans[0].tci) { + vlan_spec.tci = match->flow.vlans[0].tci; + vlan_mask.tci = match->wc.masks.vlans[0].tci; + + /* match any protocols */ + vlan_mask.tpid = 0; + + ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_VLAN, &vlan_spec, &vlan_mask); + } + + /* IP v4 */ + uint8_t proto = 0; + struct rte_flow_item_ipv4 ipv4_spec; + struct rte_flow_item_ipv4 ipv4_mask; + memset(&ipv4_mask, 0, sizeof(ipv4_mask)); + if ((match->flow.dl_type == ntohs(ETH_TYPE_IP)) && + (match->wc.masks.nw_src || match->wc.masks.nw_dst || + match->wc.masks.nw_tos || match->wc.masks.nw_ttl || + match->wc.masks.nw_proto)) { + ipv4_spec.hdr.type_of_service = match->flow.nw_tos; + ipv4_spec.hdr.time_to_live = match->flow.nw_tos; + ipv4_spec.hdr.next_proto_id = match->flow.nw_proto; + ipv4_spec.hdr.src_addr = match->flow.nw_src; + ipv4_spec.hdr.dst_addr = match->flow.nw_dst; + + ipv4_mask.hdr.type_of_service = match->wc.masks.nw_tos; + ipv4_mask.hdr.time_to_live = match->wc.masks.nw_tos; + ipv4_mask.hdr.next_proto_id = match->wc.masks.nw_proto; + ipv4_mask.hdr.src_addr = match->wc.masks.nw_src; + ipv4_mask.hdr.dst_addr = match->wc.masks.nw_dst; + + ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_IPV4, &ipv4_spec, &ipv4_mask); + + /* Save proto for L4 protocol setup */ + proto = ipv4_spec.hdr.next_proto_id & ipv4_mask.hdr.next_proto_id; + + /* Remember proto mask address for later modification */ + ipv4_next_proto_mask = &ipv4_mask.hdr.next_proto_id; + } + + if (proto != IPPROTO_ICMP && proto != IPPROTO_UDP && + proto != IPPROTO_SCTP && proto != IPPROTO_TCP && + (match->wc.masks.tp_src || + match->wc.masks.tp_dst || + match->wc.masks.tcp_flags)) { + VLOG_INFO("L4 Protocol (%u) not supported", proto); + goto err; + } + + struct rte_flow_item_udp udp_spec; + struct rte_flow_item_udp udp_mask; + memset(&udp_mask, 0, sizeof(udp_mask)); + if ((proto == IPPROTO_UDP) && + (match->wc.masks.tp_src || match->wc.masks.tp_dst)) { + udp_spec.hdr.src_port = match->flow.tp_src; + udp_spec.hdr.dst_port = match->flow.tp_dst; + + udp_mask.hdr.src_port = match->wc.masks.tp_src; + udp_mask.hdr.dst_port = match->wc.masks.tp_dst; + + ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_UDP, &udp_spec, &udp_mask); + + /* proto == UDP and ITEM_TYPE_UDP, thus no need for proto match */ + if (ipv4_next_proto_mask) { + *ipv4_next_proto_mask = 0; + } + } + + struct rte_flow_item_sctp sctp_spec; + struct rte_flow_item_sctp sctp_mask; + memset(&sctp_mask, 0, sizeof(sctp_mask)); + if ((proto == IPPROTO_SCTP) && + (match->wc.masks.tp_src || match->wc.masks.tp_dst)) { + sctp_spec.hdr.src_port = match->flow.tp_src; + sctp_spec.hdr.dst_port = match->flow.tp_dst; + + sctp_mask.hdr.src_port = match->wc.masks.tp_src; + sctp_mask.hdr.dst_port = match->wc.masks.tp_dst; + + ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_SCTP, &sctp_spec, &sctp_mask); + + /* proto == SCTP and ITEM_TYPE_SCTP, thus no need for proto match */ + if (ipv4_next_proto_mask) { + *ipv4_next_proto_mask = 0; + } + } + + struct rte_flow_item_icmp icmp_spec; + struct rte_flow_item_icmp icmp_mask; + memset(&icmp_mask, 0, sizeof(icmp_mask)); + if ((proto == IPPROTO_ICMP) && + (match->wc.masks.tp_src || match->wc.masks.tp_dst)) { + icmp_spec.hdr.icmp_type = (uint8_t)ntohs(match->flow.tp_src); + icmp_spec.hdr.icmp_code = (uint8_t)ntohs(match->flow.tp_dst); + + icmp_mask.hdr.icmp_type = (uint8_t)ntohs(match->wc.masks.tp_src); + icmp_mask.hdr.icmp_code = (uint8_t)ntohs(match->wc.masks.tp_dst); + + ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_ICMP, &icmp_spec, &icmp_mask); + + /* proto == ICMP and ITEM_TYPE_ICMP, thus no need for proto match */ + if (ipv4_next_proto_mask) { + *ipv4_next_proto_mask = 0; + } + } + + struct rte_flow_item_tcp tcp_spec; + struct rte_flow_item_tcp tcp_mask; + memset(&tcp_mask, 0, sizeof(tcp_mask)); + if ((proto == IPPROTO_TCP) && + (match->wc.masks.tp_src || + match->wc.masks.tp_dst || + match->wc.masks.tcp_flags)) { + tcp_spec.hdr.src_port = match->flow.tp_src; + tcp_spec.hdr.dst_port = match->flow.tp_dst; + tcp_spec.hdr.data_off = ntohs(match->flow.tcp_flags) >> 8; + tcp_spec.hdr.tcp_flags = ntohs(match->flow.tcp_flags) & 0xff; + + tcp_mask.hdr.src_port = match->wc.masks.tp_src; + tcp_mask.hdr.dst_port = match->wc.masks.tp_dst; + tcp_mask.hdr.data_off = ntohs(match->wc.masks.tcp_flags) >> 8; + tcp_mask.hdr.tcp_flags = ntohs(match->wc.masks.tcp_flags) & 0xff; + + ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_TCP, &tcp_spec, &tcp_mask); + + /* proto == TCP and ITEM_TYPE_TCP, thus no need for proto match */ + if (ipv4_next_proto_mask) { + *ipv4_next_proto_mask = 0; + } + } + ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_END, NULL, NULL); + + struct rte_flow_action_mark mark; + if (actions_len) { + mark.id = info->flow_mark; + ADD_FLOW_ACTION(RTE_FLOW_ACTION_TYPE_MARK, &mark); + } else { + ADD_FLOW_ACTION(RTE_FLOW_ACTION_TYPE_DROP, NULL); + VLOG_INFO("no action given; drop pkts in hardware\n"); + } + ADD_FLOW_ACTION(RTE_FLOW_ACTION_TYPE_END, NULL); + + flow = rte_flow_create(dev->port_id, &flow_attr, patterns.items, + actions.actions, &error); + if (!flow) { + VLOG_ERR("rte flow creat error: %u : message : %s\n", + error.type, error.message); + goto err; + } + add_ufid_dpdk_flow_mapping(ufid, flow); + VLOG_INFO("installed flow %p by ufid "UUID_FMT"\n", + flow, UUID_ARGS((struct uuid *)ufid)); + + return 0; + +err: + return -1; +} + +/* + * Validate for later rte flow offload creation. If any unsupported + * flow are specified, return -1. + */ +static int +netdev_dpdk_validate_flow(const struct match *match) +{ + struct match match_zero_wc; + + /* Create a wc-zeroed version of flow */ + match_init(&match_zero_wc, &match->flow, &match->wc); + +#define CHECK_NONZERO(addr, size) do { \ + uint8_t *padr = (uint8_t *)(addr); \ + int i; \ + for (i = 0; i < (size); i++) { \ + if (padr[i] != 0) { \ + goto err; \ + } \ + } \ +} while (0) + +#define CHECK_NONZEROSIMPLE(var) do { \ + if ((var) != 0) { \ + goto err; \ + } \ +} while (0) + + CHECK_NONZERO(&match_zero_wc.flow.tunnel, sizeof(match_zero_wc.flow.tunnel)); + CHECK_NONZEROSIMPLE(match->wc.masks.metadata); + CHECK_NONZEROSIMPLE(match->wc.masks.skb_priority); + CHECK_NONZEROSIMPLE(match->wc.masks.pkt_mark); + CHECK_NONZEROSIMPLE(match->wc.masks.dp_hash); + + /* recirc id must be zero */ + CHECK_NONZEROSIMPLE(match_zero_wc.flow.recirc_id); + + CHECK_NONZEROSIMPLE(match->wc.masks.ct_state); + CHECK_NONZEROSIMPLE(match->wc.masks.ct_zone); + CHECK_NONZEROSIMPLE(match->wc.masks.ct_mark); + CHECK_NONZEROSIMPLE(match->wc.masks.ct_label.u64.hi); + CHECK_NONZEROSIMPLE(match->wc.masks.ct_label.u64.lo); + CHECK_NONZEROSIMPLE(match->wc.masks.ct_nw_proto); + CHECK_NONZEROSIMPLE(match->wc.masks.ct_tp_src); + CHECK_NONZEROSIMPLE(match->wc.masks.ct_tp_dst); + CHECK_NONZEROSIMPLE(match->wc.masks.conj_id); + CHECK_NONZEROSIMPLE(match->wc.masks.actset_output); + + /* unsupported L2 */ + CHECK_NONZERO(&match->wc.masks.mpls_lse, + sizeof(match_zero_wc.flow.mpls_lse) / sizeof(ovs_be32)); + + /* unsupported L3 */ + CHECK_NONZERO(&match->wc.masks.ipv6_src, sizeof(struct in6_addr)); + CHECK_NONZERO(&match->wc.masks.ipv6_dst, sizeof(struct in6_addr)); + CHECK_NONZEROSIMPLE(match->wc.masks.ipv6_label); + CHECK_NONZERO(&match->wc.masks.nd_target, sizeof(struct in6_addr)); + CHECK_NONZERO(&match->wc.masks.arp_sha, sizeof(struct eth_addr)); + CHECK_NONZERO(&match->wc.masks.arp_tha, sizeof(struct eth_addr)); + + /* If fragmented, then don't HW accelerate - for now */ + CHECK_NONZEROSIMPLE(match_zero_wc.flow.nw_frag); + + /* unsupported L4 */ + CHECK_NONZEROSIMPLE(match->wc.masks.igmp_group_ip4); + + return 0; + +err: + VLOG_INFO("Cannot HW accelerate this flow"); + return -1; +} + +static int +netdev_dpdk_destroy_rte_flow(struct netdev_dpdk *dev, + const ovs_u128 *ufid, + struct rte_flow *rte_flow) +{ + struct rte_flow_error error; + int ret; + + ret = rte_flow_destroy(dev->port_id, rte_flow, &error); + if (ret == 0) { + del_ufid_dpdk_flow_mapping(ufid); + VLOG_INFO("removed rte flow %p associated with ufid " UUID_FMT "\n", + rte_flow, UUID_ARGS((struct uuid *)ufid)); + } else { + VLOG_ERR("rte flow destroy error: %u : message : %s\n", + error.type, error.message); + } + + return ret; +} + +static int +netdev_dpdk_flow_put(struct netdev *netdev, struct match *match, + struct nlattr *actions, size_t actions_len, + const ovs_u128 *ufid, struct offload_info *info, + struct dpif_flow_stats *stats OVS_UNUSED) +{ + struct rte_flow *rte_flow; + int ret; + + /* + * If an old rte_flow exists, it means it's a flow modification. + * Here destroy the old rte flow first before adding a new one. + */ + rte_flow = get_rte_flow_by_ufid(ufid); + if (rte_flow) { + ret = netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev), + ufid, rte_flow); + if (ret < 0) + return ret; + } + + ret = netdev_dpdk_validate_flow(match); + if (ret < 0) { + return ret; + } + + return netdev_dpdk_add_rte_flow_offload(netdev, match, actions, + actions_len, ufid, info); +} + +#define DPDK_FLOW_OFFLOAD_API \ + NULL, \ + NULL, \ + NULL, \ + NULL, \ + netdev_dpdk_flow_put, \ + NULL, \ + NULL, \ + NULL + + #define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT, \ SET_CONFIG, SET_TX_MULTIQ, SEND, \ GET_CARRIER, GET_STATS, \ @@ -3470,7 +3853,7 @@ get_rte_flow_by_ufid(const ovs_u128 *ufid) RXQ_RECV, \ NULL, /* rx_wait */ \ NULL, /* rxq_drain */ \ - NO_OFFLOAD_API \ + DPDK_FLOW_OFFLOAD_API \ } static const struct netdev_class dpdk_class = -- 2.7.4 _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
