As mentioned earlier in the cover letter, we also have a similar implementation for the flow translation. I feel it is good to make this translation a bit more modular, the reason being that it is then easy to extend and add more protocols in the future.
Please find our similar implementation below. We started with a single-function implementation, as in this patch series, and eventually moved to this approach. I can share the code as a patch if you are interested. Again, the approach below works well for our use case; feel free to discard it if it doesn't make sense or adds too much overhead. The flow and action translations are invoked from dpdkhw_rte_flow_xlate() and dpdkhw_rte_action_xlate().
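For context, the call sites look roughly like this. This is only a sketch, not the exact code in our tree; names such as dev->port_id, nl_actions and nl_actions_len are placeholders here, and the rte_flow_create() port_id argument width differs between DPDK releases:

    struct rte_flow_attr attr;
    struct rte_flow_item items[MAX_DPDKHW_RTE_FLOW_SIZE];
    struct rte_flow_action action_batch[MAX_DPDKHW_RTE_FLOW_SIZE];
    struct rte_flow_error error;
    struct rte_flow *flow = NULL;
    int err;

    memset(&attr, 0, sizeof attr);
    memset(items, 0, sizeof items);
    memset(action_batch, 0, sizeof action_batch);

    /* Build the pattern and action arrays from the OVS match/actions. */
    err = dpdkhw_rte_flow_xlate(match, &attr, items, ofld_info);
    err |= dpdkhw_rte_action_xlate(action_batch, nl_actions, nl_actions_len,
                                   ofld_info);
    if (!err) {
        /* Hand the translated rule to the PMD. */
        flow = rte_flow_create(dev->port_id, &attr, items, action_batch,
                               &error);
    }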
struct flow_xlate_dic {
    enum rte_flow_item_type rte_flow_type;
    /*
     * Flow xlate function to translate a specific header match into
     * rte_flow format. For each rte_flow_item_type, a corresponding
     * xlate function must be defined in this structure. Returns 0 if the
     * flow is translated successfully, an error code otherwise.
     */
    int (*flow_xlate)(struct match *match, struct rte_flow_item *hw_flow_item,
                      const void *md);
};

static int do_inport_flow_xlate(struct match *match,
                                struct rte_flow_item *hw_flow_item,
                                const void *md);
static int do_l2_flow_xlate(struct match *match,
                            struct rte_flow_item *hw_flow_item,
                            const void *md);
static int do_l3_flow_xlate(struct match *match,
                            struct rte_flow_item *hw_flow_item,
                            const void *md);
static int do_l4_flow_xlate(struct match *match,
                            struct rte_flow_item *hw_flow_item,
                            const void *md);
static int do_end_flow_xlate(struct match *match,
                             struct rte_flow_item *hw_flow_item,
                             const void *md);

struct flow_xlate_dic PORT_FLOW_XLATE = {
    RTE_FLOW_ITEM_TYPE_PORT,
    do_inport_flow_xlate
};

struct flow_xlate_dic L2_FLOW_XLATE = {
    RTE_FLOW_ITEM_TYPE_ETH,
    do_l2_flow_xlate
};

struct flow_xlate_dic L3_FLOW_XLATE = {
    RTE_FLOW_ITEM_TYPE_VOID, /* L3 flow item can be different */
    do_l3_flow_xlate
};

struct flow_xlate_dic L4_FLOW_XLATE = {
    RTE_FLOW_ITEM_TYPE_VOID, /* Can be UDP/TCP */
    do_l4_flow_xlate
};

struct flow_xlate_dic END_FLOW_XLATE = {
    RTE_FLOW_ITEM_TYPE_END,
    do_end_flow_xlate
};

/*
 * Convert the mac address to little endian for flow programming.
 */
static void
ntoh_mac(struct eth_addr *src, struct eth_addr *dst)
{
    int i;

    for (i = 0; i < 6; i++) {
        dst->ea[6 - i - 1] = src->ea[i];
    }
}

static int
do_inport_flow_xlate(struct match *match, struct rte_flow_item *hw_flow_item,
                     const void *md)
{
    struct flow *flow;
    struct rte_flow_item_port *port_flow_item;
    struct rte_flow_item_port *port_flow_item_mask;
    struct netdev *netdev;
    uint16_t hw_portno;
    struct offload_info *ofld_info = (struct offload_info *)md;

    flow = &match->flow;
    port_flow_item = xzalloc(sizeof *port_flow_item);
    port_flow_item_mask = xzalloc(sizeof *port_flow_item_mask);
    if (!port_flow_item) {
        VLOG_ERR("Failed to allocate the memory for hardware flow item");
        return ENOMEM;
    }
    netdev = get_hw_netdev(flow->in_port.odp_port, ofld_info->port_hmap_obj);
    if (!netdev) {
        VLOG_WARN("Inport %u is not a valid hardware accelerated port.",
                  odp_to_u32(flow->in_port.odp_port));
        return EOPNOTSUPP;
    }
    /* The inport should be the hardware port number, not the ovs portno. */
    hw_portno = netdev_get_hw_portno(netdev);
    port_flow_item->index = hw_portno;
    hw_flow_item->type = PORT_FLOW_XLATE.rte_flow_type;
    hw_flow_item->spec = port_flow_item;
    hw_flow_item->last = NULL;
    /* Set the mask for the rte port flow. */
    port_flow_item_mask->index = 0xFFFFFFFF;
    hw_flow_item->mask = port_flow_item_mask;
    return 0;
}

static int
do_l2_flow_xlate(struct match *match, struct rte_flow_item *hw_flow_item,
                 const void *md)
{
    struct flow *flow, *mask;
    struct rte_flow_item_eth *eth_flow_item;
    struct rte_flow_item_eth *eth_flow_mask;
    struct eth_addr mac_le;

    flow = &match->flow;
    mask = &match->wc.masks;
    bool is_l2_zero = eth_addr_is_zero(mask->dl_dst);
    is_l2_zero &= eth_addr_is_zero(mask->dl_src);
    if (is_l2_zero) {
        VLOG_INFO("Cannot install flow with zero eth addr");
        return EOPNOTSUPP;
    }

    eth_flow_item = xzalloc(sizeof *eth_flow_item);
    eth_flow_mask = xzalloc(sizeof *eth_flow_mask);
    if (!eth_flow_item || !eth_flow_mask) {
        VLOG_ERR("Failed to allocate the memory for flow item");
        return ENOMEM;
    }
    ntoh_mac(&flow->dl_dst, &mac_le);
    memcpy(&eth_flow_item->dst, &mac_le, sizeof(eth_flow_item->dst));
    ntoh_mac(&flow->dl_src, &mac_le);
    memcpy(&eth_flow_item->src, &mac_le, sizeof(eth_flow_item->src));
    eth_flow_item->type = ntohs(flow->dl_type);

    /* Copy the address mask too. */
    ntoh_mac(&mask->dl_dst, &mac_le);
    memcpy(&eth_flow_mask->dst, &mac_le, sizeof(eth_flow_mask->dst));
    ntoh_mac(&mask->dl_src, &mac_le);
    memcpy(&eth_flow_mask->src, &mac_le, sizeof(eth_flow_mask->src));
    eth_flow_mask->type = ntohs(mask->dl_type);

    hw_flow_item->type = L2_FLOW_XLATE.rte_flow_type;
    hw_flow_item->spec = eth_flow_item;
    hw_flow_item->last = NULL;
    hw_flow_item->mask = eth_flow_mask;
    return 0;
}

static int
do_l3_flow_xlate(struct match *match, struct rte_flow_item *hw_flow_item,
                 const void *md)
{
    struct flow *flow, *mask;

    /* Currently supports only IPv4. */
    flow = &match->flow;
    mask = &match->wc.masks;
    if (flow->dl_type == htons(ETH_TYPE_IP)) {
        struct rte_flow_item_ipv4 *ipv4_flow_item, *ipv4_flow_mask;

        ipv4_flow_item = xzalloc(sizeof *ipv4_flow_item);
        ipv4_flow_mask = xzalloc(sizeof *ipv4_flow_mask);

        /* Xlate the ip flow entries. */
        ipv4_flow_item->hdr.src_addr = ntohl(flow->nw_src);
        ipv4_flow_item->hdr.dst_addr = ntohl(flow->nw_dst);
        ipv4_flow_item->hdr.next_proto_id = flow->nw_proto;
        ipv4_flow_item->hdr.type_of_service = flow->nw_tos;
        ipv4_flow_item->hdr.version_ihl = 4;
        hw_flow_item->type = RTE_FLOW_ITEM_TYPE_IPV4;
        hw_flow_item->spec = ipv4_flow_item;
        hw_flow_item->last = NULL;

        /* Xlate ipv4 mask entries. */
        ipv4_flow_mask->hdr.src_addr = ntohl(mask->nw_src);
        ipv4_flow_mask->hdr.dst_addr = ntohl(mask->nw_dst);
        ipv4_flow_mask->hdr.next_proto_id = mask->nw_proto;
        ipv4_flow_mask->hdr.type_of_service = mask->nw_tos;
        ipv4_flow_mask->hdr.version_ihl = 0xFF;
        hw_flow_item->mask = ipv4_flow_mask;
    } else {
        VLOG_INFO("Not an IP flow, dl_type 0x%04x", ntohs(flow->dl_type));
        return EOPNOTSUPP;
    }
    return 0;
}

static int
do_l4_flow_xlate(struct match *match, struct rte_flow_item *hw_flow_item,
                 const void *md)
{
    struct flow *flow, *mask;

    /* Currently supports only UDP/TCP. */
    flow = &match->flow;
    mask = &match->wc.masks;
    if (flow->nw_proto == IPPROTO_TCP) {
        struct rte_flow_item_tcp *tcp_flow_item, *tcp_flow_mask;

        tcp_flow_item = xzalloc(sizeof *tcp_flow_item);
        tcp_flow_mask = xzalloc(sizeof *tcp_flow_mask);

        /* Xlate tcp flow entries. */
        tcp_flow_item->hdr.src_port = ntohs(flow->tp_src);
        tcp_flow_item->hdr.dst_port = ntohs(flow->tp_dst);
        hw_flow_item->type = RTE_FLOW_ITEM_TYPE_TCP;
        hw_flow_item->spec = tcp_flow_item;
        hw_flow_item->last = NULL;

        /* Xlate tcp flow mask entries. */
        tcp_flow_mask->hdr.src_port = ntohs(mask->tp_src);
        tcp_flow_mask->hdr.dst_port = ntohs(mask->tp_dst);
        hw_flow_item->mask = tcp_flow_mask;
    } else if (flow->nw_proto == IPPROTO_UDP) {
        struct rte_flow_item_udp *udp_flow_item, *udp_flow_mask;

        udp_flow_item = xzalloc(sizeof *udp_flow_item);
        udp_flow_mask = xzalloc(sizeof *udp_flow_mask);

        /* Xlate UDP flow entries. */
        udp_flow_item->hdr.src_port = ntohs(flow->tp_src);
        udp_flow_item->hdr.dst_port = ntohs(flow->tp_dst);
        hw_flow_item->type = RTE_FLOW_ITEM_TYPE_UDP;
        hw_flow_item->spec = udp_flow_item;
        hw_flow_item->last = NULL;

        /* Xlate UDP mask entries. */
        udp_flow_mask->hdr.src_port = ntohs(mask->tp_src);
        udp_flow_mask->hdr.dst_port = ntohs(mask->tp_dst);
        hw_flow_item->mask = udp_flow_mask;
    } else {
        VLOG_INFO("Not a TCP/UDP flow");
        return ENOTSUP;
    }
    return 0;
}

static int
do_end_flow_xlate(struct match *match, struct rte_flow_item *hw_flow_item,
                  const void *md)
{
    hw_flow_item->type = RTE_FLOW_ITEM_TYPE_END;
    return 0;
}

static int
do_flow_xlate_helper(struct flow_xlate_dic xlate_dic_entry,
                     struct match *match, struct rte_flow_item *hw_flow_item,
                     int *index, const void *md, int max_flow_entry)
{
    int ret = 0;

    if (*index < max_flow_entry) {
        ret = xlate_dic_entry.flow_xlate(match, hw_flow_item, md);
        /* Only advance past slots that were actually filled. */
        if (!ret) {
            (*index)++;
        }
    }
    return ret;
}

#define DO_FLOW_XLATE(XLATE_DIC_ENTRY, MATCH_ENTRY, FLOW_ITEM, IDX_PTR, MD_PTR,\
                      MAX_FLOW) \
    do_flow_xlate_helper(XLATE_DIC_ENTRY, MATCH_ENTRY, FLOW_ITEM, IDX_PTR, \
                         MD_PTR, MAX_FLOW)

int
dpdkhw_rte_flow_xlate(struct match *match, struct rte_flow_attr *hw_flow_attr,
                      struct rte_flow_item hw_flow_batch[],
                      const struct offload_info *ofld_info)
{
    int i = 0;
    /* Keep the last one for the END FLOW type. */
    int max_flow_entry = MAX_DPDKHW_RTE_FLOW_SIZE - 1;
    int ret = 0;

    hw_flow_attr->group = 0;
    hw_flow_attr->priority = 0;
    hw_flow_attr->ingress = 1; /* Supports only ingress flow rules now. */

    /*
     * The supported rte flow entries are populated below. Each header has
     * its own xlate function to generate the corresponding rte_flow entry.
     * The translate functions operate independently on each header in the
     * stack, so a flow can still end up with match entries for, say, the
     * port and IP addresses even if the MAC translation fails.
     */
    ret |= DO_FLOW_XLATE(PORT_FLOW_XLATE, match, &hw_flow_batch[i], &i,
                         ofld_info, max_flow_entry);
    ret |= DO_FLOW_XLATE(L2_FLOW_XLATE, match, &hw_flow_batch[i], &i,
                         NULL, max_flow_entry);
    ret |= DO_FLOW_XLATE(L3_FLOW_XLATE, match, &hw_flow_batch[i], &i,
                         NULL, max_flow_entry);
    ret |= DO_FLOW_XLATE(L4_FLOW_XLATE, match, &hw_flow_batch[i], &i,
                         NULL, max_flow_entry);

    /* The END translation must always be the last one.
     * DO NOT ADD ANY TRANSLATE FUNCTION AFTER THE END XLATE. */
    DO_FLOW_XLATE(END_FLOW_XLATE, match, &hw_flow_batch[i], &i, NULL,
                  MAX_DPDKHW_RTE_FLOW_SIZE);
    return ret;
}

int
dpdkhw_rte_action_xlate(struct rte_flow_action hw_action_batch[],
                        const struct nlattr *actions, size_t actions_len,
                        const struct offload_info *ofld_info)
{
    const struct nlattr *a;
    unsigned int left;
    int i = 0;
    int max_action_entry = MAX_DPDKHW_RTE_FLOW_SIZE - 1;

    if (!actions_len || !actions) {
        VLOG_WARN_RL(&rl, "No actions to offload, not installing flows");
        return EPERM;
    }

    NL_ATTR_FOR_EACH_UNSAFE (a, left, actions, actions_len) {
        int type = nl_attr_type(a);

        if (i >= max_action_entry) {
            VLOG_ERR("Max action entry limit reached, cannot add more "
                     "actions");
            return EPERM;
        }
        switch ((enum ovs_action_attr) type) {
        case OVS_ACTION_ATTR_OUTPUT: {
            /*
             * The current POC only supports the output action to a port.
             * TODO :: rte_flow supports output only to a VF. We need a
             * port-id based output action; let's use the VF output action
             * with a port_id for now.
             */
            struct rte_flow_action_vf *rte_action_vf =
                xzalloc(sizeof *rte_action_vf);
            odp_port_t out_port = nl_attr_get_odp_port(a);

            /* The output port should be the hardware port number. */
            struct netdev *netdev = get_hw_netdev(out_port,
                                                  ofld_info->port_hmap_obj);
            if (!netdev) {
                VLOG_WARN("Cannot offload a flow with non accelerated output"
                          " port %u", odp_to_u32(out_port));
                return EPERM;
            }
            uint16_t hw_portno = netdev_get_hw_portno(netdev);
            rte_action_vf->original = 1;
            rte_action_vf->id = hw_portno;
            hw_action_batch[i].type = RTE_FLOW_ACTION_TYPE_VF;
            hw_action_batch[i].conf = rte_action_vf;
            i++;
            break;
        }
        case OVS_ACTION_ATTR_TUNNEL_PUSH:
        case OVS_ACTION_ATTR_TUNNEL_POP:
        case OVS_ACTION_ATTR_SET:
        case OVS_ACTION_ATTR_PUSH_VLAN:
        case OVS_ACTION_ATTR_POP_VLAN:
        case OVS_ACTION_ATTR_PUSH_MPLS:
        case OVS_ACTION_ATTR_POP_MPLS:
        case OVS_ACTION_ATTR_SET_MASKED:
        case OVS_ACTION_ATTR_SAMPLE:
        case OVS_ACTION_ATTR_HASH:
        case OVS_ACTION_ATTR_UNSPEC:
        case OVS_ACTION_ATTR_TRUNC:
        case __OVS_ACTION_ATTR_MAX:
        case OVS_ACTION_ATTR_USERSPACE:
        case OVS_ACTION_ATTR_RECIRC:
        case OVS_ACTION_ATTR_CT:
            VLOG_INFO_RL(&rl, "TODO actions");
            OVS_NOT_REACHED();
            break;
        default:
            VLOG_INFO_RL(&rl, "Unsupported action to offload");
            break;
        }
    }
    /* Add the end action as the last action. */
    hw_action_batch[i].type = RTE_FLOW_ACTION_TYPE_END;
    hw_action_batch[i].conf = NULL;
    return 0;
}
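To illustrate the extensibility point, adding a new protocol is just one more dictionary entry plus one DO_FLOW_XLATE() call. For example, a rough (untested) VLAN sketch; it assumes the vlans[0].tci field of struct flow and the tci member of struct rte_flow_item_vlan, both of which vary a bit across OVS/DPDK versions:

static int
do_vlan_flow_xlate(struct match *match, struct rte_flow_item *hw_flow_item,
                   const void *md)
{
    struct flow *flow = &match->flow;
    struct flow *mask = &match->wc.masks;
    struct rte_flow_item_vlan *vlan_flow_item, *vlan_flow_mask;

    if (!mask->vlans[0].tci) {
        /* Nothing to match at the VLAN layer; skip this pattern. */
        return EOPNOTSUPP;
    }

    vlan_flow_item = xzalloc(sizeof *vlan_flow_item);
    vlan_flow_mask = xzalloc(sizeof *vlan_flow_mask);

    /* Follow the same byte-order convention as the other xlate functions. */
    vlan_flow_item->tci = ntohs(flow->vlans[0].tci);
    vlan_flow_mask->tci = ntohs(mask->vlans[0].tci);

    hw_flow_item->type = RTE_FLOW_ITEM_TYPE_VLAN;
    hw_flow_item->spec = vlan_flow_item;
    hw_flow_item->last = NULL;
    hw_flow_item->mask = vlan_flow_mask;
    return 0;
}

struct flow_xlate_dic VLAN_FLOW_XLATE = {
    RTE_FLOW_ITEM_TYPE_VLAN,
    do_vlan_flow_xlate
};

and, in dpdkhw_rte_flow_xlate(), one extra call between the L2 and L3 ones:

    ret |= DO_FLOW_XLATE(VLAN_FLOW_XLATE, match, &hw_flow_batch[i], &i,
                         NULL, max_flow_entry);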
Regards
_Sugesh

> -----Original Message-----
> From: ovs-dev-boun...@openvswitch.org [mailto:ovs-dev-
> boun...@openvswitch.org] On Behalf Of Yuanhan Liu
> Sent: Tuesday, September 5, 2017 10:23 AM
> To: d...@openvswitch.org
> Subject: [ovs-dev] [PATCH v2 4/8] netdev-dpdk: implement flow put with rte
> flow
>
> From: Finn Christensen <f...@napatech.com>
>
> The basic yet the major part of this patch is to translate the "match"
> to rte flow patterns. And then, we create a rte flow with a MARK action.
> Afterwards, all pkts matches the flow will have the mark id in the mbuf.
>
> For any unsupported flows, such as MPLS, -1 is returned, meaning the flow
> offload is failed and then skipped.
>
> Co-authored-by: Yuanhan Liu <y...@fridaylinux.org>
> Signed-off-by: Finn Christensen <f...@napatech.com>
> Signed-off-by: Yuanhan Liu <y...@fridaylinux.org>
> ---
>
> v2: - convert some macros to functions
>     - do not hardcode the max number of flow/action
>     - fix L2 patterns for Intel nic
>     - add comments for not implemented offload methods
> ---
>  lib/netdev-dpdk.c | 421
> +++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 420 insertions(+), 1 deletion(-)
>
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 46f9885..37b0f99
> 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
[Snip]
> +    if (!flow) {
> +        VLOG_ERR("rte flow creat error: %u : message : %s\n",
> +                 error.type, error.message);
> +        ret = -1;
> +        goto out;
> +    }
> +    add_ufid_dpdk_flow_mapping(ufid, flow);
> +    VLOG_INFO("installed flow %p by ufid "UUID_FMT"\n",
> +              flow, UUID_ARGS((struct uuid *)ufid));
> +
> +out:
> +    free(patterns.items);
> +    free(actions.actions);
> +    return ret;
> +}
> +
> +/*
> + * Validate for later rte flow offload creation. If any unsupported
> + * flow are specified, return -1.
> + */
[Sugesh] I feel this is very hardware-centric. There is a chance that the hardware can support IPv6 or other packet fields. This should be based on which flow fields the hardware can support.
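[Sugesh] Something along these lines is what I have in mind; purely illustrative, the hw_offload_caps bits and the caps argument are hypothetical and not part of this patch:

enum hw_offload_caps {
    HW_OFFLOAD_CAP_IPV4 = 1 << 0,
    HW_OFFLOAD_CAP_IPV6 = 1 << 1,
    HW_OFFLOAD_CAP_TCP  = 1 << 2,
    HW_OFFLOAD_CAP_UDP  = 1 << 3,
    /* ... per-field bits reported by the device, e.g. at init_flow_api
     * time. */
};

static int
netdev_dpdk_validate_flow(const struct match *match, uint32_t caps)
{
    /* Reject IPv6 matches only when the device cannot offload them,
     * instead of unconditionally. */
    if (!(caps & HW_OFFLOAD_CAP_IPV6)
        && (!is_all_zeros(&match->wc.masks.ipv6_src,
                          sizeof match->wc.masks.ipv6_src)
            || !is_all_zeros(&match->wc.masks.ipv6_dst,
                             sizeof match->wc.masks.ipv6_dst))) {
        return -1;
    }
    /* ... similar capability-gated checks for the other fields ... */
    return 0;
}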
> +static int
> +netdev_dpdk_validate_flow(const struct match *match) {
> +    struct match match_zero_wc;
> +
> +    /* Create a wc-zeroed version of flow */
> +    match_init(&match_zero_wc, &match->flow, &match->wc);
> +
> +#define CHECK_NONZERO_BYTES(addr, size) do { \
> +    uint8_t *padr = (uint8_t *)(addr); \
> +    int i; \
> +    for (i = 0; i < (size); i++) { \
> +        if (padr[i] != 0) { \
> +            goto err; \
> +        } \
> +    } \
> +} while (0)
> +
> +#define CHECK_NONZERO(var) do { \
> +    if ((var) != 0) { \
> +        goto err; \
> +    } \
> +} while (0)
> +
> +    CHECK_NONZERO_BYTES(&match_zero_wc.flow.tunnel,
> +                        sizeof(match_zero_wc.flow.tunnel));
> +    CHECK_NONZERO(match->wc.masks.metadata);
> +    CHECK_NONZERO(match->wc.masks.skb_priority);
> +    CHECK_NONZERO(match->wc.masks.pkt_mark);
> +    CHECK_NONZERO(match->wc.masks.dp_hash);
> +
> +    /* recirc id must be zero */
> +    CHECK_NONZERO(match_zero_wc.flow.recirc_id);
> +
> +    CHECK_NONZERO(match->wc.masks.ct_state);
> +    CHECK_NONZERO(match->wc.masks.ct_zone);
> +    CHECK_NONZERO(match->wc.masks.ct_mark);
> +    CHECK_NONZERO(match->wc.masks.ct_label.u64.hi);
> +    CHECK_NONZERO(match->wc.masks.ct_label.u64.lo);
> +    CHECK_NONZERO(match->wc.masks.ct_nw_proto);
> +    CHECK_NONZERO(match->wc.masks.ct_tp_src);
> +    CHECK_NONZERO(match->wc.masks.ct_tp_dst);
> +    CHECK_NONZERO(match->wc.masks.conj_id);
> +    CHECK_NONZERO(match->wc.masks.actset_output);
> +
> +    /* unsupported L2 */
> +    CHECK_NONZERO_BYTES(&match->wc.masks.mpls_lse,
> +                        sizeof(match_zero_wc.flow.mpls_lse) /
> +                        sizeof(ovs_be32));
> +
> +    /* unsupported L3 */
> +    CHECK_NONZERO_BYTES(&match->wc.masks.ipv6_src, sizeof(struct
> in6_addr));
> +    CHECK_NONZERO_BYTES(&match->wc.masks.ipv6_dst, sizeof(struct
> in6_addr));
> +    CHECK_NONZERO(match->wc.masks.ipv6_label);
> +    CHECK_NONZERO_BYTES(&match->wc.masks.nd_target, sizeof(struct
> in6_addr));
> +    CHECK_NONZERO_BYTES(&match->wc.masks.arp_sha, sizeof(struct
> eth_addr));
> +    CHECK_NONZERO_BYTES(&match->wc.masks.arp_tha, sizeof(struct
> + eth_addr));
> +
> +    /* If fragmented, then don't HW accelerate - for now */
> +    CHECK_NONZERO(match_zero_wc.flow.nw_frag);
> +
> +    /* unsupported L4 */
> +    CHECK_NONZERO(match->wc.masks.igmp_group_ip4);
> +
> +    return 0;
> +
> +err:
> +    VLOG_INFO("Cannot HW accelerate this flow");
> +    return -1;
> +}
> +
> +static int
> +netdev_dpdk_destroy_rte_flow(struct netdev_dpdk *dev,
> +                             const ovs_u128 *ufid,
> +                             struct rte_flow *rte_flow) {
> +    struct rte_flow_error error;
> +    int ret;
> +
> +    ret = rte_flow_destroy(dev->port_id, rte_flow, &error);
> +    if (ret == 0) {
> +        del_ufid_dpdk_flow_mapping(ufid);
> +        VLOG_INFO("removed rte flow %p associated with ufid " UUID_FMT
> "\n",
> +                  rte_flow, UUID_ARGS((struct uuid *)ufid));
> +    } else {
> +        VLOG_ERR("rte flow destroy error: %u : message : %s\n",
> +                 error.type, error.message);
> +    }
> +
> +    return ret;
> +}
> +
> +static int
> +netdev_dpdk_flow_put(struct netdev *netdev, struct match *match,
> +                     struct nlattr *actions, size_t actions_len,
> +                     const ovs_u128 *ufid, struct offload_info *info,
> +                     struct dpif_flow_stats *stats OVS_UNUSED) {
> +    struct rte_flow *rte_flow;
> +    int ret;
> +
> +    /*
> +     * If an old rte_flow exists, it means it's a flow modification.
> +     * Here destroy the old rte flow first before adding a new one.
> +     */
> +    rte_flow = get_rte_flow_by_ufid(ufid);
> +    if (rte_flow) {
> +        ret = netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
> +                                           ufid, rte_flow);
> +        if (ret < 0)
> +            return ret;
> +    }
> +
> +    ret = netdev_dpdk_validate_flow(match);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    return netdev_dpdk_add_rte_flow_offload(netdev, match, actions,
> +                                            actions_len, ufid, info); }
> +
> +#define DPDK_FLOW_OFFLOAD_API \
> +    NULL,                 /* flow_flush */ \
> +    NULL,                 /* flow_dump_create */ \
> +    NULL,                 /* flow_dump_destroy */ \
> +    NULL,                 /* flow_dump_next */ \
> +    netdev_dpdk_flow_put, \
> +    NULL,                 /* flow_get */ \
> +    NULL,                 /* flow_del */ \
> +    NULL                  /* init_flow_api */
> +
> +
> #define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT, \
>                           SET_CONFIG, SET_TX_MULTIQ, SEND, \
>                           GET_CARRIER, GET_STATS, \
> @@ -3472,7 +3891,7 @@ get_rte_flow_by_ufid(const ovs_u128 *ufid)
>                           RXQ_RECV, \
>                           NULL,                /* rx_wait */ \
>                           NULL,                /* rxq_drain */ \
> -                         NO_OFFLOAD_API \
> +                         DPDK_FLOW_OFFLOAD_API \
> }
>
> static const struct netdev_class dpdk_class =
> --
> 2.7.4
>
> _______________________________________________
> dev mailing list
> d...@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev