On 8/29/17, 5:15 AM, "Yuanhan Liu" <[email protected]> wrote:
On Tue, Aug 29, 2017 at 08:11:23AM +0000, Darrell Ball wrote:
>
> On 8/22/17, 11:24 PM, "Yuanhan Liu" <[email protected]> wrote:
>
> From: Finn Christensen <[email protected]>
>
> The basic yet the major part of this patch is to translate the
"match"
> to rte flow patterns. And then, we create a rte flow with a MARK
action.
> Afterwards, all pkts matches the flow will have the mark id in
the mbuf.
>
> For any unsupported flows, such as MPLS, -1 is returned, meaning
the
> flow offload is failed and then skipped.
>
> Signed-off-by: Finn Christensen <[email protected]>
> Signed-off-by: Yuanhan Liu <[email protected]>
> ---
> lib/netdev-dpdk.c | 385
+++++++++++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 384 insertions(+), 1 deletion(-)
>
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 15fef1e..8089da8 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -58,6 +58,7 @@
> #include "smap.h"
> #include "sset.h"
> #include "unaligned.h"
> +#include "uuid.h"
> #include "timeval.h"
> #include "unixctl.h"
>
> @@ -3398,6 +3399,388 @@ get_rte_flow_by_ufid(const ovs_u128 *ufid)
> }
>
>
> +#define MAX_RTE_FLOW_ITEMS 100
> +#define MAX_RTE_FLOW_ACTIONS 100
>
> I guess these are temporary
Yes, the hardcoded number is really hacky.
> Do we need to do a rte query during initialization ?
query on what?
[Darrell]
I mean somehow the max hardware resources available at
dev initialization time ? I realize this is non-trivial overall.
> +
> +static int
> +netdev_dpdk_add_rte_flow_offload(struct netdev *netdev,
> + const struct match *match,
> + struct nlattr *nl_actions
OVS_UNUSED,
> + size_t actions_len,
> + const ovs_u128 *ufid,
> + struct offload_info *info)
> +{
> + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> + const struct rte_flow_attr flow_attr = {
> + .group = 0,
> + .priority = 0,
> + .ingress = 1,
> + .egress = 0
> + };
> + struct flow_patterns {
> + struct rte_flow_item items[MAX_RTE_FLOW_ITEMS];
> + uint32_t cnt;
> + } patterns;
> + struct flow_actions {
> + struct rte_flow_action actions[MAX_RTE_FLOW_ACTIONS];
> + uint32_t cnt;
> + } actions;
> + struct rte_flow *flow;
> + struct rte_flow_error error;
> +
> + uint8_t *ipv4_next_proto_mask = NULL;
> +
> +#define ADD_FLOW_PATTERN(type_, spec_, mask_) do { \
> + patterns.items[patterns.cnt].type = type_; \
> + patterns.items[patterns.cnt].spec = spec_; \
> + patterns.items[patterns.cnt].mask = mask_; \
> + patterns.items[patterns.cnt].last = NULL; \
> + if (patterns.cnt < MAX_RTE_FLOW_ITEMS) { \
> + patterns.cnt++; \
> + } else { \
> + VLOG_ERR("too many rte flow patterns (> %d)",
MAX_RTE_FLOW_ITEMS); \
> + goto err; \
> + } \
> +} while (0)
>
> static (inline) function maybe ?
Indeed. I'm not a big fan of macro like this. Let me turn it to function
next time. I see no reason to make it inline (explicitly) though: it's
not in data path. Moreover, it's likely it will be inlined implicitly by
compiler.
[Darrell]
I put ‘(inline)’ in parentheses because we would never specify it explicitly,
since gcc would likely inline anyways.
>
>
> +
> +#define ADD_FLOW_ACTION(type_, conf_) do { \
> + actions.actions[actions.cnt].type = type_; \
> + actions.actions[actions.cnt].conf = conf_; \
> + actions.cnt++; \
> + ovs_assert(actions.cnt < MAX_RTE_FLOW_ACTIONS); \
> +} while (0)
>
>
> static (inline) function maybe ?
Ditto.
>
> +
> + patterns.cnt = 0;
> + actions.cnt = 0;
> +
> + /* Eth */
> + struct rte_flow_item_eth eth_spec;
> + struct rte_flow_item_eth eth_mask;
> + memset(ð_mask, 0, sizeof(eth_mask));
> + if (match->wc.masks.dl_src.be16[0] ||
> + match->wc.masks.dl_src.be16[1] ||
> + match->wc.masks.dl_src.be16[2] ||
> + match->wc.masks.dl_dst.be16[0] ||
> + match->wc.masks.dl_dst.be16[1] ||
> + match->wc.masks.dl_dst.be16[2]) {
> + rte_memcpy(ð_spec.dst, &match->flow.dl_dst,
sizeof(eth_spec.dst));
> + rte_memcpy(ð_spec.src, &match->flow.dl_src,
sizeof(eth_spec.src));
> + eth_spec.type = match->flow.dl_type;
> +
> + rte_memcpy(ð_mask.dst, &match->wc.masks.dl_dst,
sizeof(eth_mask.dst));
> + rte_memcpy(ð_mask.src, &match->wc.masks.dl_src,
sizeof(eth_mask.src));
> + eth_mask.type = match->wc.masks.dl_type;
> +
> + ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_ETH, ð_spec,
ð_mask);
> + } else {
> + /*
> + * If user specifies a flow (like UDP flow) without L2
patterns,
> + * OVS will at least set the dl_type. Normally, it's
enough to
> + * create an eth pattern just with it. Unluckily, some
Intel's
> + * NIC (such as XL710) doesn't support that. Below is a
workaround,
> + * which simply matches any L2 pkts.
> + */
> + ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_ETH, ð_spec,
ð_mask);
> + }
> +
> + /* VLAN */
> + struct rte_flow_item_vlan vlan_spec;
> + struct rte_flow_item_vlan vlan_mask;
> + memset(&vlan_mask, 0, sizeof(vlan_mask));
> + if (match->wc.masks.vlans[0].tci &&
match->flow.vlans[0].tci) {
> + vlan_spec.tci = match->flow.vlans[0].tci;
> + vlan_mask.tci = match->wc.masks.vlans[0].tci;
> +
> + /* match any protocols */
> + vlan_mask.tpid = 0;
> +
> + ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_VLAN, &vlan_spec,
&vlan_mask);
> + }
> +
> + /* IP v4 */
> + uint8_t proto = 0;
> + struct rte_flow_item_ipv4 ipv4_spec;
> + struct rte_flow_item_ipv4 ipv4_mask;
> + memset(&ipv4_mask, 0, sizeof(ipv4_mask));
> + if ((match->flow.dl_type == ntohs(ETH_TYPE_IP)) &&
> + (match->wc.masks.nw_src || match->wc.masks.nw_dst ||
> + match->wc.masks.nw_tos || match->wc.masks.nw_ttl ||
> + match->wc.masks.nw_proto)) {
> + ipv4_spec.hdr.type_of_service = match->flow.nw_tos;
> + ipv4_spec.hdr.time_to_live = match->flow.nw_tos;
> + ipv4_spec.hdr.next_proto_id = match->flow.nw_proto;
> + ipv4_spec.hdr.src_addr = match->flow.nw_src;
> + ipv4_spec.hdr.dst_addr = match->flow.nw_dst;
> +
> + ipv4_mask.hdr.type_of_service = match->wc.masks.nw_tos;
> + ipv4_mask.hdr.time_to_live = match->wc.masks.nw_tos;
> + ipv4_mask.hdr.next_proto_id = match->wc.masks.nw_proto;
> + ipv4_mask.hdr.src_addr = match->wc.masks.nw_src;
> + ipv4_mask.hdr.dst_addr = match->wc.masks.nw_dst;
> +
> + ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_IPV4, &ipv4_spec,
&ipv4_mask);
> +
> + /* Save proto for L4 protocol setup */
> + proto = ipv4_spec.hdr.next_proto_id &
ipv4_mask.hdr.next_proto_id;
> +
> + /* Remember proto mask address for later modification */
> + ipv4_next_proto_mask = &ipv4_mask.hdr.next_proto_id;
> + }
> +
> + if (proto != IPPROTO_ICMP && proto != IPPROTO_UDP &&
> + proto != IPPROTO_SCTP && proto != IPPROTO_TCP &&
> + (match->wc.masks.tp_src ||
> + match->wc.masks.tp_dst ||
> + match->wc.masks.tcp_flags)) {
> + VLOG_INFO("L4 Protocol (%u) not supported", proto);
> + goto err;
> + }
> +
> + struct rte_flow_item_udp udp_spec;
> + struct rte_flow_item_udp udp_mask;
> + memset(&udp_mask, 0, sizeof(udp_mask));
> + if ((proto == IPPROTO_UDP) &&
> + (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
> + udp_spec.hdr.src_port = match->flow.tp_src;
> + udp_spec.hdr.dst_port = match->flow.tp_dst;
> +
> + udp_mask.hdr.src_port = match->wc.masks.tp_src;
> + udp_mask.hdr.dst_port = match->wc.masks.tp_dst;
> +
> + ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_UDP, &udp_spec,
&udp_mask);
> +
> + /* proto == UDP and ITEM_TYPE_UDP, thus no need for
proto match */
> + if (ipv4_next_proto_mask) {
> + *ipv4_next_proto_mask = 0;
> + }
> + }
> +
> + struct rte_flow_item_sctp sctp_spec;
> + struct rte_flow_item_sctp sctp_mask;
> + memset(&sctp_mask, 0, sizeof(sctp_mask));
> + if ((proto == IPPROTO_SCTP) &&
> + (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
> + sctp_spec.hdr.src_port = match->flow.tp_src;
> + sctp_spec.hdr.dst_port = match->flow.tp_dst;
> +
> + sctp_mask.hdr.src_port = match->wc.masks.tp_src;
> + sctp_mask.hdr.dst_port = match->wc.masks.tp_dst;
> +
> + ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_SCTP, &sctp_spec,
&sctp_mask);
> +
> + /* proto == SCTP and ITEM_TYPE_SCTP, thus no need for
proto match */
> + if (ipv4_next_proto_mask) {
> + *ipv4_next_proto_mask = 0;
> + }
> + }
> +
> + struct rte_flow_item_icmp icmp_spec;
> + struct rte_flow_item_icmp icmp_mask;
> + memset(&icmp_mask, 0, sizeof(icmp_mask));
> + if ((proto == IPPROTO_ICMP) &&
> + (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
> + icmp_spec.hdr.icmp_type =
(uint8_t)ntohs(match->flow.tp_src);
> + icmp_spec.hdr.icmp_code =
(uint8_t)ntohs(match->flow.tp_dst);
> +
> + icmp_mask.hdr.icmp_type =
(uint8_t)ntohs(match->wc.masks.tp_src);
> + icmp_mask.hdr.icmp_code =
(uint8_t)ntohs(match->wc.masks.tp_dst);
> +
> + ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_ICMP, &icmp_spec,
&icmp_mask);
> +
> + /* proto == ICMP and ITEM_TYPE_ICMP, thus no need for
proto match */
> + if (ipv4_next_proto_mask) {
> + *ipv4_next_proto_mask = 0;
> + }
> + }
> +
> + struct rte_flow_item_tcp tcp_spec;
> + struct rte_flow_item_tcp tcp_mask;
> + memset(&tcp_mask, 0, sizeof(tcp_mask));
> + if ((proto == IPPROTO_TCP) &&
> + (match->wc.masks.tp_src ||
> + match->wc.masks.tp_dst ||
> + match->wc.masks.tcp_flags)) {
> + tcp_spec.hdr.src_port = match->flow.tp_src;
> + tcp_spec.hdr.dst_port = match->flow.tp_dst;
> + tcp_spec.hdr.data_off = ntohs(match->flow.tcp_flags) >>
8;
> + tcp_spec.hdr.tcp_flags = ntohs(match->flow.tcp_flags) &
0xff;
> +
> + tcp_mask.hdr.src_port = match->wc.masks.tp_src;
> + tcp_mask.hdr.dst_port = match->wc.masks.tp_dst;
> + tcp_mask.hdr.data_off =
ntohs(match->wc.masks.tcp_flags) >> 8;
> + tcp_mask.hdr.tcp_flags =
ntohs(match->wc.masks.tcp_flags) & 0xff;
> +
> + ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_TCP, &tcp_spec,
&tcp_mask);
> +
> + /* proto == TCP and ITEM_TYPE_TCP, thus no need for
proto match */
> + if (ipv4_next_proto_mask) {
> + *ipv4_next_proto_mask = 0;
> + }
> + }
> + ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_END, NULL, NULL);
> +
> + struct rte_flow_action_mark mark;
> + if (actions_len) {
> + mark.id = info->flow_mark;
> + ADD_FLOW_ACTION(RTE_FLOW_ACTION_TYPE_MARK, &mark);
> + } else {
> + ADD_FLOW_ACTION(RTE_FLOW_ACTION_TYPE_DROP, NULL);
> + VLOG_INFO("no action given; drop pkts in hardware\n");
> + }
> + ADD_FLOW_ACTION(RTE_FLOW_ACTION_TYPE_END, NULL);
> +
> + flow = rte_flow_create(dev->port_id, &flow_attr,
patterns.items,
> + actions.actions, &error);
> + if (!flow) {
> + VLOG_ERR("rte flow creat error: %u : message : %s\n",
> + error.type, error.message);
> + goto err;
> + }
> + add_ufid_dpdk_flow_mapping(ufid, flow);
> + VLOG_INFO("installed flow %p by ufid "UUID_FMT"\n",
> + flow, UUID_ARGS((struct uuid *)ufid));
> +
> + return 0;
> +
> +err:
> + return -1;
> +}
> +
> +/*
> + * Validate for later rte flow offload creation. If any
unsupported
> + * flow are specified, return -1.
> + */
> +static int
> +netdev_dpdk_validate_flow(const struct match *match)
> +{
> + struct match match_zero_wc;
> +
> + /* Create a wc-zeroed version of flow */
> + match_init(&match_zero_wc, &match->flow, &match->wc);
> +
> +#define CHECK_NONZERO(addr, size) do { \
> + uint8_t *padr = (uint8_t *)(addr); \
> + int i; \
> + for (i = 0; i < (size); i++) { \
> + if (padr[i] != 0) { \
> + goto err; \
> + } \
> + } \
> +} while (0)
> +
> +#define CHECK_NONZEROSIMPLE(var) do { \
>
> CHECK_NONZERO ?
Yep, it's better (as it's shorter; also, SIMPLE is too vague).
>
> + if ((var) != 0) { \
> + goto err; \
> + } \
> +} while (0)
> +
> + CHECK_NONZERO(&match_zero_wc.flow.tunnel,
sizeof(match_zero_wc.flow.tunnel));
> + CHECK_NONZEROSIMPLE(match->wc.masks.metadata);
> + CHECK_NONZEROSIMPLE(match->wc.masks.skb_priority);
> + CHECK_NONZEROSIMPLE(match->wc.masks.pkt_mark);
> + CHECK_NONZEROSIMPLE(match->wc.masks.dp_hash);
> +
> + /* recirc id must be zero */
> + CHECK_NONZEROSIMPLE(match_zero_wc.flow.recirc_id);
> +
> + CHECK_NONZEROSIMPLE(match->wc.masks.ct_state);
> + CHECK_NONZEROSIMPLE(match->wc.masks.ct_zone);
> + CHECK_NONZEROSIMPLE(match->wc.masks.ct_mark);
> + CHECK_NONZEROSIMPLE(match->wc.masks.ct_label.u64.hi);
> + CHECK_NONZEROSIMPLE(match->wc.masks.ct_label.u64.lo);
> + CHECK_NONZEROSIMPLE(match->wc.masks.ct_nw_proto);
> + CHECK_NONZEROSIMPLE(match->wc.masks.ct_tp_src);
> + CHECK_NONZEROSIMPLE(match->wc.masks.ct_tp_dst);
> + CHECK_NONZEROSIMPLE(match->wc.masks.conj_id);
> + CHECK_NONZEROSIMPLE(match->wc.masks.actset_output);
> +
> + /* unsupported L2 */
> + CHECK_NONZERO(&match->wc.masks.mpls_lse,
> + sizeof(match_zero_wc.flow.mpls_lse) /
sizeof(ovs_be32));
> +
> + /* unsupported L3 */
> + CHECK_NONZERO(&match->wc.masks.ipv6_src, sizeof(struct
in6_addr));
> + CHECK_NONZERO(&match->wc.masks.ipv6_dst, sizeof(struct
in6_addr));
> + CHECK_NONZEROSIMPLE(match->wc.masks.ipv6_label);
> + CHECK_NONZERO(&match->wc.masks.nd_target, sizeof(struct
in6_addr));
> + CHECK_NONZERO(&match->wc.masks.arp_sha, sizeof(struct
eth_addr));
> + CHECK_NONZERO(&match->wc.masks.arp_tha, sizeof(struct
eth_addr));
>
> CHECK_NONZERO_BYTES ?
Again, better. Thanks!
> +
> + /* If fragmented, then don't HW accelerate - for now */
> + CHECK_NONZEROSIMPLE(match_zero_wc.flow.nw_frag);
> +
> + /* unsupported L4 */
> + CHECK_NONZEROSIMPLE(match->wc.masks.igmp_group_ip4);
> +
> + return 0;
> +
> +err:
> + VLOG_INFO("Cannot HW accelerate this flow");
> + return -1;
> +}
> +
> +static int
> +netdev_dpdk_destroy_rte_flow(struct netdev_dpdk *dev,
> + const ovs_u128 *ufid,
> + struct rte_flow *rte_flow)
> +{
> + struct rte_flow_error error;
> + int ret;
> +
> + ret = rte_flow_destroy(dev->port_id, rte_flow, &error);
> + if (ret == 0) {
> + del_ufid_dpdk_flow_mapping(ufid);
> + VLOG_INFO("removed rte flow %p associated with ufid "
UUID_FMT "\n",
> + rte_flow, UUID_ARGS((struct uuid *)ufid));
> + } else {
> + VLOG_ERR("rte flow destroy error: %u : message : %s\n",
> + error.type, error.message);
> + }
> +
> + return ret;
> +}
> +
> +static int
> +netdev_dpdk_flow_put(struct netdev *netdev, struct match *match,
> + struct nlattr *actions, size_t actions_len,
> + const ovs_u128 *ufid, struct offload_info
*info,
> + struct dpif_flow_stats *stats OVS_UNUSED)
> +{
> + struct rte_flow *rte_flow;
> + int ret;
> +
> + /*
> + * If an old rte_flow exists, it means it's a flow
modification.
> + * Here destroy the old rte flow first before adding a new
one.
> + */
> + rte_flow = get_rte_flow_by_ufid(ufid);
> + if (rte_flow) {
> + ret =
netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
> + ufid, rte_flow);
> + if (ret < 0)
> + return ret;
> + }
> +
> + ret = netdev_dpdk_validate_flow(match);
> + if (ret < 0) {
> + return ret;
> + }
> +
> + return netdev_dpdk_add_rte_flow_offload(netdev, match,
actions,
> + actions_len, ufid,
info);
> +}
> +
>
> Would you mind adding some comments for below NULL elements ?
Like what the method it's pointing to?
[Darrell]
exactly
--yliu
>
>
> +#define DPDK_FLOW_OFFLOAD_API \
> + NULL, \
> + NULL, \
> + NULL, \
> + NULL, \
> + netdev_dpdk_flow_put, \
> + NULL, \
> + NULL, \
> + NULL
> +
> +
> #define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT, \
> SET_CONFIG, SET_TX_MULTIQ, SEND, \
> GET_CARRIER, GET_STATS, \
> @@ -3470,7 +3853,7 @@ get_rte_flow_by_ufid(const ovs_u128 *ufid)
> RXQ_RECV, \
> NULL, /* rx_wait */ \
> NULL, /* rxq_drain */ \
> - NO_OFFLOAD_API \
> + DPDK_FLOW_OFFLOAD_API \
> }
>
> static const struct netdev_class dpdk_class =
> --
> 2.7.4
>
>
>
>
>
> _______________________________________________
> dev mailing list
> [email protected]
>
https://urldefense.proofpoint.com/v2/url?u=https-3A__mail.openvswitch.org_mailman_listinfo_ovs-2Ddev&d=DwIBAg&c=uilaK90D4TOVoH58JNXRgQ&r=BVhFA09CGX7JQ5Ih-uZnsw&m=dfWCL5vBH3eyoegyoQj-Ue82sTh5Z8aC0rJoUlNTPag&s=F90aGyXQASuSFbqM4N5JSxXju2QooA3333vvKNgYLmw&e=
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev