On 10/01/2017 23:58, Joe Stringer wrote:
On 10 January 2017 at 06:36, Paul Blakey <pa...@mellanox.com> wrote:


On 06/01/2017 01:28, Joe Stringer wrote:

On 25 December 2016 at 03:39, Paul Blakey <pa...@mellanox.com> wrote:

Using the new netdev flow API, operate will now try to
offload flows to the relevant netdev of the input port.
Support for the other operate methods will come in later patches.

Signed-off-by: Paul Blakey <pa...@mellanox.com>
Reviewed-by: Roi Dayan <r...@mellanox.com>
---
  lib/dpif-netlink.c | 232
++++++++++++++++++++++++++++++++++++++++++++++++++++-
  1 file changed, 228 insertions(+), 4 deletions(-)

diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
index 3d8940e..717af90 100644
--- a/lib/dpif-netlink.c
+++ b/lib/dpif-netlink.c
@@ -1908,15 +1908,239 @@ dpif_netlink_operate__(struct dpif_netlink
*dpif,
      return n_ops;
  }

+static int
+parse_key_and_mask_to_match(const struct nlattr *key, size_t key_len,
+                            const struct nlattr *mask, size_t mask_len,
+                            struct match *match)
+{
+    enum odp_key_fitness fitness;
+
+    fitness = odp_flow_key_to_flow(key, key_len, &match->flow);
+    if (fitness) {
+        /* This should not happen: it indicates that
odp_flow_key_from_flow()
+         * and odp_flow_key_to_flow() disagree on the acceptable form of
a
+         * flow.  Log the problem as an error, with enough details to
enable
+         * debugging. */
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+
+        if (!VLOG_DROP_ERR(&rl)) {
+            struct ds s;
+
+            ds_init(&s);
+            odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
+            VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
+            ds_destroy(&s);
+        }
+
+        return EINVAL;
+    }
+
+    fitness = odp_flow_key_to_mask(mask, mask_len, &match->wc,
&match->flow);
+    if (fitness) {
+        /* This should not happen: it indicates that
+         * odp_flow_key_from_mask() and odp_flow_key_to_mask()
+         * disagree on the acceptable form of a mask.  Log the problem
+         * as an error, with enough details to enable debugging. */
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+
+        if (!VLOG_DROP_ERR(&rl)) {
+            struct ds s;
+
+            VLOG_ERR("internal error parsing flow mask %s (%s)",
+                     ds_cstr(&s), odp_key_fitness_to_string(fitness));
+            ds_destroy(&s);
+        }
+
+        return EINVAL;
+    }
+
+    return 0;
+}
+
+static bool
+parse_flow_put(struct dpif_netlink *dpif, struct dpif_flow_put *put)
+{
+    struct match match;
+    odp_port_t in_port;
+    const struct nlattr *nla;
+    size_t left;
+    int outputs = 0;
+    struct ofpbuf buf;
+    uint64_t act_stub[1024 / 8];
+    size_t offset;
+    struct nlattr *act;
+    struct netdev *dev;
+    int err;
+
+    /* 0x1234 - fake eth type sent to probe feature */
+    if (put->flags & DPIF_FP_PROBE || match.flow.dl_type ==
htons(0x1234)) {
+        return false;
+    }
+
+    if (parse_key_and_mask_to_match(put->key, put->key_len, put->mask,
+                                put->mask_len, &match)) {
+        return false;
+    }
+
+    in_port = match.flow.in_port.odp_port;
+    ofpbuf_use_stub(&buf, act_stub, sizeof act_stub);
+    offset = nl_msg_start_nested(&buf, OVS_FLOW_ATTR_ACTIONS);
+    NL_ATTR_FOR_EACH(nla, left, put->actions, put->actions_len) {
+        if (nl_attr_type(nla) == OVS_ACTION_ATTR_OUTPUT) {
+            struct netdev *outdev;
+            int ifindex_out;
+            const struct netdev_tunnel_config *tnl_cfg;
+            size_t out_off;
+            odp_port_t out_port;
+
+            outputs++;
+            if (outputs > 1) {
+                break;
+            }
+
+            out_port = nl_attr_get_u32(nla);
+            outdev = netdev_hmap_port_get(out_port,
dpif->dpif.dpif_class);
+            tnl_cfg = netdev_get_tunnel_config(outdev);
+
+            out_off = nl_msg_start_nested(&buf, OVS_ACTION_ATTR_OUTPUT);
+            ifindex_out = netdev_get_ifindex(outdev);
+            nl_msg_put_u32(&buf, OVS_ACTION_ATTR_OUTPUT, ifindex_out);
+            if (tnl_cfg && tnl_cfg->dst_port != 0) {
+                nl_msg_put_u32(&buf, OVS_TUNNEL_KEY_ATTR_TP_DST,
tnl_cfg->dst_port);
+            }
+            nl_msg_end_nested(&buf, out_off);
+
+            if (outdev) {
+                netdev_close(outdev);
+            }
+        } else {
+            nl_msg_put_unspec(&buf, nl_attr_type(nla), nl_attr_get(nla),
+                              nl_attr_get_size(nla));
+        }
+    }
+    nl_msg_end_nested(&buf, offset);
+
+    if (outputs > 1) {
+        return false;
+    }
+
+    act = ofpbuf_at_assert(&buf, offset, sizeof(struct nlattr));
+    dev = netdev_hmap_port_get(in_port, dpif->dpif.dpif_class);
+    err = netdev_flow_put(dev, &match, CONST_CAST(struct nlattr *,
+                                                  nl_attr_get(act)),
+                          nl_attr_get_size(act), put->stats,
+                          CONST_CAST(ovs_u128 *, put->ufid));
+    netdev_close(dev);
+
+    if (!err) {
+        if (put->flags & DPIF_FP_MODIFY) {
+            struct dpif_op *opp;
+            struct dpif_op op;
+
+            op.type = DPIF_OP_FLOW_DEL;
+            op.u.flow_del.key = put->key;
+            op.u.flow_del.key_len = put->key_len;
+            op.u.flow_del.ufid = put->ufid;
+            op.u.flow_del.pmd_id = put->pmd_id;
+            op.u.flow_del.stats = NULL;
+            op.u.flow_del.terse = false;
+
+            opp = &op;
+            dpif_netlink_operate__(dpif, &opp, 1);
+        }
+        VLOG_DBG("added flow");
+        return true;
+    }
+    VLOG_DBG("failed adding flow: %s", ovs_strerror(err));
+
+    return false;
+}
+
+static void
+dbg_print_flow(const struct nlattr *key, size_t key_len,
+               const struct nlattr *mask, size_t mask_len,
+               const struct nlattr *actions, size_t actions_len,
+               const ovs_u128 *ufid,
+               const char *op)
+{
+        struct ds s;
+
+        ds_init(&s);
+        ds_put_cstr(&s, op);
+        ds_put_cstr(&s, " (");
+        odp_format_ufid(ufid, &s);
+        ds_put_cstr(&s, ")");
+        if (key_len) {
+            ds_put_cstr(&s, "\nflow (verbose): ");
+            odp_flow_format(key, key_len, mask, mask_len, NULL, &s,
true);
+            ds_put_cstr(&s, "\nflow: ");
+            odp_flow_format(key, key_len, mask, mask_len, NULL, &s,
false);
+        }
+        if (actions_len) {
+            ds_put_cstr(&s, "\nactions: ");
+            format_odp_actions(&s, actions, actions_len);
+        }
+        VLOG_DBG("\n%s", ds_cstr(&s));
+        ds_destroy(&s);
+}
+
+static bool
+try_send_to_netdev(struct dpif_netlink *dpif, struct dpif_op *op)
+{
+    switch (op->type) {
+    case DPIF_OP_FLOW_PUT: {
+        struct dpif_flow_put *put = &op->u.flow_put;
+
+        if (!put->ufid) {
+            return false;
+        }
+        dbg_print_flow(put->key, put->key_len, put->mask, put->mask_len,
+                       put->actions, put->actions_len, put->ufid,
"PUT");
+        return parse_flow_put(dpif, put);
+    }
+    case DPIF_OP_FLOW_DEL:
+    case DPIF_OP_FLOW_GET:
+    case DPIF_OP_EXECUTE:
+    default:
+        break;
+    }
+    return false;
+}
+
  static void
  dpif_netlink_operate(struct dpif *dpif_, struct dpif_op **ops, size_t
n_ops)
  {
      struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
+    struct dpif_op **new_ops;
+    int n_new_ops = 0;
+    int i = 0;
+
+    if (!netdev_flow_api_enabled) {
+        new_ops = ops;
+        n_new_ops = n_ops;
+    } else {
+        new_ops = xmalloc((sizeof *new_ops) * n_ops);
+        n_new_ops = 0;
+
+        for (i = 0; i < n_ops; i++) {
+            if (try_send_to_netdev(dpif, ops[i])) {
+                ops[i]->error = 0;

What if the hardware returns EEXIST? Shouldn't we return EEXIST?

Right it should, we'll fix that.


What if the hardware reaches some resource constraint? This isn't
required for an initial implementation, but it may be nice to have
some heuristic to try to cut down on the failed syscalls if userspace
has become aware that the hardware is out of resources. (Getting good
visibility on this would also matter if you tried to deploy this).

Right, do you mean that if certain kinds of flows fail (e.g. a specific
mask type), we shouldn't try again (with the same mask)?
Is this done in the kernel?

There's a couple of things: On the side of resource constraints, there
is currently a ceiling of about 200,000 flows, above which userspace
will not attempt to install a new flow. This logic is in
ofproto-dpif-upcall. Depending on how complex your constraints are,
maybe there is a way to model the hardware resource in the userspace
so that once the limits are hit, userspace minimizes the number of
failed syscalls due to hardware constraints. Simplest model would be
something like, when TC starts returning error codes for ENOSPC or
whatever the "out of hardware resource" error codes are, then you take
the current number of hardware flows and choose that as the maximum
number of hardware flows. Future flow installs to hardware will fail
out early based on this "n_flows > max_flows" logic. Obviously
depending on how constrained your hardware is, and what the
constraints look like, this may be useful or useless. If hardware
supports 2K flows, then you're more likely to get benefit out of being
aware of this than if the hardware allows 200K arbitrary flows. Also I
recognise that hardware may arrange flows differently so two flows may
consume different amounts of hardware resources.

The second part is if it's certain kinds of flow. The "probe"s in
ofproto-dpif allow datapath feature detection at runtime, which can
then be used to change the behaviour. For instance, if there is no
support in datapath for a particular action, then the OpenFlow layer
will return errors when a controller attempts to use that action (as
we can't satisfy the flow mod request). For TC, it may be more like,
during datapath initialization we detect the supported features so
that flows may be checked against this feature support before going
down to the kernel to install the flow (which would fail if, for
instance, you tried to use one of the newer fields against an older
kernel that has only the initial flower support).


Thanks for the suggestions. We need to look into that, but I think
we can postpone this for the current patch set and do these optimizations later.


One thing I'd like to avoid is ovs-vswitchd logs ending up
flooded with all of these "hardware flow failed to be
installed" errors, while extra CPU is spent attempting things that we
can easily detect and avoid. Something to consider.

While I remember, please check all the VLOG calls and use rate limiters
for them. (I think I provided this feedback somewhere, but I'm putting it
here as well just in case I forgot to.)


_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to