The tc module combines the use of the `tc_transact` helper
function for communication with the in-kernel tc infrastructure
with assertions on the reply data by `ofpbuf_at_assert` on the
received data prior to further processing.

`tc_transact` in turn calls `nl_transact`, which via
`nl_transact_multiple__` ultimately calls and handles return
value from `recvmsg`.  On error a check for EAGAIN is performed
and a consequence of this condition is effectively to provide a
non-error (0) result and an empty reply buffer.

Before this change, the `tc_transact` and, consumers of it, were
unaware of this condition.  The use of assertions on the reply
buffer can as such lead to a fatal crash of OVS.

To be fair, the behavior of `nl_transact` when handling an EAGAIN
return is currently not documented, so this change also adds that
documentation.

While fixing the problem, it led me to find potential problems
with the one-time initialization functions in the netdev-offload-tc
module.  Make use of the information now available about an EAGAIN
condition to retry one-time initialization, and resort to logging
a warning if probing of tc features fails due to temporary
situations such as resource depletion.

For the record, the symptom of the crash is this in the log:
EMER|../include/openvswitch/ofpbuf.h:194: assertion offset + size <= b->size 
failed in ofpbuf_at_assert()

And an excerpt of the backtrace looks like this:
0x0000561dac1396d1 in ofpbuf_at_assert (b=0x7fb650005d20, b=0x7fb650005d20, 
offset=16, size=20) at ../include/openvswitch/ofpbuf.h:194
tc_replace_flower (id=<optimized out>, flower=<optimized out>) at 
../lib/tc.c:3223
0x0000561dac128155 in netdev_tc_flow_put (netdev=0x561dacf91840, 
match=<optimized out>, actions=<optimized out>, actions_len=<optimized out>,
ufid=<optimized out>, info=<optimized out>, stats=<optimized out>) at 
../lib/netdev-offload-tc.c:2096
0x0000561dac117541 in netdev_flow_put (stats=<optimized out>, 
info=0x7fb65b7ba780, ufid=<optimized out>, act_len=<optimized out>, 
actions=<optimized out>,
match=0x7fb65b7ba980, netdev=0x561dacf91840) at ../lib/netdev-offload.c:257
parse_flow_put (put=0x7fb65b7bcc50, dpif=0x561dad0ad550) at 
../lib/dpif-netlink.c:2297
try_send_to_netdev (op=0x7fb65b7bcc48, dpif=0x561dad0ad550) at 
../lib/dpif-netlink.c:2384

Reported-At: https://launchpad.net/bugs/2018500
Fixes f98e418fbdb6 (tc: Add tc flower functions)
Fixes 407556ac6c90 (netlink-socket: Avoid forcing a reply for final message in 
a transaction.)
Signed-off-by: Frode Nordahl <[email protected]>
---
 lib/dpif-netlink.c      |  4 +-
 lib/dpif.c              |  2 +-
 lib/netdev-linux.c      | 34 +++++++++-------
 lib/netdev-offload-tc.c | 86 +++++++++++++++++++++++++++++++++++------
 lib/netlink-socket.c    |  5 +++
 lib/tc.c                | 18 +++++++++
 6 files changed, 122 insertions(+), 27 deletions(-)

diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
index 60bd39643..75537b2f8 100644
--- a/lib/dpif-netlink.c
+++ b/lib/dpif-netlink.c
@@ -2340,7 +2340,9 @@ parse_flow_put(struct dpif_netlink *dpif, struct 
dpif_flow_put *put)
             }
             netdev_set_hw_info(oor_netdev, HW_INFO_TYPE_OOR, true);
         }
-        level = (err == ENOSPC || err == EOPNOTSUPP) ? VLL_DBG : VLL_ERR;
+        level = (err == EAGAIN ||
+                 err == ENOSPC ||
+                 err == EOPNOTSUPP) ? VLL_DBG : VLL_ERR;
         VLOG_RL(&rl, level, "failed to offload flow: %s: %s",
                 ovs_strerror(err),
                 (oor_netdev ? oor_netdev->name : dev->name));
diff --git a/lib/dpif.c b/lib/dpif.c
index 3305401fe..2bbb0aac4 100644
--- a/lib/dpif.c
+++ b/lib/dpif.c
@@ -1751,7 +1751,7 @@ flow_message_log_level(int error)
      * Kernels that support flow wildcarding will reject these flows as
      * duplicates (EEXIST), so lower the log level to debug for these
      * types of messages. */
-    return (error && error != EEXIST) ? VLL_WARN : VLL_DBG;
+    return (error && error != EEXIST && error != EAGAIN) ? VLL_WARN : VLL_DBG;
 }
 
 static bool
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 36620199e..a0294ca00 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -5817,8 +5817,9 @@ tc_get_policer_action(uint32_t index, struct 
ofputil_meter_stats *stats)
 
     error = tc_transact(&request, &replyp);
     if (error) {
-        VLOG_ERR_RL(&rl, "Failed to dump police action (index: %u), err=%d",
-                    index, error);
+        VLOG_RL(&rl, error == EAGAIN ? VLL_DBG : VLL_ERR,
+                "Failed to dump police action (index: %u), err=%d",
+                index, error);
         return error;
     }
 
@@ -5849,8 +5850,9 @@ tc_del_policer_action(uint32_t index, struct 
ofputil_meter_stats *stats)
 
     error = tc_transact(&request, &replyp);
     if (error) {
-        VLOG_ERR_RL(&rl, "Failed to delete police action (index: %u), err=%d",
-                    index, error);
+        VLOG_RL(&rl, error == EAGAIN ? VLL_DBG : VLL_ERR,
+                "Failed to delete police action (index: %u), err=%d",
+                index, error);
         return error;
     }
 
@@ -6114,11 +6116,12 @@ tc_query_class(const struct netdev *netdev,
 
     error = tc_transact(&request, replyp);
     if (error) {
-        VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
-                     netdev_get_name(netdev),
-                     tc_get_major(handle), tc_get_minor(handle),
-                     tc_get_major(parent), tc_get_minor(parent),
-                     ovs_strerror(error));
+        VLOG_RL(&rl, error == EAGAIN ? VLL_DBG : VLL_WARN,
+                "query %s class %u:%u (parent %u:%u) failed (%s)",
+                netdev_get_name(netdev),
+                tc_get_major(handle), tc_get_minor(handle),
+                tc_get_major(parent), tc_get_minor(parent),
+                ovs_strerror(error));
     }
     return error;
 }
@@ -6140,10 +6143,11 @@ tc_delete_class(const struct netdev *netdev, unsigned 
int handle)
 
     error = tc_transact(&request, NULL);
     if (error) {
-        VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
-                     netdev_get_name(netdev),
-                     tc_get_major(handle), tc_get_minor(handle),
-                     ovs_strerror(error));
+        VLOG_RL(&rl, error == EAGAIN ? VLL_DBG : VLL_WARN,
+                "delete %s class %u:%u failed (%s)",
+                netdev_get_name(netdev),
+                tc_get_major(handle), tc_get_minor(handle),
+                ovs_strerror(error));
     }
     return error;
 }
@@ -6262,7 +6266,9 @@ tc_query_qdisc(const struct netdev *netdev_)
                 ops = &tc_ops_other;
             }
         }
-    } else if ((!error && !qdisc->size) || error == ENOENT) {
+    } else if ((!error && !qdisc->size) ||
+               error == ENOENT || error == EAGAIN)
+    {
         /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
          * set up by some other entity that doesn't have a handle 1:0.  We will
          * assume that it's the system default qdisc. */
diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c
index 4f26dd8cc..4fb9dc239 100644
--- a/lib/netdev-offload-tc.c
+++ b/lib/netdev-offload-tc.c
@@ -2505,10 +2505,10 @@ netdev_tc_flow_get(struct netdev *netdev,
 
     err = tc_get_flower(&id, &flower);
     if (err) {
-        VLOG_ERR_RL(&error_rl,
-                    "flow get failed (dev %s prio %d handle %d): %s",
-                    netdev_get_name(netdev), id.prio, id.handle,
-                    ovs_strerror(err));
+        VLOG_RL(&error_rl, err == EAGAIN ? VLL_DBG : VLL_ERR,
+                "flow get failed (dev %s prio %d handle %d): %s",
+                netdev_get_name(netdev), id.prio, id.handle,
+                ovs_strerror(err));
         return err;
     }
 
@@ -2571,11 +2571,36 @@ netdev_tc_get_n_flows(struct netdev *netdev, uint64_t 
*n_flows)
     return 0;
 }
 
+/* This macro is for use by one-time initialization functions, where we have
+ * one shot per thread/process to perform a pertinent initialization task that
+ * may return a temporary error (EAGAIN).
+ *
+ * With min/max values of 1/64 we would retry 7 times, spending at the
+ * most 127 * 1E6 nsec (0.127s) sleeping.
+ */
+#define NETDEV_OFFLOAD_TC_BACKOFF_MIN 1
+#define NETDEV_OFFLOAD_TC_BACKOFF_MAX 64
+#define NETDEV_OFFLOAD_TC_RETRY_WITH_BACKOFF(BACKOFF, ERROR, CONDITION,       \
+                                             FUNCTION, ...)                   \
+    BACKOFF = NETDEV_OFFLOAD_TC_BACKOFF_MIN;                                  \
+    do {                                                                      \
+        ERROR = (FUNCTION)(__VA_ARGS__);                                      \
+        if (CONDITION) {                                                      \
+            xnanosleep(BACKOFF * 1E6);                                        \
+            if (BACKOFF < NETDEV_OFFLOAD_TC_BACKOFF_MAX) {                    \
+                BACKOFF <<= 1;                                                \
+            } else {                                                          \
+                break;                                                        \
+            }                                                                 \
+        }                                                                     \
+    } while (CONDITION);
+
 static void
 probe_multi_mask_per_prio(int ifindex)
 {
     struct tc_flower flower;
     struct tcf_id id1, id2;
+    uint64_t backoff;
     int block_id = 0;
     int prio = 1;
     int error;
@@ -2594,8 +2619,13 @@ probe_multi_mask_per_prio(int ifindex)
     memset(&flower.mask.dst_mac, 0xff, sizeof flower.mask.dst_mac);
 
     id1 = tc_make_tcf_id(ifindex, block_id, prio, TC_INGRESS);
-    error = tc_replace_flower(&id1, &flower);
+    NETDEV_OFFLOAD_TC_RETRY_WITH_BACKOFF(backoff, error, error == EAGAIN,
+                                         tc_replace_flower, &id1, &flower);
     if (error) {
+        if (error == EAGAIN) {
+            VLOG_WARN("probe tc: unable to probe for multiple mask "
+                      "support: %s", ovs_strerror(error));
+        }
         goto out;
     }
 
@@ -2603,10 +2633,15 @@ probe_multi_mask_per_prio(int ifindex)
     memset(&flower.mask.src_mac, 0xff, sizeof flower.mask.src_mac);
 
     id2 = tc_make_tcf_id(ifindex, block_id, prio, TC_INGRESS);
-    error = tc_replace_flower(&id2, &flower);
+    NETDEV_OFFLOAD_TC_RETRY_WITH_BACKOFF(backoff, error, error == EAGAIN,
+                                         tc_replace_flower, &id2, &flower);
     tc_del_flower_filter(&id1);
 
     if (error) {
+        if (error == EAGAIN) {
+            VLOG_WARN("probe tc: unable to probe for multiple mask "
+                      "support: %s", ovs_strerror(error));
+        }
         goto out;
     }
 
@@ -2642,6 +2677,7 @@ probe_ct_state_support(int ifindex)
 {
     struct tc_flower flower;
     uint16_t ct_state;
+    uint64_t backoff;
     struct tcf_id id;
     int error;
 
@@ -2657,8 +2693,13 @@ probe_ct_state_support(int ifindex)
         goto out;
     }
 
-    error = tc_get_flower(&id, &flower);
+    NETDEV_OFFLOAD_TC_RETRY_WITH_BACKOFF(backoff, error, error == EAGAIN,
+                                         tc_get_flower, &id, &flower);
     if (error || flower.mask.ct_state != ct_state) {
+        if (error == EAGAIN) {
+            VLOG_WARN("probe tc: unable to probe ct_state support: %s",
+                      ovs_strerror(error));
+        }
         goto out_del;
     }
 
@@ -2670,10 +2711,16 @@ probe_ct_state_support(int ifindex)
 
     /* Test for reject, ct_state >= MAX */
     ct_state = ~0;
-    error = probe_insert_ct_state_rule(ifindex, ct_state, &id);
+    NETDEV_OFFLOAD_TC_RETRY_WITH_BACKOFF(backoff, error, error == EAGAIN,
+                                         probe_insert_ct_state_rule, ifindex,
+                                         ct_state, &id);
     if (!error) {
         /* No reject, can't continue probing other flags */
         goto out_del;
+    } else if (error == EAGAIN) {
+        VLOG_WARN("probe tc: unable to probe ct_state support: %s",
+                  ovs_strerror(error));
+        goto out_del;
     }
 
     tc_del_flower_filter(&id);
@@ -2682,8 +2729,14 @@ probe_ct_state_support(int ifindex)
     memset(&flower, 0, sizeof flower);
     ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
                TCA_FLOWER_KEY_CT_FLAGS_INVALID;
-    error = probe_insert_ct_state_rule(ifindex, ct_state, &id);
+    NETDEV_OFFLOAD_TC_RETRY_WITH_BACKOFF(backoff, error, error == EAGAIN,
+                                         probe_insert_ct_state_rule, ifindex,
+                                         ct_state, &id);
     if (error) {
+        if (error == EAGAIN) {
+            VLOG_WARN("probe tc: unable to probe ct_state support: %s",
+                      ovs_strerror(error));
+        }
         goto out;
     }
 
@@ -2695,8 +2748,14 @@ probe_ct_state_support(int ifindex)
     ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
                TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED |
                TCA_FLOWER_KEY_CT_FLAGS_REPLY;
-    error = probe_insert_ct_state_rule(ifindex, ct_state, &id);
+    NETDEV_OFFLOAD_TC_RETRY_WITH_BACKOFF(backoff, error, error == EAGAIN,
+                                         probe_insert_ct_state_rule, ifindex,
+                                         ct_state, &id);
     if (error) {
+        if (error == EAGAIN) {
+            VLOG_WARN("probe tc: unable to probe ct_state support: %s",
+                      ovs_strerror(error));
+        }
         goto out;
     }
 
@@ -2714,6 +2773,7 @@ probe_tc_block_support(int ifindex)
 {
     struct tc_flower flower;
     uint32_t block_id = 1;
+    uint64_t backoff;
     struct tcf_id id;
     int prio = 0;
     int error;
@@ -2732,13 +2792,17 @@ probe_tc_block_support(int ifindex)
     memset(&flower.mask.dst_mac, 0xff, sizeof flower.mask.dst_mac);
 
     id = tc_make_tcf_id(ifindex, block_id, prio, TC_INGRESS);
-    error = tc_replace_flower(&id, &flower);
+    NETDEV_OFFLOAD_TC_RETRY_WITH_BACKOFF(backoff, error, error == EAGAIN,
+                                         tc_replace_flower, &id, &flower);
 
     tc_add_del_qdisc(ifindex, false, block_id, TC_INGRESS);
 
     if (!error) {
         block_support = true;
         VLOG_INFO("probe tc: block offload is supported.");
+    } else if (error == EAGAIN) {
+        VLOG_WARN("probe tc: unable to probe block offload support: %s",
+                  ovs_strerror(error));
     }
 }
 
diff --git a/lib/netlink-socket.c b/lib/netlink-socket.c
index 80da20d9f..dea060fc3 100644
--- a/lib/netlink-socket.c
+++ b/lib/netlink-socket.c
@@ -1798,6 +1798,11 @@ nl_pool_release(struct nl_sock *sock)
  *
  *      2. Resending the request causes it to be re-executed, so the request
  *         needs to be idempotent.
+ *
+ *      3. In the event that the kernel is too busy to handle the request to
+ *         receive the response (i.e. EAGAIN), this function will still return
+ *         0.  The caller can check for this condition by checking for a zero
+ *         size of the 'replyp' ofpbuf buffer.
  */
 int
 nl_transact(int protocol, const struct ofpbuf *request,
diff --git a/lib/tc.c b/lib/tc.c
index 5c32c6f97..7df6a10c5 100644
--- a/lib/tc.c
+++ b/lib/tc.c
@@ -237,11 +237,29 @@ static void request_from_tcf_id(struct tcf_id *id, 
uint16_t eth_type,
     }
 }
 
+/* The `tc_transact` function is a wrapper around `nl_transact` with the
+ * addition of:
+ *
+ * 1. the 'request' ofpbuf buffer is freed after `nl_transact` returns,
+ *    regardless of success or failure.
+ *
+ * 2. When a 'replyp' pointer is provided; in the event of the kernel
+ *    being too busy to process the request for the response, a positive
+ *    error return will be provided with the value of EAGAIN.
+ *
+ * Please acquaint yourself with the documentation of the `nl_transact`
+ * function in the netlink-sock module before making use of this function.
+ */
 int
 tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
 {
     int error = nl_transact(NETLINK_ROUTE, request, replyp);
     ofpbuf_uninit(request);
+
+    if (!error && replyp && !(*replyp)->size) {
+        return EAGAIN;
+    }
+
     return error;
 }
 
-- 
2.39.2

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to