On 15/05/2023 11:04, Frode Nordahl wrote:
> The tc module combines the use of the `tc_transact` helper
> function for communication with the in-kernel tc infrastructure
> with assertions on the reply data by `ofpbuf_at_assert` on the
> received data prior to further processing.
>
> `tc_transact` in turn calls `nl_transact`, which via
> `nl_transact_multiple__` ultimately calls and handles return
> value from `recvmsg`. On error a check for EAGAIN is performed
> and a consequence of this condition is effectively to provide a
> non-error (0) result and an empty reply buffer.
>
> Before this change, the `tc_transact` and, consumers of it, were
> unaware of this condition. The use of assertions on the reply
> buffer can as such lead to a fatal crash of OVS.
>
> To be fair, the behavior of `nl_transact` when handling an EAGAIN
> return is currently not documented, so this change also adds that
> documentation.
>
> While fixing the problem, it led me to find potential problems
> with the one-time initialization functions in the netdev-offload-tc
> module. Make use of the information now available about an EAGAIN
> condition to retry one-time initialization, and resort to logging
> a warning if probing of tc features fails due to temporary
> situations such as resource depletion.
>
> For the record, the symptom of the crash is this in the log:
> EMER|../include/openvswitch/ofpbuf.h:194: assertion offset + size <= b->size
> failed in ofpbuf_at_assert()
>
> And an excerpt of the backtrace looks like this:
> 0x0000561dac1396d1 in ofpbuf_at_assert (b=0x7fb650005d20, b=0x7fb650005d20,
> offset=16, size=20) at ../include/openvswitch/ofpbuf.h:194
> tc_replace_flower (id=<optimized out>, flower=<optimized out>) at
> ../lib/tc.c:3223
> 0x0000561dac128155 in netdev_tc_flow_put (netdev=0x561dacf91840,
> match=<optimized out>, actions=<optimized out>, actions_len=<optimized out>,
> ufid=<optimized out>, info=<optimized out>, stats=<optimized out>) at
> ../lib/netdev-offload-tc.c:2096
> 0x0000561dac117541 in netdev_flow_put (stats=<optimized out>,
> info=0x7fb65b7ba780, ufid=<optimized out>, act_len=<optimized out>,
> actions=<optimized out>,
> match=0x7fb65b7ba980, netdev=0x561dacf91840) at ../lib/netdev-offload.c:257
> parse_flow_put (put=0x7fb65b7bcc50, dpif=0x561dad0ad550) at
> ../lib/dpif-netlink.c:2297
> try_send_to_netdev (op=0x7fb65b7bcc48, dpif=0x561dad0ad550) at
> ../lib/dpif-netlink.c:2384
>
> Reported-At: https://launchpad.net/bugs/2018500
> Fixes f98e418fbdb6 (tc: Add tc flower functions)
> Fixes 407556ac6c90 (netlink-socket: Avoid forcing a reply for final message
> in a transaction.)
> Signed-off-by: Frode Nordahl <[email protected]>
> ---
> lib/dpif-netlink.c | 4 +-
> lib/dpif.c | 2 +-
> lib/netdev-linux.c | 34 +++++++++-------
> lib/netdev-offload-tc.c | 86 +++++++++++++++++++++++++++++++++++------
> lib/netlink-socket.c | 5 +++
> lib/tc.c | 18 +++++++++
> 6 files changed, 122 insertions(+), 27 deletions(-)
>
> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
> index 60bd39643..75537b2f8 100644
> --- a/lib/dpif-netlink.c
> +++ b/lib/dpif-netlink.c
> @@ -2340,7 +2340,9 @@ parse_flow_put(struct dpif_netlink *dpif, struct
> dpif_flow_put *put)
> }
> netdev_set_hw_info(oor_netdev, HW_INFO_TYPE_OOR, true);
> }
> - level = (err == ENOSPC || err == EOPNOTSUPP) ? VLL_DBG : VLL_ERR;
> + level = (err == EAGAIN ||
> + err == ENOSPC ||
> + err == EOPNOTSUPP) ? VLL_DBG : VLL_ERR;
> VLOG_RL(&rl, level, "failed to offload flow: %s: %s",
> ovs_strerror(err),
> (oor_netdev ? oor_netdev->name : dev->name));
> diff --git a/lib/dpif.c b/lib/dpif.c
> index 3305401fe..2bbb0aac4 100644
> --- a/lib/dpif.c
> +++ b/lib/dpif.c
> @@ -1751,7 +1751,7 @@ flow_message_log_level(int error)
> * Kernels that support flow wildcarding will reject these flows as
> * duplicates (EEXIST), so lower the log level to debug for these
> * types of messages. */
> - return (error && error != EEXIST) ? VLL_WARN : VLL_DBG;
> + return (error && error != EEXIST && error != EAGAIN) ? VLL_WARN :
> VLL_DBG;
> }
>
> static bool
> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
> index 36620199e..a0294ca00 100644
> --- a/lib/netdev-linux.c
> +++ b/lib/netdev-linux.c
> @@ -5817,8 +5817,9 @@ tc_get_policer_action(uint32_t index, struct
> ofputil_meter_stats *stats)
>
> error = tc_transact(&request, &replyp);
> if (error) {
> - VLOG_ERR_RL(&rl, "Failed to dump police action (index: %u), err=%d",
> - index, error);
> + VLOG_RL(&rl, error == EAGAIN ? VLL_DBG : VLL_ERR,
> + "Failed to dump police action (index: %u), err=%d",
> + index, error);
> return error;
> }
>
> @@ -5849,8 +5850,9 @@ tc_del_policer_action(uint32_t index, struct
> ofputil_meter_stats *stats)
>
> error = tc_transact(&request, &replyp);
> if (error) {
> - VLOG_ERR_RL(&rl, "Failed to delete police action (index: %u),
> err=%d",
> - index, error);
> + VLOG_RL(&rl, error == EAGAIN ? VLL_DBG : VLL_ERR,
> + "Failed to delete police action (index: %u), err=%d",
> + index, error);
> return error;
> }
>
> @@ -6114,11 +6116,12 @@ tc_query_class(const struct netdev *netdev,
>
> error = tc_transact(&request, replyp);
> if (error) {
> - VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
> - netdev_get_name(netdev),
> - tc_get_major(handle), tc_get_minor(handle),
> - tc_get_major(parent), tc_get_minor(parent),
> - ovs_strerror(error));
> + VLOG_RL(&rl, error == EAGAIN ? VLL_DBG : VLL_WARN,
> + "query %s class %u:%u (parent %u:%u) failed (%s)",
> + netdev_get_name(netdev),
> + tc_get_major(handle), tc_get_minor(handle),
> + tc_get_major(parent), tc_get_minor(parent),
> + ovs_strerror(error));
> }
> return error;
> }
> @@ -6140,10 +6143,11 @@ tc_delete_class(const struct netdev *netdev, unsigned
> int handle)
>
> error = tc_transact(&request, NULL);
> if (error) {
> - VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
> - netdev_get_name(netdev),
> - tc_get_major(handle), tc_get_minor(handle),
> - ovs_strerror(error));
> + VLOG_RL(&rl, error == EAGAIN ? VLL_DBG : VLL_WARN,
> + "delete %s class %u:%u failed (%s)",
> + netdev_get_name(netdev),
> + tc_get_major(handle), tc_get_minor(handle),
> + ovs_strerror(error));
> }
> return error;
> }
> @@ -6262,7 +6266,9 @@ tc_query_qdisc(const struct netdev *netdev_)
> ops = &tc_ops_other;
> }
> }
> - } else if ((!error && !qdisc->size) || error == ENOENT) {
> + } else if ((!error && !qdisc->size) ||
> + error == ENOENT || error == EAGAIN)
> + {
> /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a
> qdisc
> * set up by some other entity that doesn't have a handle 1:0. We
> will
> * assume that it's the system default qdisc. */
> diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c
> index 4f26dd8cc..4fb9dc239 100644
> --- a/lib/netdev-offload-tc.c
> +++ b/lib/netdev-offload-tc.c
> @@ -2505,10 +2505,10 @@ netdev_tc_flow_get(struct netdev *netdev,
>
> err = tc_get_flower(&id, &flower);
> if (err) {
> - VLOG_ERR_RL(&error_rl,
> - "flow get failed (dev %s prio %d handle %d): %s",
> - netdev_get_name(netdev), id.prio, id.handle,
> - ovs_strerror(err));
> + VLOG_RL(&error_rl, err == EAGAIN ? VLL_DBG : VLL_ERR,
> + "flow get failed (dev %s prio %d handle %d): %s",
> + netdev_get_name(netdev), id.prio, id.handle,
> + ovs_strerror(err));
> return err;
> }
>
> @@ -2571,11 +2571,36 @@ netdev_tc_get_n_flows(struct netdev *netdev, uint64_t
> *n_flows)
> return 0;
> }
>
> +/* This macro is for use by one-time initialization functions, where we have
> + * one shot per thread/process to perform a pertinent initialization task
> that
> + * may return a temporary error (EAGAIN).
> + *
> + * With min/max values of 1/64 we would retry 7 times, spending at the
> + * most 127 * 1E6 nsec (0.127s) sleeping.
> + */
> +#define NETDEV_OFFLOAD_TC_BACKOFF_MIN 1
> +#define NETDEV_OFFLOAD_TC_BACKOFF_MAX 64
> +#define NETDEV_OFFLOAD_TC_RETRY_WITH_BACKOFF(BACKOFF, ERROR, CONDITION,
> \
> + FUNCTION, ...)
> \
> + BACKOFF = NETDEV_OFFLOAD_TC_BACKOFF_MIN;
> \
> + do {
> \
> + ERROR = (FUNCTION)(__VA_ARGS__);
> \
> + if (CONDITION) {
> \
> + xnanosleep(BACKOFF * 1E6);
> \
> + if (BACKOFF < NETDEV_OFFLOAD_TC_BACKOFF_MAX) {
> \
> + BACKOFF <<= 1;
> \
> + } else {
> \
> + break;
> \
> + }
> \
> + }
> \
> + } while (CONDITION);
> +
> static void
> probe_multi_mask_per_prio(int ifindex)
> {
> struct tc_flower flower;
> struct tcf_id id1, id2;
> + uint64_t backoff;
> int block_id = 0;
> int prio = 1;
> int error;
> @@ -2594,8 +2619,13 @@ probe_multi_mask_per_prio(int ifindex)
> memset(&flower.mask.dst_mac, 0xff, sizeof flower.mask.dst_mac);
>
> id1 = tc_make_tcf_id(ifindex, block_id, prio, TC_INGRESS);
> - error = tc_replace_flower(&id1, &flower);
> + NETDEV_OFFLOAD_TC_RETRY_WITH_BACKOFF(backoff, error, error == EAGAIN,
> + tc_replace_flower, &id1, &flower);
> if (error) {
> + if (error == EAGAIN) {
> + VLOG_WARN("probe tc: unable to probe for multiple mask "
> + "support: %s", ovs_strerror(error));
> + }
> goto out;
> }
>
> @@ -2603,10 +2633,15 @@ probe_multi_mask_per_prio(int ifindex)
> memset(&flower.mask.src_mac, 0xff, sizeof flower.mask.src_mac);
>
> id2 = tc_make_tcf_id(ifindex, block_id, prio, TC_INGRESS);
> - error = tc_replace_flower(&id2, &flower);
> + NETDEV_OFFLOAD_TC_RETRY_WITH_BACKOFF(backoff, error, error == EAGAIN,
> + tc_replace_flower, &id2, &flower);
> tc_del_flower_filter(&id1);
>
> if (error) {
> + if (error == EAGAIN) {
> + VLOG_WARN("probe tc: unable to probe for multiple mask "
> + "support: %s", ovs_strerror(error));
> + }
> goto out;
> }
>
> @@ -2642,6 +2677,7 @@ probe_ct_state_support(int ifindex)
> {
> struct tc_flower flower;
> uint16_t ct_state;
> + uint64_t backoff;
> struct tcf_id id;
> int error;
>
> @@ -2657,8 +2693,13 @@ probe_ct_state_support(int ifindex)
> goto out;
> }
>
> - error = tc_get_flower(&id, &flower);
> + NETDEV_OFFLOAD_TC_RETRY_WITH_BACKOFF(backoff, error, error == EAGAIN,
> + tc_get_flower, &id, &flower);
> if (error || flower.mask.ct_state != ct_state) {
> + if (error == EAGAIN) {
> + VLOG_WARN("probe tc: unable to probe ct_state support: %s",
> + ovs_strerror(error));
> + }
> goto out_del;
> }
>
> @@ -2670,10 +2711,16 @@ probe_ct_state_support(int ifindex)
>
> /* Test for reject, ct_state >= MAX */
> ct_state = ~0;
> - error = probe_insert_ct_state_rule(ifindex, ct_state, &id);
> + NETDEV_OFFLOAD_TC_RETRY_WITH_BACKOFF(backoff, error, error == EAGAIN,
> + probe_insert_ct_state_rule, ifindex,
> + ct_state, &id);
> if (!error) {
> /* No reject, can't continue probing other flags */
> goto out_del;
> + } else if (error == EAGAIN) {
> + VLOG_WARN("probe tc: unable to probe ct_state support: %s",
> + ovs_strerror(error));
> + goto out_del;
> }
>
> tc_del_flower_filter(&id);
> @@ -2682,8 +2729,14 @@ probe_ct_state_support(int ifindex)
> memset(&flower, 0, sizeof flower);
> ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
> TCA_FLOWER_KEY_CT_FLAGS_INVALID;
> - error = probe_insert_ct_state_rule(ifindex, ct_state, &id);
> + NETDEV_OFFLOAD_TC_RETRY_WITH_BACKOFF(backoff, error, error == EAGAIN,
> + probe_insert_ct_state_rule, ifindex,
> + ct_state, &id);
> if (error) {
> + if (error == EAGAIN) {
> + VLOG_WARN("probe tc: unable to probe ct_state support: %s",
> + ovs_strerror(error));
> + }
> goto out;
> }
>
> @@ -2695,8 +2748,14 @@ probe_ct_state_support(int ifindex)
> ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
> TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED |
> TCA_FLOWER_KEY_CT_FLAGS_REPLY;
> - error = probe_insert_ct_state_rule(ifindex, ct_state, &id);
> + NETDEV_OFFLOAD_TC_RETRY_WITH_BACKOFF(backoff, error, error == EAGAIN,
> + probe_insert_ct_state_rule, ifindex,
> + ct_state, &id);
> if (error) {
> + if (error == EAGAIN) {
> + VLOG_WARN("probe tc: unable to probe ct_state support: %s",
> + ovs_strerror(error));
> + }
> goto out;
> }
>
> @@ -2714,6 +2773,7 @@ probe_tc_block_support(int ifindex)
> {
> struct tc_flower flower;
> uint32_t block_id = 1;
> + uint64_t backoff;
> struct tcf_id id;
> int prio = 0;
> int error;
> @@ -2732,13 +2792,17 @@ probe_tc_block_support(int ifindex)
> memset(&flower.mask.dst_mac, 0xff, sizeof flower.mask.dst_mac);
>
> id = tc_make_tcf_id(ifindex, block_id, prio, TC_INGRESS);
> - error = tc_replace_flower(&id, &flower);
> + NETDEV_OFFLOAD_TC_RETRY_WITH_BACKOFF(backoff, error, error == EAGAIN,
> + tc_replace_flower, &id, &flower);
>
> tc_add_del_qdisc(ifindex, false, block_id, TC_INGRESS);
>
> if (!error) {
> block_support = true;
> VLOG_INFO("probe tc: block offload is supported.");
> + } else if (error == EAGAIN) {
> + VLOG_WARN("probe tc: unable to probe block offload support: %s",
> + ovs_strerror(error));
> }
> }
>
> diff --git a/lib/netlink-socket.c b/lib/netlink-socket.c
> index 80da20d9f..dea060fc3 100644
> --- a/lib/netlink-socket.c
> +++ b/lib/netlink-socket.c
> @@ -1798,6 +1798,11 @@ nl_pool_release(struct nl_sock *sock)
> *
> * 2. Resending the request causes it to be re-executed, so the request
> * needs to be idempotent.
> + *
> + * 3. In the event that the kernel is too busy to handle the request to
> + * receive the response (i.e. EAGAIN), this function will still
> return
> + * 0. The caller can check for this condition by checking for a zero
> + * size of the 'replyp' ofpbuf buffer.
> */
> int
> nl_transact(int protocol, const struct ofpbuf *request,
> diff --git a/lib/tc.c b/lib/tc.c
> index 5c32c6f97..7df6a10c5 100644
> --- a/lib/tc.c
> +++ b/lib/tc.c
> @@ -237,11 +237,29 @@ static void request_from_tcf_id(struct tcf_id *id,
> uint16_t eth_type,
> }
> }
>
> +/* The `tc_transact` function is a wrapper around `nl_transact` with the
> + * addition of:
> + *
> + * 1. the 'request' ofpbuf buffer is freed after `nl_transact` returns,
> + * regardless of success or failure.
> + *
> + * 2. When a 'replyp' pointer is provided; in the event of the kernel
> + * being too busy to process the request for the response, a positive
> + * error return will be provided with the value of EAGAIN.
> + *
> + * Please acquaint yourself with the documentation of the `nl_transact`
> + * function in the netlink-sock module before making use of this function.
> + */
> int
> tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
> {
> int error = nl_transact(NETLINK_ROUTE, request, replyp);
> ofpbuf_uninit(request);
> +
> + if (!error && replyp && !(*replyp)->size) {
> + return EAGAIN;
> + }
> +
> return error;
> }
>
looks fine to me. thanks.
Acked-by: Roi Dayan <[email protected]>
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev