On Wed, Mar 2, 2022 at 10:50 AM Adrian Moreno <[email protected]> wrote:
>
> Thanks Mike,
>
> On 3/1/22 20:14, Mike Pattrick wrote:
> > Add additional logging for debug and info level with a focus on code
> > related to bond members coming up, go down, and changing.
> >
> > Several existing log messages were modified to handle sub 1kB log
> > messages with more grace. Instead of reporting 0kB of traffic load
> > shifting from one member to another, we now suppress these messages at
> > the INFO level and display exact byte count at the debug level.
> >
> > Signed-off-by: Mike Pattrick <[email protected]>
> > ---
> > lib/lacp.c | 15 +++++++++++
> > ofproto/bond.c | 72 ++++++++++++++++++++++++++++++++++++++++----------
> > 2 files changed, 73 insertions(+), 14 deletions(-)
> >
> > diff --git a/lib/lacp.c b/lib/lacp.c
> > index 89d711225..fa01cbbfe 100644
> > --- a/lib/lacp.c
> > +++ b/lib/lacp.c
> > @@ -387,6 +387,12 @@ lacp_process_packet(struct lacp *lacp, const void
> > *member_,
> > goto out;
> > }
> >
> > + if (member->status == LACP_DEFAULTED) {
> > + VLOG_INFO("%s: defaulted member %s reestablished connection with
> > LACP partner.",
> > + lacp->name,
> > + member->name);
>
> Nit: I think ovs coding conventions suggest you need an extra space in the
> above
> two lines.
>
>
> > + }
> > +
> > member->status = LACP_CURRENT;
> > tx_rate = lacp->fast ? LACP_FAST_TIME_TX : LACP_SLOW_TIME_TX;
> > timer_set_duration(&member->rx, LACP_RX_MULTIPLIER * tx_rate);
> > @@ -465,9 +471,15 @@ lacp_member_register(struct lacp *lacp, void *member_,
> > if (!lacp->key_member) {
> > lacp->key_member = member;
> > }
> > +
> > + VLOG_DBG("LACP member %s added.", s->name);
> > }
> >
> > if (!member->name || strcmp(s->name, member->name)) {
> > + if (member->name) {
> > + VLOG_DBG("LACP member %s renamed from %s.", s->name,
> > member->name);
> > + }
> > +
> > free(member->name);
> > member->name = xstrdup(s->name);
> > }
> > @@ -531,6 +543,9 @@ lacp_member_carrier_changed(const struct lacp *lacp,
> > const void *member_,
> > member->count_carrier_changed++;
> > }
> >
> > + VLOG_DBG("LACP member %s changed state to %s.",
> > + member->name, carrier_up ? "up" : "down");
> > +
> > out:
> > lacp_unlock();
> > }
> > diff --git a/ofproto/bond.c b/ofproto/bond.c
> > index cdfdf0b9d..55e335bfa 100644
> > --- a/ofproto/bond.c
> > +++ b/ofproto/bond.c
> > @@ -855,6 +855,8 @@ bond_check_admissibility(struct bond *bond, const void
> > *member_,
> > ovs_rwlock_rdlock(&rwlock);
> > member = bond_member_lookup(bond, member_);
> > if (!member) {
> > + VLOG_DBG_RL(&rl, "bond %s: lookup for member on bond failed",
> > + bond->name);
> > goto out;
> > }
> >
> > @@ -1127,8 +1129,13 @@ log_bals(struct bond *bond, const struct ovs_list
> > *bals)
> > if (ds.length) {
> > ds_put_char(&ds, ',');
> > }
> > - ds_put_format(&ds, " %s %"PRIu64"kB",
> > - member->name, member->tx_bytes / 1024);
> > + if (member->tx_bytes > 1024) {
> > + ds_put_format(&ds, " %s %"PRIu64"kB",
> > + member->name, member->tx_bytes / 1024);
> > + } else {
> > + ds_put_format(&ds, " %s %"PRIu64"B",
> > + member->name, member->tx_bytes);
> > + }
> >
> > if (!member->enabled) {
> > ds_put_cstr(&ds, " (disabled)");
> > @@ -1141,13 +1148,20 @@ log_bals(struct bond *bond, const struct ovs_list
> > *bals)
> > if (&e->list_node !=
> > ovs_list_front(&member->entries)) {
> > ds_put_cstr(&ds, " + ");
> > }
> > - ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
> > - e - bond->hash, e->tx_bytes / 1024);
> > + if (e->tx_bytes > 1024) {
> > + ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB",
> > + e - bond->hash, e->tx_bytes / 1024);
> > + } else {
> > + ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"B",
> > + e - bond->hash, e->tx_bytes);
> > + }
> > }
> > ds_put_cstr(&ds, ")");
> > }
> > }
> > - VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
> > + if (ds.length) {
> > + VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds));
> > + }
> > ds_destroy(&ds);
> > }
> > }
> > @@ -1161,13 +1175,23 @@ bond_shift_load(struct bond_entry *hash, struct
> > bond_member *to)
> > struct bond *bond = from->bond;
> > uint64_t delta = hash->tx_bytes;
> >
> > - VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") "
> > - "from %s to %s (now carrying %"PRIu64"kB and "
> > - "%"PRIu64"kB load, respectively)",
> > - bond->name, delta / 1024, hash - bond->hash,
> > - from->name, to->name,
> > - (from->tx_bytes - delta) / 1024,
> > - (to->tx_bytes + delta) / 1024);
> > + if (delta > 1024) {
> > + VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash
> > %"PRIdPTR") "
> > + "from %s to %s (now carrying %"PRIu64"kB and "
> > + "%"PRIu64"kB load, respectively)",
> > + bond->name, delta / 1024, hash - bond->hash,
> > + from->name, to->name,
> > + (from->tx_bytes - delta) / 1024,
> > + (to->tx_bytes + delta) / 1024);
> > + } else {
> > + VLOG_DBG("bond %s: shift %"PRIu64"B of load (with hash %"PRIdPTR")
> > "
> > + "from %s to %s (now carrying %"PRIu64"kB and "
> > + "%"PRIu64"kB load, respectively)",
> > + bond->name, delta, hash - bond->hash,
> > + from->name, to->name,
> > + (from->tx_bytes - delta) / 1024,
> > + (to->tx_bytes + delta) / 1024);
> > + }
> >
>
> Both versions should have the same log level. Original was INFO but I'm
> wondering if it's too verbose for an INFO. IIUC, this could be printed 256
> times
> at a single rebalancing round.
>
> Should we consider summarized version for INFO level?
>
> Maybe one that sums up a migration, something like: "shifted N Kbps from
> member
> X (which now has Tx Kbps) to member Y (which now has Ty Kbps)". That should
> only
> be printed (N_Members -1) times at most on each rebalancing round.
> Or one that sums up the entire rebalancing round and prints the new tx_bytes
> of
> each member, only printed once per rebalancing round.
Yes, I think that aggregating the messages could really improve this
log message. In a production environment with stochastic network
traffic this could really really produce a lot of spurious messages.
I'll work on a new version.
>
>
> > /* Shift load away from 'from' to 'to'. */
> > from->tx_bytes -= delta;
> > @@ -1345,6 +1369,12 @@ bond_rebalance(struct bond *bond)
> > insert_bal(&bals, member);
> > }
> > }
> > + if (ovs_list_is_empty(&bals)) {
> > + VLOG_INFO("Bond %s cannot be balanced because no members are
> > enabled.",
> > + bond->name);
> > + goto done;
> > + }
> > +
>
> If both members get disabled (or before they get added to the bond), this
> could
> be printed in INFO level every second. Not sure if that's desirable
> considering
> we would already log:
>
> VLOG_INFO_RL(&rl, "bond %s: all members disabled", bond->name);
>
> in "bond_choose_active_member()". What do you think?
>
>
> > log_bals(bond, &bals);
> >
> > /* Shift load from the most-loaded members to the least-loaded
> > members. */
> > @@ -1565,6 +1595,9 @@ bond_print_details(struct ds *ds, const struct bond
> > *bond)
> > if (be_tx_k) {
> > ds_put_format(ds, " hash %d: %"PRIu64" kB load\n",
> > hash, be_tx_k);
> > + } else if (be->tx_bytes) {
> > + ds_put_format(ds, " hash %d: %"PRIu64" B load\n",
> > + hash, be->tx_bytes);
> > }
> >
> > /* XXX How can we list the MACs assigned to hashes of SLB
> > bonds? */
> > @@ -1959,7 +1992,7 @@ choose_output_member(const struct bond *bond, const
> > struct flow *flow,
> > * unsuccussful. If lacp_fallback_ab is enabled use active-
> > * backup mode else drop all traffic. */
> > if (!bond->lacp_fallback_ab) {
> > - return NULL;
> > + goto error;
> > }
> > balance = BM_AB;
> > }
> > @@ -1971,7 +2004,7 @@ choose_output_member(const struct bond *bond, const
> > struct flow *flow,
> > case BM_TCP:
> > if (bond->lacp_status != LACP_NEGOTIATED) {
> > /* Must have LACP negotiations for TCP balanced bonds. */
> > - return NULL;
> > + goto error;
> > }
> > if (wc) {
> > flow_mask_hash_fields(flow, wc,
> > NX_HASH_FIELDS_SYMMETRIC_L3L4_UDP);
> > @@ -1990,6 +2023,17 @@ choose_output_member(const struct bond *bond, const
> > struct flow *flow,
> > default:
> > OVS_NOT_REACHED();
> > }
> > +error:
> > + if (VLOG_IS_DBG_ENABLED()) {
> > + struct ds ds = DS_EMPTY_INITIALIZER;
> > + flow_format(&ds, flow, NULL);
> > + VLOG_DBG("Cannot choose output member on bond %s for flow %s with
> > LACP status %s",
> > + bond->name,
> > + ds_cstr(&ds),
> > + lacp_status_description(bond->lacp_status));
> > + ds_destroy(&ds);
> > + }
> > + return NULL;
>
> Use rate limiting here?
>
> > }
> >
> > static struct bond_member *
>
> --
> Adrián Moreno
>
-M
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev