On Wed, Mar 2, 2022 at 10:50 AM Adrian Moreno <amore...@redhat.com> wrote: > > Thanks Mike, > > On 3/1/22 20:14, Mike Pattrick wrote: > > Add additional logging for debug and info level with a focus on code > > related to bond members coming up, go down, and changing. > > > > Several existing log messages were modified to handle sub 1kB log > > messages with more grace. Instead of reporting 0kB of traffic load > > shifting from one member to another, we now suppress these messages at > > the INFO level and display exact byte count at the debug level. > > > > Signed-off-by: Mike Pattrick <m...@redhat.com> > > --- > > lib/lacp.c | 15 +++++++++++ > > ofproto/bond.c | 72 ++++++++++++++++++++++++++++++++++++++++---------- > > 2 files changed, 73 insertions(+), 14 deletions(-) > > > > diff --git a/lib/lacp.c b/lib/lacp.c > > index 89d711225..fa01cbbfe 100644 > > --- a/lib/lacp.c > > +++ b/lib/lacp.c > > @@ -387,6 +387,12 @@ lacp_process_packet(struct lacp *lacp, const void > > *member_, > > goto out; > > } > > > > + if (member->status == LACP_DEFAULTED) { > > + VLOG_INFO("%s: defaulted member %s reestablished connection with > > LACP partner.", > > + lacp->name, > > + member->name); > > Nit: I think ovs coding conventions suggest you need an extra space in the > above > two lines. > > > > + } > > + > > member->status = LACP_CURRENT; > > tx_rate = lacp->fast ? LACP_FAST_TIME_TX : LACP_SLOW_TIME_TX; > > timer_set_duration(&member->rx, LACP_RX_MULTIPLIER * tx_rate); > > @@ -465,9 +471,15 @@ lacp_member_register(struct lacp *lacp, void *member_, > > if (!lacp->key_member) { > > lacp->key_member = member; > > } > > + > > + VLOG_DBG("LACP member %s added.", s->name); > > } > > > > if (!member->name || strcmp(s->name, member->name)) { > > + if (member->name) { > > + VLOG_DBG("LACP member %s renamed from %s.", s->name, > > member->name); > > + } > > + > > free(member->name); > > member->name = xstrdup(s->name); > > } > > @@ -531,6 +543,9 @@ lacp_member_carrier_changed(const struct lacp *lacp, > > const void *member_, > > member->count_carrier_changed++; > > } > > > > + VLOG_DBG("LACP member %s changed state to %s.", > > + member->name, carrier_up ? "up" : "down"); > > + > > out: > > lacp_unlock(); > > } > > diff --git a/ofproto/bond.c b/ofproto/bond.c > > index cdfdf0b9d..55e335bfa 100644 > > --- a/ofproto/bond.c > > +++ b/ofproto/bond.c > > @@ -855,6 +855,8 @@ bond_check_admissibility(struct bond *bond, const void > > *member_, > > ovs_rwlock_rdlock(&rwlock); > > member = bond_member_lookup(bond, member_); > > if (!member) { > > + VLOG_DBG_RL(&rl, "bond %s: lookup for member on bond failed", > > + bond->name); > > goto out; > > } > > > > @@ -1127,8 +1129,13 @@ log_bals(struct bond *bond, const struct ovs_list > > *bals) > > if (ds.length) { > > ds_put_char(&ds, ','); > > } > > - ds_put_format(&ds, " %s %"PRIu64"kB", > > - member->name, member->tx_bytes / 1024); > > + if (member->tx_bytes > 1024) { > > + ds_put_format(&ds, " %s %"PRIu64"kB", > > + member->name, member->tx_bytes / 1024); > > + } else { > > + ds_put_format(&ds, " %s %"PRIu64"B", > > + member->name, member->tx_bytes); > > + } > > > > if (!member->enabled) { > > ds_put_cstr(&ds, " (disabled)"); > > @@ -1141,13 +1148,20 @@ log_bals(struct bond *bond, const struct ovs_list > > *bals) > > if (&e->list_node != > > ovs_list_front(&member->entries)) { > > ds_put_cstr(&ds, " + "); > > } > > - ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB", > > - e - bond->hash, e->tx_bytes / 1024); > > + if (e->tx_bytes > 1024) { > > + ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"kB", > > + e - bond->hash, e->tx_bytes / 1024); > > + } else { > > + ds_put_format(&ds, "h%"PRIdPTR": %"PRIu64"B", > > + e - bond->hash, e->tx_bytes); > > + } > > } > > ds_put_cstr(&ds, ")"); > > } > > } > > - VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds)); > > + if (ds.length) { > > + VLOG_DBG("bond %s:%s", bond->name, ds_cstr(&ds)); > > + } > > ds_destroy(&ds); > > } > > } > > @@ -1161,13 +1175,23 @@ bond_shift_load(struct bond_entry *hash, struct > > bond_member *to) > > struct bond *bond = from->bond; > > uint64_t delta = hash->tx_bytes; > > > > - VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %"PRIdPTR") " > > - "from %s to %s (now carrying %"PRIu64"kB and " > > - "%"PRIu64"kB load, respectively)", > > - bond->name, delta / 1024, hash - bond->hash, > > - from->name, to->name, > > - (from->tx_bytes - delta) / 1024, > > - (to->tx_bytes + delta) / 1024); > > + if (delta > 1024) { > > + VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash > > %"PRIdPTR") " > > + "from %s to %s (now carrying %"PRIu64"kB and " > > + "%"PRIu64"kB load, respectively)", > > + bond->name, delta / 1024, hash - bond->hash, > > + from->name, to->name, > > + (from->tx_bytes - delta) / 1024, > > + (to->tx_bytes + delta) / 1024); > > + } else { > > + VLOG_DBG("bond %s: shift %"PRIu64"B of load (with hash %"PRIdPTR") > > " > > + "from %s to %s (now carrying %"PRIu64"kB and " > > + "%"PRIu64"kB load, respectively)", > > + bond->name, delta, hash - bond->hash, > > + from->name, to->name, > > + (from->tx_bytes - delta) / 1024, > > + (to->tx_bytes + delta) / 1024); > > + } > > > > Both versions should have the same log level. Original was INFO but I'm > wondering if it's too verbose for an INFO. IIUC, this could be printed 256 > times > at a single rebalancing round. > > Should we consider summarized version for INFO level? > > Maybe one that sums up a migration, something like: "shifted N Kbps from > member > X (which now has Tx Kbps) to member Y (which now has Ty Kbps)". That should > only > be printed (N_Members -1) times at most on each rebalancing round. > Or one that sums up the entire rebalancing round and prints the new tx_bytes > of > each member, only printed once per rebalancing round.
Yes, I think that aggregating the messages could really improve this log message. In a production environment with stochastic network traffic this could really really produce a lot of spurious messages. I'll work on a new version. > > > > /* Shift load away from 'from' to 'to'. */ > > from->tx_bytes -= delta; > > @@ -1345,6 +1369,12 @@ bond_rebalance(struct bond *bond) > > insert_bal(&bals, member); > > } > > } > > + if (ovs_list_is_empty(&bals)) { > > + VLOG_INFO("Bond %s cannot be balanced because no members are > > enabled.", > > + bond->name); > > + goto done; > > + } > > + > > If both members get disabled (or before they get added to the bond), this > could > be printed in INFO level every second. Not sure if that's desirable > considering > we would already log: > > VLOG_INFO_RL(&rl, "bond %s: all members disabled", bond->name); > > in "bond_choose_active_member()". What do you think? > > > > log_bals(bond, &bals); > > > > /* Shift load from the most-loaded members to the least-loaded > > members. */ > > @@ -1565,6 +1595,9 @@ bond_print_details(struct ds *ds, const struct bond > > *bond) > > if (be_tx_k) { > > ds_put_format(ds, " hash %d: %"PRIu64" kB load\n", > > hash, be_tx_k); > > + } else if (be->tx_bytes) { > > + ds_put_format(ds, " hash %d: %"PRIu64" B load\n", > > + hash, be->tx_bytes); > > } > > > > /* XXX How can we list the MACs assigned to hashes of SLB > > bonds? */ > > @@ -1959,7 +1992,7 @@ choose_output_member(const struct bond *bond, const > > struct flow *flow, > > * unsuccussful. If lacp_fallback_ab is enabled use active- > > * backup mode else drop all traffic. */ > > if (!bond->lacp_fallback_ab) { > > - return NULL; > > + goto error; > > } > > balance = BM_AB; > > } > > @@ -1971,7 +2004,7 @@ choose_output_member(const struct bond *bond, const > > struct flow *flow, > > case BM_TCP: > > if (bond->lacp_status != LACP_NEGOTIATED) { > > /* Must have LACP negotiations for TCP balanced bonds. */ > > - return NULL; > > + goto error; > > } > > if (wc) { > > flow_mask_hash_fields(flow, wc, > > NX_HASH_FIELDS_SYMMETRIC_L3L4_UDP); > > @@ -1990,6 +2023,17 @@ choose_output_member(const struct bond *bond, const > > struct flow *flow, > > default: > > OVS_NOT_REACHED(); > > } > > +error: > > + if (VLOG_IS_DBG_ENABLED()) { > > + struct ds ds = DS_EMPTY_INITIALIZER; > > + flow_format(&ds, flow, NULL); > > + VLOG_DBG("Cannot choose output member on bond %s for flow %s with > > LACP status %s", > > + bond->name, > > + ds_cstr(&ds), > > + lacp_status_description(bond->lacp_status)); > > + ds_destroy(&ds); > > + } > > + return NULL; > > Use rate limiting here? > > > } > > > > static struct bond_member * > > -- > Adrián Moreno > -M _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev