From: Loke Berne <[email protected]>

Large scale OVN deployments commonly disable the per-chassis nb_cfg
write-back mechanism by setting options:enable_chassis_nb_cfg_update
to false.  With thousands of hypervisors each writing their nb_cfg
completion back to Chassis_Private on every generation, the resulting
write amplification can overload the southbound OVSDB cluster.
Disabling write-back eliminates this pressure but also removes the
only existing signal for measuring how long a northbound change takes
to reach each hypervisor.

OVN_Northbound already records nb_cfg_timestamp in NB_Global when
ovn-northd advances nb_cfg, but hypervisors connect to the southbound
database only.  This patch adds the same timestamp to SB_Global,
written atomically with each nb_cfg update.  ovn-controller reads
this value and stores it in the local OVS bridge external_ids as
ovn-nb-cfg-sb-ts alongside the existing ovn-nb-cfg-ts (local
completion time).  An external collector such as ovs_exporter can
read both values from the bridge and compute per-chassis propagation
latency histograms without any writes to the southbound database,
keeping measurement overhead independent of fleet size.

Placing the timestamp in SB_Global rather than requiring collectors
to reach the northbound database means it travels transparently
through any relay or VPN between the southbound cluster and the
hypervisor, naturally including that transit in the measurement.

Testing: confirmed in OVN sandbox and a two-container central/HV
setup that nb_cfg_timestamp is written to SB_Global on each nb_cfg
advance, propagated to br-int external_ids as ovn-nb-cfg-sb-ts, and
continues to update correctly when enable_chassis_nb_cfg_update is
set to false.

Signed-off-by: Loke Berne <[email protected]>
Assisted-by: Claude Sonnet 4.6
Submitted-at: https://github.com/ovn-org/ovn/pull/306
Signed-off-by: Numan Siddique <[email protected]>
---
 br-controller/ovn-br-controller.c |  2 +-
 controller/if-status.c            |  2 +-
 controller/ovn-controller.c       | 83 +++++++++++++++++++++++++------
 lib/ofctrl-seqno.c                | 21 +++++++-
 lib/ofctrl-seqno.h                | 11 +++-
 lib/test-ofctrl-seqno.c           |  2 +-
 northd/ovn-northd.c               |  3 ++
 ovn-sb.ovsschema                  |  5 +-
 ovn-sb.xml                        |  9 ++++
 9 files changed, 115 insertions(+), 23 deletions(-)

diff --git a/br-controller/ovn-br-controller.c 
b/br-controller/ovn-br-controller.c
index 93526a2f6d..e20a110513 100644
--- a/br-controller/ovn-br-controller.c
+++ b/br-controller/ovn-br-controller.c
@@ -299,7 +299,7 @@ main(int argc OVS_UNUSED, char *argv[] OVS_UNUSED)
             ofctrl_seqno_update_create(
                 ofctrl_seq_type_br_cfg,
                 get_ovnbr_cfg(ovnbrrec_br_global_table_get(ovnbr_idl_loop.idl),
-                              ovnbr_cond_seqno, ovnbr_expected_cond_seqno));
+                              ovnbr_cond_seqno, ovnbr_expected_cond_seqno), 0);
 
             br_ofctrls_put(ofctrl_seqno_get_req_cfg(),
                            engine_node_changed(&en_lflow_output),
diff --git a/controller/if-status.c b/controller/if-status.c
index 6c6e9b27b1..65e745e250 100644
--- a/controller/if-status.c
+++ b/controller/if-status.c
@@ -712,7 +712,7 @@ if_status_mgr_update(struct if_status_mgr *mgr,
     if (new_ifaces) {
         mgr->iface_seqno++;
         ofctrl_seqno_update_create(mgr->iface_seq_type_pb_cfg,
-                                   mgr->iface_seqno);
+                                   mgr->iface_seqno, 0);
         VLOG_DBG("Seqno requested: %"PRIu32, mgr->iface_seqno);
     }
 }
diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
index ad094a4543..765948442c 100644
--- a/controller/ovn-controller.c
+++ b/controller/ovn-controller.c
@@ -141,6 +141,7 @@ static unixctl_cb_func debug_delay_nb_cfg_report;
 
 #define OVS_NB_CFG_NAME "ovn-nb-cfg"
 #define OVS_NB_CFG_TS_NAME "ovn-nb-cfg-ts"
+#define OVS_NB_CFG_SB_TS_NAME "ovn-nb-cfg-sb-ts"
 #define OVS_STARTUP_TS_NAME "ovn-startup-ts"
 
 struct br_int_remote {
@@ -825,28 +826,62 @@ struct ed_type_ct_zones {
 };
 
 
+/* Returns the current SB_Global.nb_cfg and, if 'ts_out' is non-NULL, also
+ * the matching SB_Global.nb_cfg_timestamp.  The pair is always read from
+ * the same SB_Global snapshot so callers can rely on (nb_cfg, ts) being
+ * consistent.
+ *
+ * 'nb_cfg_timestamp' is the wall-clock time northd wrote nb_cfg to SB.
+ * The delta between that and the local completion time is the per-chassis
+ * end-to-end propagation latency (northd compile + SB write + relay
+ * fan-out + ovn-controller engine + ofctrl barrier ack).
+ *
+ * If a monitor condition change is in flight the cached pair from the
+ * previous call is returned, because updates received between the request
+ * and the cond ack could be from before the SB_Global value we're trying
+ * to read.
+ */
 static uint64_t
 get_nb_cfg(const struct sbrec_sb_global_table *sb_global_table,
-           unsigned int cond_seqno, unsigned int expected_cond_seqno)
+           unsigned int cond_seqno, unsigned int expected_cond_seqno,
+           int64_t *ts_out)
 {
     static uint64_t nb_cfg = 0;
+    static int64_t nb_cfg_ts = 0;
 
-    /* Delay getting nb_cfg if there are monitor condition changes
-     * in flight.  It might be that those changes would instruct the
-     * server to send updates that happened before SB_Global.nb_cfg.
-     */
-    if (cond_seqno != expected_cond_seqno) {
-        return nb_cfg;
+    if (cond_seqno == expected_cond_seqno) {
+        const struct sbrec_sb_global *sb
+          = sbrec_sb_global_table_first(sb_global_table);
+        nb_cfg = sb ? sb->nb_cfg : 0;
+        nb_cfg_ts = sb ? sb->nb_cfg_timestamp : 0;
+    }
+
+    if (ts_out) {
+        *ts_out = nb_cfg_ts;
     }
 
-    const struct sbrec_sb_global *sb
-        = sbrec_sb_global_table_first(sb_global_table);
-    nb_cfg = sb ? sb->nb_cfg : 0;
     return nb_cfg;
 }
 
 /* Propagates the local cfg seqno, 'cur_cfg', to the chassis_private record
  * and to the local OVS DB.
+ *
+ * The br-int external_ids triplet (ovn-nb-cfg, ovn-nb-cfg-ts,
+ * ovn-nb-cfg-sb-ts) is stamped unconditionally, independent of
+ * 'enable_ch_nb_cfg_update'.  The SB chassis_private writeback remains
+ * gated by 'enable_ch_nb_cfg_update' for deployments that want to suppress
+ * the per-bump SB write load.  An external exporter watching br-int can
+ * compute the per-chassis propagation delta as
+ *     (ovn-nb-cfg-ts - ovn-nb-cfg-sb-ts)
+ * regardless of the writeback setting.
+ *
+ * 'ovn-nb-cfg-sb-ts' is the SB_Global.nb_cfg_timestamp that was paired
+ * with this cur_cfg at the moment the barrier was queued (snapshotted via
+ * ofctrl-seqno's req_ts).  This avoids the pitfall of pairing the just-
+ * acked cur_cfg with whatever SB_Global timestamp happens to be current
+ * now -- on a fast-churning fleet SB_Global may already have advanced
+ * past cur_cfg by the time the barrier acks, which would under-report
+ * the delta.
  */
 static void
 store_nb_cfg(struct ovsdb_idl_txn *sb_txn, struct ovsdb_idl_txn *ovs_txn,
@@ -858,6 +893,7 @@ store_nb_cfg(struct ovsdb_idl_txn *sb_txn, struct 
ovsdb_idl_txn *ovs_txn,
     struct ofctrl_acked_seqnos *acked_nb_cfg_seqnos =
         ofctrl_acked_seqnos_get(ofctrl_seq_type_nb_cfg);
     uint64_t cur_cfg = acked_nb_cfg_seqnos->last_acked;
+    uint64_t cur_cfg_sb_ts = acked_nb_cfg_seqnos->last_acked_req_ts;
     int64_t startup_ts = daemon_startup_ts();
 
     if (ovs_txn && br_int
@@ -894,6 +930,13 @@ store_nb_cfg(struct ovsdb_idl_txn *sb_txn, struct 
ovsdb_idl_txn *ovs_txn,
                                                  cur_cfg_str);
         ovsrec_bridge_update_external_ids_setkey(br_int, OVS_NB_CFG_TS_NAME,
                                                  cur_cfg_ts_str);
+        if (cur_cfg_sb_ts) {
+            char *sb_ts_str = xasprintf("%"PRIu64, cur_cfg_sb_ts);
+            ovsrec_bridge_update_external_ids_setkey(br_int,
+                                                     OVS_NB_CFG_SB_TS_NAME,
+                                                     sb_ts_str);
+            free(sb_ts_str);
+        }
         free(cur_cfg_ts_str);
         free(cur_cfg_str);
     }
@@ -8200,12 +8243,20 @@ main(int argc, char *argv[])
                                      chassis, mac_cache_data);
                     }
 
-                    ofctrl_seqno_update_create(
-                        ofctrl_seq_type_nb_cfg,
-                        get_nb_cfg(sbrec_sb_global_table_get(
-                                                       ovnsb_idl_loop.idl),
-                                              ovnsb_cond_seqno,
-                                              ovnsb_expected_cond_seqno));
+                    /* Snapshot (nb_cfg, sb_ts) atomically from SB_Global
+                     * and pair them through the barrier ack so the
+                     * eventual completion can be attributed to the
+                     * timestamp that corresponded to this exact nb_cfg
+                     * generation -- not whatever SB_Global value has
+                     * moved on to by the time the barrier acks. */
+                    int64_t sb_nb_cfg_ts = 0;
+                    uint64_t sb_nb_cfg = get_nb_cfg(
+                        sbrec_sb_global_table_get(ovnsb_idl_loop.idl),
+                        ovnsb_cond_seqno, ovnsb_expected_cond_seqno,
+                        &sb_nb_cfg_ts);
+                    ofctrl_seqno_update_create(ofctrl_seq_type_nb_cfg,
+                                               sb_nb_cfg,
+                                               (uint64_t) sb_nb_cfg_ts);
 
                     struct local_binding_data *binding_data =
                         runtime_data ? &runtime_data->lbinding_data : NULL;
diff --git a/lib/ofctrl-seqno.c b/lib/ofctrl-seqno.c
index 83c17c0e52..7c613dc0a4 100644
--- a/lib/ofctrl-seqno.c
+++ b/lib/ofctrl-seqno.c
@@ -36,6 +36,11 @@ struct ofctrl_seqno_update {
                                 * application.
                                 */
     uint64_t req_cfg;          /* Application specific seqno. */
+    uint64_t req_ts;           /* Opaque per-request timestamp captured by
+                                * the caller at the moment the update was
+                                * queued.  Carried through to the acked
+                                * state so consumers can pair the acked
+                                * seqno with the input that produced it. */
 };
 
 /* List of in flight sequence number updates. */
@@ -51,6 +56,9 @@ struct ofctrl_seqno_state {
                                  * application consumed acked requests.
                                  */
     uint64_t cur_cfg;           /* Last acked application seqno. */
+    uint64_t cur_cfg_req_ts;    /* req_ts that was paired with cur_cfg when
+                                 * the update was queued.  0 if the caller
+                                 * didn't supply a timestamp. */
     uint64_t req_cfg;           /* Last requested application seqno. */
 };
 
@@ -73,6 +81,7 @@ ofctrl_acked_seqnos_get(size_t seqno_type)
     struct ofctrl_acked_seqnos *acked_seqnos = xmalloc(sizeof *acked_seqnos);
     acked_seqnos->acked = vector_clone(&state->acked_cfgs);
     acked_seqnos->last_acked = state->cur_cfg;
+    acked_seqnos->last_acked_req_ts = state->cur_cfg_req_ts;
 
     vector_clear(&state->acked_cfgs);
     if (vector_capacity(&state->acked_cfgs) >= VECTOR_THRESHOLD) {
@@ -140,6 +149,7 @@ ofctrl_seqno_add_type(void)
     struct ofctrl_seqno_state state = (struct ofctrl_seqno_state) {
         .acked_cfgs = VECTOR_EMPTY_INITIALIZER(uint64_t),
         .cur_cfg = 0,
+        .cur_cfg_req_ts = 0,
         .req_cfg = 0,
     };
     vector_push(&ofctrl_seqno_states, &state);
@@ -149,9 +159,16 @@ ofctrl_seqno_add_type(void)
 
 /* Creates a new seqno update request for an application specific
  * 'seqno_type'.
+ *
+ * 'req_ts' is an opaque per-request timestamp captured by the caller (for
+ * example, the SB_Global nb_cfg_timestamp at the moment we read 'new_cfg').
+ * It is carried unchanged to ofctrl_acked_seqnos_get() so consumers can
+ * pair the eventual ack with the input state that produced it.  Callers
+ * that don't need this pairing should pass 0.
  */
 void
-ofctrl_seqno_update_create(size_t seqno_type, uint64_t new_cfg)
+ofctrl_seqno_update_create(size_t seqno_type, uint64_t new_cfg,
+                           uint64_t req_ts)
 {
     struct ofctrl_seqno_state *state = ofctrl_seqno_state_get(seqno_type);
 
@@ -169,6 +186,7 @@ ofctrl_seqno_update_create(size_t seqno_type, uint64_t 
new_cfg)
         .seqno_type = seqno_type,
         .flow_cfg = ofctrl_req_seqno,
         .req_cfg = new_cfg,
+        .req_ts = req_ts,
     };
     vector_push(&ofctrl_seqno_updates, &update);
 }
@@ -190,6 +208,7 @@ ofctrl_seqno_run(uint64_t flow_cfg)
         struct ofctrl_seqno_state *state =
             ofctrl_seqno_state_get(update->seqno_type);
         state->cur_cfg = update->req_cfg;
+        state->cur_cfg_req_ts = update->req_ts;
         vector_push(&state->acked_cfgs, &update->req_cfg);
 
         index++;
diff --git a/lib/ofctrl-seqno.h b/lib/ofctrl-seqno.h
index faa97cc535..66b666fe4d 100644
--- a/lib/ofctrl-seqno.h
+++ b/lib/ofctrl-seqno.h
@@ -23,10 +23,18 @@
 
 /* Collection of acked ofctrl_seqno_update requests and the most recent
  * 'last_acked' value.
+ *
+ * 'last_acked_req_ts' carries the opaque timestamp that was associated with
+ * 'last_acked' at the time the seqno was requested.  Consumers that don't
+ * pass a timestamp at create time will see 0 here.  The timestamp is
+ * preserved across the barrier so that applications can pair the acked
+ * config seqno with the input state that produced it (e.g. SB_Global
+ * nb_cfg_timestamp at the moment we asked OVS to barrier).
  */
 struct ofctrl_acked_seqnos {
     struct vector acked;
     uint64_t last_acked;
+    uint64_t last_acked_req_ts;
 };
 
 struct ofctrl_acked_seqnos *ofctrl_acked_seqnos_get(size_t seqno_type);
@@ -35,7 +43,8 @@ bool ofctrl_acked_seqnos_contains(const struct 
ofctrl_acked_seqnos *seqnos,
                                   uint64_t val);
 
 size_t ofctrl_seqno_add_type(void);
-void ofctrl_seqno_update_create(size_t seqno_type, uint64_t new_cfg);
+void ofctrl_seqno_update_create(size_t seqno_type, uint64_t new_cfg,
+                                uint64_t req_ts);
 void ofctrl_seqno_run(uint64_t flow_cfg);
 uint64_t ofctrl_seqno_get_req_cfg(void);
 void ofctrl_seqno_flush(void);
diff --git a/lib/test-ofctrl-seqno.c b/lib/test-ofctrl-seqno.c
index 7d478c033e..b31a50ecdc 100644
--- a/lib/test-ofctrl-seqno.c
+++ b/lib/test-ofctrl-seqno.c
@@ -123,7 +123,7 @@ test_ofctrl_seqno_ack_seqnos(struct ovs_cmdl_context *ctx)
                                         &app_seqno)) {
                 return;
             }
-            ofctrl_seqno_update_create(i, app_seqno);
+            ofctrl_seqno_update_create(i, app_seqno, 0);
         }
     }
     printf("ofctrl-seqno-req-cfg: %u\n", n_reqs);
diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index c3c198f2f3..0eaef6de44 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -541,6 +541,7 @@ update_sequence_numbers(int64_t loop_start_time,
      * Also set up to update sb_cfg once our southbound transaction commits. */
     if (nb->nb_cfg != sb->nb_cfg) {
         sbrec_sb_global_set_nb_cfg(sb, nb->nb_cfg);
+        sbrec_sb_global_set_nb_cfg_timestamp(sb, loop_start_time);
         nbrec_nb_global_set_nb_cfg_timestamp(nb, loop_start_time);
     }
     sb_loop->next_cfg = nb->nb_cfg;
@@ -944,6 +945,8 @@ main(int argc, char *argv[])
 
     /* Disable alerting for pure write-only columns. */
     ovsdb_idl_omit_alert(ovnsb_idl_loop.idl, &sbrec_sb_global_col_nb_cfg);
+    ovsdb_idl_omit_alert(ovnsb_idl_loop.idl,
+                         &sbrec_sb_global_col_nb_cfg_timestamp);
     ovsdb_idl_omit_alert(ovnsb_idl_loop.idl, &sbrec_address_set_col_name);
     ovsdb_idl_omit_alert(ovnsb_idl_loop.idl, &sbrec_address_set_col_addresses);
     for (size_t i = 0; i < SBREC_LOGICAL_FLOW_N_COLUMNS; i++) {
diff --git a/ovn-sb.ovsschema b/ovn-sb.ovsschema
index d9a91739cc..afe691a0d3 100644
--- a/ovn-sb.ovsschema
+++ b/ovn-sb.ovsschema
@@ -1,11 +1,12 @@
 {
     "name": "OVN_Southbound",
-    "version": "21.8.0",
-    "cksum": "614397313 36713",
+    "version": "21.8.1",
+    "cksum": "3241242866 36779",
     "tables": {
         "SB_Global": {
             "columns": {
                 "nb_cfg": {"type": {"key": "integer"}},
+                "nb_cfg_timestamp": {"type": {"key": "integer"}},
                 "external_ids": {
                     "type": {"key": "string", "value": "string",
                              "min": 0, "max": "unlimited"}},
diff --git a/ovn-sb.xml b/ovn-sb.xml
index e45b63d73f..c1883db2e3 100644
--- a/ovn-sb.xml
+++ b/ovn-sb.xml
@@ -156,6 +156,15 @@
         the southbound database to bring it up to date with these changes, it
         updates this column to the same value.
       </column>
+
+      <column name="nb_cfg_timestamp">
+        The time at which <code>ovn-northd</code> last wrote
+        <ref column="nb_cfg"/> to the southbound database, in milliseconds
+        since the Unix epoch.  Set atomically with each update to
+        <ref column="nb_cfg"/>.  Hypervisors read this value to measure
+        end-to-end propagation latency from northbound commit to local
+        datapath programming completion.
+      </column>
     </group>
 
     <group title="Common Columns">
-- 
2.54.0

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to