When az1 is paused, there is a small time window during which no az is
active and owns the lock (the time for ovn-ic-sb to receive the unlock
from az1 and notify az2).
Any ovn-ic-nb change received during this time, which would have resulted
in ovn-ic-sb changes, would be lost (from a ovn-ic-sb perspective), as
az1 is paused and az2 is not (yet) the leader.
While at it, properly log when ISB is really locked: previous log was
confusing as logging when acquiring the lock (which could fail/takes some
time).
This issue caused the following test to fail sporadically:
"ovn-ic - IC-SB lock acquisition and transit switch creation"
Fixes: 052a298bb90e ("ovn-ic: Use dual IC-SB connections to prevent constraint
violations.")
Signed-off-by: Xavier Simonart <[email protected]>
---
ic/ovn-ic.c | 16 ++++++++++++++--
ic/ovn-ic.h | 3 ++-
tests/ovn-ic.at | 20 ++++++++++++++++++++
3 files changed, 36 insertions(+), 3 deletions(-)
diff --git a/ic/ovn-ic.c b/ic/ovn-ic.c
index b5c961d2d..b197ceeab 100644
--- a/ic/ovn-ic.c
+++ b/ic/ovn-ic.c
@@ -3809,6 +3809,7 @@ main(int argc, char *argv[])
exiting = false;
state.had_lock = false;
+ state.had_isb_lock = false;
state.paused = false;
while (!exiting) {
@@ -3844,8 +3845,7 @@ main(int argc, char *argv[])
* Ensure that only a single ovn-ic has the permission to
* write to IC-SB.
*/
- VLOG_INFO("OVN ISB lock acquired. "
- "This ovn-ic instance is now active.");
+ VLOG_INFO("Acquiring OVN ISB lock.");
ovsdb_idl_set_lock(ovnisb_idl_loop.idl, "ovn_ic_sb");
}
@@ -3969,6 +3969,17 @@ main(int argc, char *argv[])
state.had_lock = false;
}
+ if (!state.had_isb_lock && ovsdb_idl_has_lock(ctx.ovnisb_idl)) {
+ VLOG_INFO("OVN ISB lock acquired. "
+ "This ovn-ic instance is now active.");
+ state.had_isb_lock = true;
+ inc_proc_ic_force_recompute_immediate();
+ } else if (state.had_isb_lock &&
+ !ovsdb_idl_has_lock(ctx.ovnisb_idl)) {
+ VLOG_INFO("OVN ISB lock lost.");
+ state.had_isb_lock = false;
+ }
+
if (ovsdb_idl_has_lock(ctx.ovnsb_idl) &&
ovsdb_idl_has_ever_connected(ctx.ovnnb_idl) &&
ovsdb_idl_has_ever_connected(ctx.ovnsb_idl) &&
@@ -4055,6 +4066,7 @@ main(int argc, char *argv[])
VLOG_INFO("This ovn-ic instance is now paused. "
"Removing IC-SB lock.");
ovsdb_idl_set_lock(ovnisb_idl_loop.idl, NULL);
+ state.had_isb_lock = false;
}
if (ovsdb_idl_has_lock(ovnsb_idl_loop.idl) ||
diff --git a/ic/ovn-ic.h b/ic/ovn-ic.h
index 9f52bb0f9..1d3f4e4de 100644
--- a/ic/ovn-ic.h
+++ b/ic/ovn-ic.h
@@ -53,7 +53,8 @@ struct ic_context {
};
struct ic_state {
- bool had_lock;
+ bool had_lock; /* Lock to local AZ SB */
+ bool had_isb_lock; /* Lock to ISB */
bool paused;
};
diff --git a/tests/ovn-ic.at b/tests/ovn-ic.at
index 007ba0eb8..0826632e9 100644
--- a/tests/ovn-ic.at
+++ b/tests/ovn-ic.at
@@ -4871,6 +4871,26 @@ check_column "ts1 ts2 ts3" ic-sb:Datapath_Binding
transit_switch
as az2 check ovn-appctl -t ic/ovn-ic resume
+# Now do the same on a "slow" ovn-ic-sb
+OVS_WAIT_UNTIL([test "x$(as az2 ovn-appctl -t ic/ovn-ic status)" = "xStatus:
active"])
+
+AS_BOX([$(date +%H:%M:%S.%03N) Pausing ovn-ic-sb])
+AT_CHECK([kill -STOP $(cat ovn-ic-sb/ovsdb-server.pid)])
+
+as az1 check ovn-appctl -t ic/ovn-ic pause
+OVS_WAIT_UNTIL([test "x$(as az1 ovn-appctl -t ic/ovn-ic status)" = "xStatus:
paused"])
+
+check ovn-ic-nbctl ts-add ts4
+
+AS_BOX([$(date +%H:%M:%S.%03N) Resuming ovn-ic-sb])
+AT_CHECK([kill -CONT $(cat ovn-ic-sb/ovsdb-server.pid)])
+
+wait_row_count ic-sb:Datapath_Binding 1 transit_switch=ts4
+check_row_count ic-sb:Datapath_Binding 4
+check_column "ts1 ts2 ts3 ts4" ic-sb:Datapath_Binding transit_switch
+
+as az1 check ovn-appctl -t ic/ovn-ic resume
+
for i in 1 2; do
az=az$i
ovn_as $az
--
2.47.1
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev