Since the topic of clustered database schema conversion issues was first brought up [0], there has been several performance improvements which has greatly improved the situation.
However, we cannot get away from the fact that the process remains unpredictable, when faced with a cluster that has grown to a scale no longer supported by its configuration. The OVSDB Server is mostly single threaded, and for clustered databases the schema conversion is done server side in the main thread. While conversion is ongoing the server will not service critical cluster communication, such as Raft elections. In an ideal world operators would regularly re-assess their deployment configurations with real data to ensure parameters such as the OVSDB Raft election-timer is appropriately configured for all cluster operations, including schema conversions. The reality is that this does not happen, and operators are instead discovering the hard facts of managing a OVSDB cluster at critical points in time such as during database upgrades. To alleviate this situation we introduce time keeping in the ovsdb_convert functions for clustered databases. A timeout is deduced from the configured election timer using a divisor of 4 to compute buffer time. Buffer time is the time required for writing the result of the conversion to the raft log and initiating command to replace data on peers before an election is held for the next term. For conversions initiated by a client (such as ovsdb-client), the process will be aborted and an error will be returned if the schema conversion process overruns the timeout. Conversions initiated from storage will not abort, a warning indicating the election timer being set too low will be logged instead. This will allow a human or automatic operator to take necessary action, such as increasing the election timer, before attempting the schema conversion again. 0: https://mail.openvswitch.org/pipermail/ovs-discuss/2022-December/052140.html Signed-off-by: Frode Nordahl <[email protected]> --- ovsdb/file.c | 78 +++++++++++++++++++++++++++++++++++++++++--- ovsdb/file.h | 2 +- ovsdb/ovsdb-server.c | 2 +- ovsdb/ovsdb-tool.c | 3 +- ovsdb/raft.c | 6 ++++ ovsdb/raft.h | 1 + ovsdb/storage.c | 6 ++++ ovsdb/storage.h | 1 + ovsdb/trigger.c | 3 +- 9 files changed, 93 insertions(+), 9 deletions(-) diff --git a/ovsdb/file.c b/ovsdb/file.c index 778b4004b..ad4418207 100644 --- a/ovsdb/file.c +++ b/ovsdb/file.c @@ -272,11 +272,15 @@ error: static struct ovsdb_error * OVS_WARN_UNUSED_RESULT ovsdb_convert_table(struct ovsdb_txn *txn, const struct ovsdb_table *src_table, - struct ovsdb_table *dst_table) + struct ovsdb_table *dst_table, + long long int *msec_left, + bool timeout_is_error) { const struct ovsdb_column **dst_columns; + size_t n_rows_since_last_tick = 0; struct ovsdb_error *error = NULL; const struct ovsdb_row *src_row; + long long int last_tick = 0; unsigned long *src_equal; struct shash_node *node; size_t n_src_columns; @@ -308,6 +312,28 @@ ovsdb_convert_table(struct ovsdb_txn *txn, } HMAP_FOR_EACH (src_row, hmap_node, &src_table->rows) { + if (msec_left && !n_rows_since_last_tick) { + last_tick = time_msec(); + } + if (msec_left && ++n_rows_since_last_tick == 4000) { + long long int elapsed = time_msec() - last_tick; + *msec_left -= elapsed; + if (*msec_left <= 0 && timeout_is_error) { + error = ovsdb_error("aborted", "stopping conversion to avoid " + "raft election timer to expire " + "before completion."); + goto exit; + } else if (*msec_left <= 0 && !timeout_is_error) { + VLOG_WARN("Wall clock time for conversion of clustered " + "database exceeded Raft election timer, consider " + "increasing the election-timer."); + /* stop time keeping */ + msec_left = NULL; + } + /* in the event processing is so fast that no time has elapsed + * since last update of last_tick, skip next last_tick update. */ + n_rows_since_last_tick = elapsed ? 0 : 1; + } struct ovsdb_row *dst_row = ovsdb_row_create(dst_table); *ovsdb_row_get_uuid_rw(dst_row) = *ovsdb_row_get_uuid(src_row); @@ -351,16 +377,52 @@ exit: /* Copies the data in 'src', converts it into the schema specified in * 'new_schema', and puts it into a newly created, unbacked database, and - * stores a pointer to the new database in '*dstp'. Returns null if - * successful, otherwise an error; on error, stores NULL in '*dstp'. */ + * stores a pointer to the new database in '*dstp'. When 'src' is a clustered + * database and 'timeout_is_error' is set to true, the operation aborts with + * error return in the event of overrunning the election timer, if set to false + * a warning will be logged in the same situation. Returns null if successful, + * otherwise an error; on error, stores NULL in '*dstp'. */ struct ovsdb_error * OVS_WARN_UNUSED_RESULT ovsdb_convert(const struct ovsdb *src, const struct ovsdb_schema *new_schema, - struct ovsdb **dstp) + struct ovsdb **dstp, bool timeout_is_error) { struct ovsdb *dst = ovsdb_create(ovsdb_schema_clone(new_schema), ovsdb_storage_create_unbacked(NULL)); struct ovsdb_txn *txn = ovsdb_txn_create(dst); struct ovsdb_error *error = NULL; + long long int msec_left = 0; + + if (src->storage && + ovsdb_storage_is_clustered(src->storage)) + { + msec_left = ovsdb_storage_get_election_timer(src->storage); + + if (timeout_is_error) { + /* Set aside buffer time, relative to the election timer. + * + * After the conversion is processed we need time to write the + * result of the conversion to the local log and initiate the + * command to replace the data on our peers prior our peers + * starting an election for the next term. + * + * A divisor of 4 will give 4 seconds buffer time for a 16 second + * election timer and 250 ms buffer time for a 1 second election + * timer. + * + * The value required for buffer time would depend on environmental + * factors such as size of database, speed of physical storage and + * so on. If it turns out that this blanket default is + * insufficient we could consider adding a unixctl command to + * configure this value. */ + long long int buffer_time = msec_left / 4; + VLOG_DBG("schema conversion for clustered database: " + "timeout_is_error=%d election_timer=%lld " + "buffer_time=%lld", + timeout_is_error, msec_left, buffer_time); + msec_left -= buffer_time; + } + } + struct shash_node *node; SHASH_FOR_EACH (node, &src->tables) { @@ -372,7 +434,8 @@ ovsdb_convert(const struct ovsdb *src, const struct ovsdb_schema *new_schema, continue; } - error = ovsdb_convert_table(txn, src_table, dst_table); + error = ovsdb_convert_table(txn, src_table, dst_table, &msec_left, + timeout_is_error); if (error) { goto error; } @@ -384,6 +447,11 @@ ovsdb_convert(const struct ovsdb *src, const struct ovsdb_schema *new_schema, goto error; } + if (msec_left) { + VLOG_DBG("schema conversion for clustered database: completed with " + "%lld msec left", msec_left); + } + *dstp = dst; return NULL; diff --git a/ovsdb/file.h b/ovsdb/file.h index ae90d4fe1..624633ec7 100644 --- a/ovsdb/file.h +++ b/ovsdb/file.h @@ -41,7 +41,7 @@ struct ovsdb *ovsdb_file_read_as_schema(const char *filename, struct ovsdb_error *ovsdb_convert(const struct ovsdb *src, const struct ovsdb_schema *new_schema, - struct ovsdb **dstp) + struct ovsdb **dstp, bool timeout_is_error) OVS_WARN_UNUSED_RESULT; #endif /* ovsdb/file.h */ diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index 4d29043f4..f0a2372af 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -605,7 +605,7 @@ update_schema(struct ovsdb *db, new_db = ovsdb_trigger_find_and_steal_converted_db(db, txnid); if (!new_db) { /* No luck. Converting. */ - error = ovsdb_convert(db, schema, &new_db); + error = ovsdb_convert(db, schema, &new_db, false); if (error) { /* Should never happen, because conversion should have been * checked before writing the schema to the storage. */ diff --git a/ovsdb/ovsdb-tool.c b/ovsdb/ovsdb-tool.c index facd680ff..cd720889d 100644 --- a/ovsdb/ovsdb-tool.c +++ b/ovsdb/ovsdb-tool.c @@ -1038,7 +1038,8 @@ raft_record_to_standalone_log(const char *db_file_name, check_ovsdb_error(ovsdb_log_commit_block(db_log_data)); old_db = ovsdb_file_read(db_file_name, false); - check_ovsdb_error(ovsdb_convert(old_db, schema, &new_db)); + check_ovsdb_error(ovsdb_convert(old_db, schema, &new_db, + false)); ovsdb_destroy(old_db); pa->elems[1] = ovsdb_to_txn_json( diff --git a/ovsdb/raft.c b/ovsdb/raft.c index 8effd9ad1..8b46ddd92 100644 --- a/ovsdb/raft.c +++ b/ovsdb/raft.c @@ -4915,6 +4915,12 @@ raft_log_election_timer(struct raft *raft) NULL)); } +uint64_t +raft_get_election_timer(struct raft *raft) +{ + return raft ? raft->election_timer : 0; +} + static void raft_unixctl_change_election_timer(struct unixctl_conn *conn, int argc OVS_UNUSED, const char *argv[], diff --git a/ovsdb/raft.h b/ovsdb/raft.h index a5b55d9bf..cd7d2c462 100644 --- a/ovsdb/raft.h +++ b/ovsdb/raft.h @@ -117,6 +117,7 @@ const struct uuid *raft_get_sid(const struct raft *); bool raft_is_connected(const struct raft *); bool raft_is_leader(const struct raft *); void raft_get_memory_usage(const struct raft *, struct simap *usage); +uint64_t raft_get_election_timer(struct raft *); /* Parameter validation */ struct ovsdb_error *raft_validate_election_timer(const uint64_t ms); diff --git a/ovsdb/storage.c b/ovsdb/storage.c index 6c395106c..2d77de051 100644 --- a/ovsdb/storage.c +++ b/ovsdb/storage.c @@ -203,6 +203,12 @@ ovsdb_storage_get_memory_usage(const struct ovsdb_storage *storage, } } +uint64_t +ovsdb_storage_get_election_timer(const struct ovsdb_storage *storage) +{ + return storage->raft ? raft_get_election_timer(storage->raft) : 0; +} + char * ovsdb_storage_get_error(const struct ovsdb_storage *storage) { diff --git a/ovsdb/storage.h b/ovsdb/storage.h index 05f40ce93..f36887e94 100644 --- a/ovsdb/storage.h +++ b/ovsdb/storage.h @@ -42,6 +42,7 @@ const struct uuid *ovsdb_storage_get_sid(const struct ovsdb_storage *); uint64_t ovsdb_storage_get_applied_index(const struct ovsdb_storage *); void ovsdb_storage_get_memory_usage(const struct ovsdb_storage *, struct simap *usage); +uint64_t ovsdb_storage_get_election_timer(const struct ovsdb_storage *); char *ovsdb_storage_get_error(const struct ovsdb_storage *); void ovsdb_storage_run(struct ovsdb_storage *); diff --git a/ovsdb/trigger.c b/ovsdb/trigger.c index 0edcdd89c..9d518099a 100644 --- a/ovsdb/trigger.c +++ b/ovsdb/trigger.c @@ -299,7 +299,8 @@ ovsdb_trigger_try(struct ovsdb_trigger *t, long long int now) } if (!error) { ovsdb_destroy(t->converted_db); - error = ovsdb_convert(t->db, new_schema, &t->converted_db); + error = ovsdb_convert(t->db, new_schema, &t->converted_db, + true); } if (error) { ovsdb_schema_destroy(new_schema); -- 2.40.1 _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
