This is an automated email from the ASF dual-hosted git repository. avamingli pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/cloudberry.git
commit e5803f23ae44b8b436e3436400baf25927c87f46
Author: soumyadeep2007 <soumyadeep2...@gmail.com>
AuthorDate: Tue Nov 14 23:19:31 2023 -0800

    Retry gang creation for non-recovery failures
    
    Before this commit, any error from a segment that is unrelated to
    recovery, would result in an immediate termination with the following:
    
    ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
            errmsg("failed to acquire resources on one or more segments"),
            errdetail("%s (%s)", PQerrorMessage(segdbDesc->conn), segdbDesc->whoami)));
    
    In other words, this termination did not respect
    gp_gang_creation_retry_count. From now on, we exhaust the retries first,
    before emitting this ERROR.
    
    Co-authored-by: Ashwin Agrawal <aash...@vmware.com>
    Reviewed-by: Xin Zhang <xinz...@users.noreply.github.com>
    Reviewed-by: Huansong Fu <fuhuans...@gmail.com>
---
 src/backend/cdb/dispatcher/cdbgang.c       | 23 +++++++++++++++++++
 src/backend/cdb/dispatcher/cdbgang_async.c | 37 ++++++++++++++++++++++++++----
 src/include/cdb/cdbgang.h                  |  3 +++
 3 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/src/backend/cdb/dispatcher/cdbgang.c b/src/backend/cdb/dispatcher/cdbgang.c
index 2d2852dd2a..c73192f0ad 100644
--- a/src/backend/cdb/dispatcher/cdbgang.c
+++ b/src/backend/cdb/dispatcher/cdbgang.c
@@ -228,6 +228,29 @@ segment_failure_due_to_missing_writer(const char *error_message)
 	return false;
 }
 
+#ifdef FAULT_INJECTOR
+bool
+segment_failure_due_to_fault_injector(const char *error_message)
+{
+	char	   *fatal = NULL,
+			   *ptr = NULL;
+	int			fatal_len = 0;
+
+	if (error_message == NULL)
+		return false;
+
+	fatal = _("FATAL");
+	fatal_len = strlen(fatal);
+
+	ptr = strstr(error_message, fatal);
+	if ((ptr != NULL) && ptr[fatal_len] == ':' &&
+		strstr(error_message, "fault triggered"))
+		return true;
+
+	return false;
+}
+#endif
+
 /*
  * Reads the GP catalog tables and build a CdbComponentDatabases structure.
diff --git a/src/backend/cdb/dispatcher/cdbgang_async.c b/src/backend/cdb/dispatcher/cdbgang_async.c
index fea19bff0e..0b26f67b73 100644
--- a/src/backend/cdb/dispatcher/cdbgang_async.c
+++ b/src/backend/cdb/dispatcher/cdbgang_async.c
@@ -54,6 +54,7 @@ cdbgang_createGang_async(List *segments, SegmentType segmentType)
 	Gang	*newGangDefinition;
 	int		create_gang_retry_counter = 0;
 	int		in_recovery_mode_count = 0;
+	int		other_failures = 0;
 	int		successful_connections = 0;
 	int		poll_timeout = 0;
 	int		i = 0;
@@ -116,6 +117,7 @@ create_gang_retry:
 	Assert(newGangDefinition->size == size);
 	successful_connections = 0;
 	in_recovery_mode_count = 0;
+	other_failures = 0;
 	retry = false;
 
 	/*
@@ -249,17 +251,44 @@ create_gang_retry:
 						if (segment_failure_due_to_recovery(PQerrorMessage(segdbDesc->conn)))
 						{
 							in_recovery_mode_count++;
+							/* Mark it as done, so we can consider retrying */
 							connStatusDone[i] = true;
 							elog(LOG, "segment is in reset/recovery mode (%s)", segdbDesc->whoami);
 						}
-						else
+						else if (segment_failure_due_to_missing_writer(PQerrorMessage(segdbDesc->conn)))
 						{
-							if (segment_failure_due_to_missing_writer(PQerrorMessage(segdbDesc->conn)))
-								markCurrentGxactWriterGangLost();
+							markCurrentGxactWriterGangLost();
 							ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
 											errmsg("failed to acquire resources on one or more segments"),
 											errdetail("%s (%s)", PQerrorMessage(segdbDesc->conn), segdbDesc->whoami)));
 						}
+#ifdef FAULT_INJECTOR
+						else if (segment_failure_due_to_fault_injector(PQerrorMessage(segdbDesc->conn)))
+						{
+							ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
+											errmsg("failed to acquire resources on one or more segments: fault injector"),
+											errdetail("%s (%s)", PQerrorMessage(segdbDesc->conn), segdbDesc->whoami)));
+						}
+#endif
+						else
+						{
+							/* Failed for some other reason */
+							if (gp_gang_creation_retry_count <= 0 ||
+								create_gang_retry_counter >= gp_gang_creation_retry_count)
+							{
+								/*
+								 * If we exhausted all of our retries, ERROR out
+								 * with the appropriate message.
+								 */
+								ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
+												errmsg("failed to acquire resources on one or more segments"),
+												errdetail("%s (%s)", PQerrorMessage(segdbDesc->conn), segdbDesc->whoami)));
+							}
+
+							/* Mark it as done, so we can consider retrying below */
+							connStatusDone[i] = true;
+							other_failures++;
+						}
 						break;
 
 					default:
@@ -325,7 +354,7 @@ create_gang_retry:
 		/* some segments are in reset/recovery mode */
 		if (successful_connections != size)
 		{
-			Assert(successful_connections + in_recovery_mode_count == size);
+			Assert(successful_connections + in_recovery_mode_count + other_failures == size);
 
 			if (gp_gang_creation_retry_count <= 0 ||
 				create_gang_retry_counter++ >= gp_gang_creation_retry_count)
diff --git a/src/include/cdb/cdbgang.h b/src/include/cdb/cdbgang.h
index 56ed9b85df..ecf72793cc 100644
--- a/src/include/cdb/cdbgang.h
+++ b/src/include/cdb/cdbgang.h
@@ -87,6 +87,9 @@ bool build_gpqeid_param(char *buf, int bufsz, bool is_writer, int identifier, in
 extern void makeOptions(char **options, char **diff_options);
 extern bool segment_failure_due_to_recovery(const char *error_message);
 extern bool segment_failure_due_to_missing_writer(const char *error_message);
+#ifdef FAULT_INJECTOR
+extern bool segment_failure_due_to_fault_injector(const char *error_message);
+#endif
 
 /*
  * cdbgang_parse_gpqeid_params

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@cloudberry.apache.org
For additional commands, e-mail: commits-h...@cloudberry.apache.org