This is an automated email from the ASF dual-hosted git repository.

avamingli pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git

commit e5803f23ae44b8b436e3436400baf25927c87f46
Author: soumyadeep2007 <soumyadeep2...@gmail.com>
AuthorDate: Tue Nov 14 23:19:31 2023 -0800

    Retry gang creation for non-recovery failures
    
    Before this commit, any error from a segment that is unrelated to
    recovery would result in an immediate termination with the following:
    
    ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
            errmsg("failed to acquire resources on one or more segments"),
            errdetail("%s (%s)", PQerrorMessage(segdbDesc->conn),
                      segdbDesc->whoami)));
    
    In other words, this termination did not respect
    gp_gang_creation_retry_count. From now on, we exhaust the retries first,
    before emitting this ERROR.
    
    Co-authored-by: Ashwin Agrawal <aash...@vmware.com>
    Reviewed-by: Xin Zhang <xinz...@users.noreply.github.com>
    Reviewed-by: Huansong Fu <fuhuans...@gmail.com>
---
 src/backend/cdb/dispatcher/cdbgang.c       | 23 +++++++++++++++++++
 src/backend/cdb/dispatcher/cdbgang_async.c | 37 ++++++++++++++++++++++++++----
 src/include/cdb/cdbgang.h                  |  3 +++
 3 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/src/backend/cdb/dispatcher/cdbgang.c b/src/backend/cdb/dispatcher/cdbgang.c
index 2d2852dd2a..c73192f0ad 100644
--- a/src/backend/cdb/dispatcher/cdbgang.c
+++ b/src/backend/cdb/dispatcher/cdbgang.c
@@ -228,6 +228,29 @@ segment_failure_due_to_missing_writer(const char *error_message)
        return false;
 }
 
+#ifdef FAULT_INJECTOR
+bool
+segment_failure_due_to_fault_injector(const char *error_message)
+{
+       char       *fatal = NULL,
+                          *ptr = NULL;
+       int                     fatal_len = 0;
+
+       if (error_message == NULL)
+               return false;
+
+       fatal = _("FATAL");
+       fatal_len = strlen(fatal);
+
+       ptr = strstr(error_message, fatal);
+       if ((ptr != NULL) && ptr[fatal_len] == ':' &&
+               strstr(error_message, "fault triggered"))
+               return true;
+
+       return false;
+}
+#endif
+
 
 /*
  * Reads the GP catalog tables and build a CdbComponentDatabases structure.
diff --git a/src/backend/cdb/dispatcher/cdbgang_async.c b/src/backend/cdb/dispatcher/cdbgang_async.c
index fea19bff0e..0b26f67b73 100644
--- a/src/backend/cdb/dispatcher/cdbgang_async.c
+++ b/src/backend/cdb/dispatcher/cdbgang_async.c
@@ -54,6 +54,7 @@ cdbgang_createGang_async(List *segments, SegmentType segmentType)
        Gang    *newGangDefinition;
        int             create_gang_retry_counter = 0;
        int             in_recovery_mode_count = 0;
+       int             other_failures = 0;
        int             successful_connections = 0;
        int             poll_timeout = 0;
        int             i = 0;
@@ -116,6 +117,7 @@ create_gang_retry:
        Assert(newGangDefinition->size == size);
        successful_connections = 0;
        in_recovery_mode_count = 0;
+       other_failures = 0;
        retry = false;
 
        /*
@@ -249,17 +251,44 @@ create_gang_retry:
                                                if 
(segment_failure_due_to_recovery(PQerrorMessage(segdbDesc->conn)))
                                                {
                                                        
in_recovery_mode_count++;
+                                                       /* Mark it as done, so we can consider retrying */
                                                        connStatusDone[i] = 
true;
                                                        elog(LOG, "segment is 
in reset/recovery mode (%s)", segdbDesc->whoami);
                                                }
-                                               else
+                                               else if 
(segment_failure_due_to_missing_writer(PQerrorMessage(segdbDesc->conn)))
                                                {
-                                                       if 
(segment_failure_due_to_missing_writer(PQerrorMessage(segdbDesc->conn)))
-                                                               
markCurrentGxactWriterGangLost();
+                                                       
markCurrentGxactWriterGangLost();
                                                        ereport(ERROR, 
(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
                                                                                
        errmsg("failed to acquire resources on one or more segments"),
                                                                                
        errdetail("%s (%s)", PQerrorMessage(segdbDesc->conn), 
segdbDesc->whoami)));
                                                }
+#ifdef FAULT_INJECTOR
+                                               else if 
(segment_failure_due_to_fault_injector(PQerrorMessage(segdbDesc->conn)))
+                                               {
+                                                       ereport(ERROR, 
(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
+                                                               errmsg("failed 
to acquire resources on one or more segments: fault injector"),
+                                                               errdetail("%s 
(%s)", PQerrorMessage(segdbDesc->conn), segdbDesc->whoami)));
+                                               }
+#endif
+                                               else
+                                               {
+                                                       /* Failed for some other reason */
+                                                       if 
(gp_gang_creation_retry_count <= 0 ||
+                                                               
create_gang_retry_counter >= gp_gang_creation_retry_count)
+                                                       {
+                                                               /*
+                                                                * If we exhausted all of our retries, ERROR out
+                                                                * with the appropriate message.
+                                                                */
+                                                               ereport(ERROR, 
(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
+                                                                       
errmsg("failed to acquire resources on one or more segments"),
+                                                                       
errdetail("%s (%s)", PQerrorMessage(segdbDesc->conn), segdbDesc->whoami)));
+                                                       }
+
+                                                       /* Mark it as done, so we can consider retrying below */
+                                                       connStatusDone[i] = 
true;
+                                                       other_failures++;
+                                               }
                                                break;
 
                                        default:
@@ -325,7 +354,7 @@ create_gang_retry:
                /* some segments are in reset/recovery mode */
                if (successful_connections != size)
                {
-                       Assert(successful_connections + in_recovery_mode_count 
== size);
+                       Assert(successful_connections + in_recovery_mode_count 
+ other_failures == size);
 
                        if (gp_gang_creation_retry_count <= 0 ||
                                create_gang_retry_counter++ >= 
gp_gang_creation_retry_count)
diff --git a/src/include/cdb/cdbgang.h b/src/include/cdb/cdbgang.h
index 56ed9b85df..ecf72793cc 100644
--- a/src/include/cdb/cdbgang.h
+++ b/src/include/cdb/cdbgang.h
@@ -87,6 +87,9 @@ bool build_gpqeid_param(char *buf, int bufsz, bool is_writer, int identifier, in
 extern void makeOptions(char **options, char **diff_options);
 extern bool segment_failure_due_to_recovery(const char *error_message);
 extern bool segment_failure_due_to_missing_writer(const char *error_message);
+#ifdef FAULT_INJECTOR
+extern bool segment_failure_due_to_fault_injector(const char *error_message);
+#endif
 
 /*
  * cdbgang_parse_gpqeid_params


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@cloudberry.apache.org
For additional commands, e-mail: commits-h...@cloudberry.apache.org

Reply via email to