Repository: incubator-hawq
Updated Branches:
  refs/heads/master b65d7561f -> 21d78d37a


HAWQ-1342. Fixed QE process hang in shared input scan on segment node

The basic idea for fixing this kind of hang is:
(1) The segment that throws the error rolls back the whole transaction, and 
all related fds are closed when the transaction ends.
(2) The other segments act as before: while waiting in select(), they loop 
until the specific fd is closed; the code then keeps running until the process 
is interrupted elsewhere afterward (the transaction rollback sends a cancel 
signal).

So some previous fixes (HAWQ-166, HAWQ-1282) are changed accordingly.

Signed-off-by: Amy Bai <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/21d78d37
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/21d78d37
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/21d78d37

Branch: refs/heads/master
Commit: 21d78d37acbf009c138205c5aa1e94dd52ac4e8f
Parents: b65d756
Author: Ming Li <[email protected]>
Authored: Thu Feb 23 13:22:56 2017 +0800
Committer: Ming LI <[email protected]>
Committed: Fri Feb 24 10:08:22 2017 +0800

----------------------------------------------------------------------
 src/backend/executor/nodeShareInputScan.c | 49 +++++++++++---------------
 1 file changed, 21 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/21d78d37/src/backend/executor/nodeShareInputScan.c
----------------------------------------------------------------------
diff --git a/src/backend/executor/nodeShareInputScan.c 
b/src/backend/executor/nodeShareInputScan.c
index 0f08848..74dbcb5 100644
--- a/src/backend/executor/nodeShareInputScan.c
+++ b/src/backend/executor/nodeShareInputScan.c
@@ -40,7 +40,6 @@
 
 #include "postgres.h"
 
-#include "access/xact.h"
 #include "cdb/cdbvars.h"
 #include "executor/executor.h"
 #include "executor/nodeShareInputScan.h"
@@ -641,10 +640,6 @@ read_retry:
                goto read_retry;
        else
        {
-               if(fd >= 0)
-               {
-                       gp_retry_close(fd);
-               }
                elog(ERROR, "could not read from fifo: %m");
        }
        Assert(!"Never be here");
@@ -664,10 +659,6 @@ write_retry:
                goto write_retry;
        else
        {
-               if(fd >= 0)
-               {
-                       gp_retry_close(fd);
-               }
                elog(ERROR, "could not write to fifo: %m");
        }
 
@@ -794,7 +785,14 @@ shareinput_reader_waitready(int share_id, PlanGenerator 
planGen)
                {
                        int save_errno = errno;
                        elog(LOG, "SISC READER (shareid=%d, slice=%d): Wait 
ready try again, errno %d ... ",
-                                       share_id, currentSliceId, save_errno);
+                                                               share_id, 
currentSliceId, save_errno);
+                       if(save_errno == EBADF)
+                       {
+                               /* The file description is invalid, maybe this 
FD has been already closed by writer in some cases
+                                * we need to break here to avoid endless loop 
and continue to run CHECK_FOR_INTERRUPTS.
+                                */
+                               break;
+                       }
                }
        }
        return (void *) pctxt;
@@ -925,9 +923,12 @@ writer_wait_for_acks(ShareInput_Lk_Context *pctxt, int 
share_id, int xslice)
                        int save_errno = errno;
                        elog(LOG, "SISC WRITER (shareid=%d, slice=%d): notify 
still wait for an answer, errno %d",
                                        share_id, currentSliceId, save_errno);
-                       /*if error(except EINTR) happens in select, we just 
return to avoid endless loop*/
-                       if(errno != EINTR){
-                               return;
+                       if(save_errno == EBADF)
+                       {
+                               /* The file description is invalid, maybe this 
FD has been already closed by writer in some cases
+                                * we need to break here to avoid endless loop 
and continue to run CHECK_FOR_INTERRUPTS.
+                                */
+                               break;
                        }
                }
        }
@@ -979,21 +980,6 @@ shareinput_writer_waitdone(void *ctxt, int share_id, int 
nsharer_xslice)
        while(ack_needed > 0)
        {
                CHECK_FOR_INTERRUPTS();
-
-               /*
-                * Writer won't wait for data reading done notification from 
readers if transaction is
-                * aborting. Readers may fail to send data reading done 
notification to writer in two
-                * cases:
-                *
-                *    1. The transaction is aborted due to interrupts or 
exceptions, i.e., user cancels
-                *       query, division by zero on some segment
-                *
-                *    2. Logic errors in reader which incur its unexpected 
exit, i.e., segmentation fault
-                */
-               if (IsAbortInProgress())
-               {
-                       break;
-               }
        
                MPP_FD_ZERO(&rset);
                MPP_FD_SET(pctxt->donefd, &rset);
@@ -1024,6 +1010,13 @@ shareinput_writer_waitdone(void *ctxt, int share_id, int 
nsharer_xslice)
                        int save_errno = errno;
                        elog(LOG, "SISC WRITER (shareid=%d, slice=%d): wait 
done time out once, errno %d",
                                        share_id, currentSliceId, save_errno);
+                       if(save_errno == EBADF)
+                       {
+                               /* The file description is invalid, maybe this 
FD has been already closed by writer in some cases
+                                * we need to break here to avoid endless loop 
and continue to run CHECK_FOR_INTERRUPTS.
+                                */
+                               break;
+                       }
                }
        }
 

Reply via email to