Repository: incubator-hawq Updated Branches: refs/heads/master b65d7561f -> 21d78d37a
HAWQ-1342. Fixed QE process hang in shared input scan on segment node. The basic idea for fixing this kind of hang is: (1) The segment that throws the error rolls back the whole transaction, and all related fds are closed at transaction end. (2) The other segments behave as before: while waiting in select(), each one loops until the specific fd is closed, after which the code keeps running until the interrupt is processed elsewhere later on (the transaction rollback sends a cancel signal). Some previous fixes (HAWQ-166, HAWQ-1282) are therefore changed accordingly. Signed-off-by: Amy Bai <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/21d78d37 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/21d78d37 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/21d78d37 Branch: refs/heads/master Commit: 21d78d37acbf009c138205c5aa1e94dd52ac4e8f Parents: b65d756 Author: Ming Li <[email protected]> Authored: Thu Feb 23 13:22:56 2017 +0800 Committer: Ming LI <[email protected]> Committed: Fri Feb 24 10:08:22 2017 +0800 ---------------------------------------------------------------------- src/backend/executor/nodeShareInputScan.c | 49 +++++++++++--------------- 1 file changed, 21 insertions(+), 28 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/21d78d37/src/backend/executor/nodeShareInputScan.c ---------------------------------------------------------------------- diff --git a/src/backend/executor/nodeShareInputScan.c b/src/backend/executor/nodeShareInputScan.c index 0f08848..74dbcb5 100644 --- a/src/backend/executor/nodeShareInputScan.c +++ b/src/backend/executor/nodeShareInputScan.c @@ -40,7 +40,6 @@ #include "postgres.h" -#include "access/xact.h" #include "cdb/cdbvars.h" #include "executor/executor.h" #include "executor/nodeShareInputScan.h" @@ 
-641,10 +640,6 @@ read_retry: goto read_retry; else { - if(fd >= 0) - { - gp_retry_close(fd); - } elog(ERROR, "could not read from fifo: %m"); } Assert(!"Never be here"); @@ -664,10 +659,6 @@ write_retry: goto write_retry; else { - if(fd >= 0) - { - gp_retry_close(fd); - } elog(ERROR, "could not write to fifo: %m"); } @@ -794,7 +785,14 @@ shareinput_reader_waitready(int share_id, PlanGenerator planGen) { int save_errno = errno; elog(LOG, "SISC READER (shareid=%d, slice=%d): Wait ready try again, errno %d ... ", - share_id, currentSliceId, save_errno); + share_id, currentSliceId, save_errno); + if(save_errno == EBADF) + { + /* The file description is invalid, maybe this FD has been already closed by writer in some cases + * we need to break here to avoid endless loop and continue to run CHECK_FOR_INTERRUPTS. + */ + break; + } } } return (void *) pctxt; @@ -925,9 +923,12 @@ writer_wait_for_acks(ShareInput_Lk_Context *pctxt, int share_id, int xslice) int save_errno = errno; elog(LOG, "SISC WRITER (shareid=%d, slice=%d): notify still wait for an answer, errno %d", share_id, currentSliceId, save_errno); - /*if error(except EINTR) happens in select, we just return to avoid endless loop*/ - if(errno != EINTR){ - return; + if(save_errno == EBADF) + { + /* The file description is invalid, maybe this FD has been already closed by writer in some cases + * we need to break here to avoid endless loop and continue to run CHECK_FOR_INTERRUPTS. + */ + break; } } } @@ -979,21 +980,6 @@ shareinput_writer_waitdone(void *ctxt, int share_id, int nsharer_xslice) while(ack_needed > 0) { CHECK_FOR_INTERRUPTS(); - - /* - * Writer won't wait for data reading done notification from readers if transaction is - * aborting. Readers may fail to send data reading done notification to writer in two - * cases: - * - * 1. The transaction is aborted due to interrupts or exceptions, i.e., user cancels - * query, division by zero on some segment - * - * 2. 
Logic errors in reader which incur its unexpected exit, i.e., segmentation fault - */ - if (IsAbortInProgress()) - { - break; - } MPP_FD_ZERO(&rset); MPP_FD_SET(pctxt->donefd, &rset); @@ -1024,6 +1010,13 @@ shareinput_writer_waitdone(void *ctxt, int share_id, int nsharer_xslice) int save_errno = errno; elog(LOG, "SISC WRITER (shareid=%d, slice=%d): wait done time out once, errno %d", share_id, currentSliceId, save_errno); + if(save_errno == EBADF) + { + /* The file description is invalid, maybe this FD has been already closed by writer in some cases + * we need to break here to avoid endless loop and continue to run CHECK_FOR_INTERRUPTS. + */ + break; + } } }
