[ https://issues.apache.org/jira/browse/HAWQ-1448?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15992314#comment-15992314 ]
Ming LI edited comment on HAWQ-1448 at 5/2/17 7:16 AM: ------------------------------------------------------- Below is the related log segmentdd/pg_log/hawq-2017-04-13_071837.csv on seg3 {code} 2017-04-13 08:08:15.998769 PDT,,,p23303,th2028661024,,,,0,,,seg-10000,,,,,"LOG","00000","received smart shutdown request",,,,,,,0,,"postmaster.c",3447, ... 2017-04-13 08:08:43.228325 PDT,,,p23310,th2028661024,,,,0,,,seg-10000,,,,,"WARNING","01000","FD 4 having errors raised. errno 111",,,,,,,0,,"rmcomm_AsyncComm.c",188, 2017-04-13 08:08:43.228347 PDT,,,p23310,th2028661024,,,,0,,,seg-10000,,,,,"WARNING","01000","Resource manager socket connect has error raised.",,,,,,,0,,"rmcomm_Connect.c",100, 2017-04-13 08:08:43.228364 PDT,,,p23310,th2028661024,,,,0,,,seg-10000,,,,,"WARNING","01000","Segment's resource manager sending IMAlive message switches from master to standby",,,,,,,0,,"rmcomm_RMSEG2RM.c",168, 2017-04-13 08:08:43.228383 PDT,,,p23310,th2028661024,,,,0,,,seg-10000,,,,,"LOG","00000","segment will send heart-beat to standby from now on",,,,,,,0,,"resourcemanager_RMSEG.c",285, 2017-04-13 08:09:13.280237 PDT,,,p23310,th2028661024,,,,0,,,seg-10000,,,,,"LOG","00000","Resource manager discovered local host IPv4 address 127.0.0.1",,,,,,,0,,"network_utils.c",210, 2017-04-13 08:09:13.280294 PDT,,,p23310,th2028661024,,,,0,,,seg-10000,,,,,"LOG","00000","Resource manager discovered local host IPv4 address 10.32.34.6",,,,,,,0,,"network_utils.c",210, ........................................... LOOP THESE 6 LINES ........................................... 2017-04-13 10:03:55.869252 PDT,,,p23310,th2028661024,,,,0,,,seg-10000,,,,,"WARNING","01000","FD 4 having errors raised. 
errno 111",,,,,,,0,,"rmcomm_AsyncComm.c",188, 2017-04-13 10:03:55.869277 PDT,,,p23310,th2028661024,,,,0,,,seg-10000,,,,,"WARNING","01000","Resource manager socket connect has error raised.",,,,,,,0,,"rmcomm_Connect.c",100, 2017-04-13 10:03:55.869293 PDT,,,p23310,th2028661024,,,,0,,,seg-10000,,,,,"WARNING","01000","Segment's resource manager sending IMAlive message switches from master to standby",,,,,,,0,,"rmcomm_RMSEG2RM.c",168, 2017-04-13 10:03:55.869323 PDT,,,p23310,th2028661024,,,,0,,,seg-10000,,,,,"LOG","00000","segment will send heart-beat to standby from now on",,,,,,,0,,"resourcemanager_RMSEG.c",285, 2017-04-13 10:04:01.249461 PDT,"hawqsuperuser","olap_winowerr",p177517,th2028661024,"10.32.35.251","45247",2017-04-13 08:04:00 PDT,0,con4354,,seg6,,,,,"LOG","08006","could not receive data from client: Connection reset by peer",,,,,,,0,,"pqcomm.c",842, 2017-04-13 10:04:01.249522 PDT,"hawqsuperuser","olap_winowerr",p177517,th2028661024,"10.32.35.251","45247",2017-04-13 08:04:00 PDT,0,con4354,,seg6,,,,,"LOG","08P01","unexpected EOF on client connection",,,,,,,0,,"postgres.c",443, 2017-04-13 10:04:01.252964 PDT,,,p23310,th2028661024,,,,0,,,seg-10000,,,,,"LOG","00000","Segment RM exits.",,,,,,,0,,"resourcemanager.c",347, 2017-04-13 10:04:01.253027 PDT,,,p23310,th2028661024,,,,0,,,seg-10000,,,,,"LOG","00000","Clean up handler in message server is called.",,,,,,,0,,"rmcomm_MessageServer.c",105, 2017-04-13 10:04:01.255779 PDT,,,p23308,th2028661024,,,,0,,,seg-10000,,,,,"LOG","00000","shutting down",,,,,,,0,,"xlog.c",7861, 2017-04-13 10:04:01.257902 PDT,,,p23308,th2028661024,,,,0,,,seg-10000,,,,,"LOG","00000","database system is shut down",,,,,,,0,,"xlog.c",7882, {code} was (Author: mli): Below is the related log {code} 2017-04-13 08:08:15.998769 PDT,,,p23303,th2028661024,,,,0,,,seg-10000,,,,,"LOG","00000","received smart shutdown request",,,,,,,0,,"postmaster.c",3447, ... 
2017-04-13 08:08:43.228325 PDT,,,p23310,th2028661024,,,,0,,,seg-10000,,,,,"WARNING","01000","FD 4 having errors raised. errno 111",,,,,,,0,,"rmcomm_AsyncComm.c",188, 2017-04-13 08:08:43.228347 PDT,,,p23310,th2028661024,,,,0,,,seg-10000,,,,,"WARNING","01000","Resource manager socket connect has error raised.",,,,,,,0,,"rmcomm_Connect.c",100, 2017-04-13 08:08:43.228364 PDT,,,p23310,th2028661024,,,,0,,,seg-10000,,,,,"WARNING","01000","Segment's resource manager sending IMAlive message switches from master to standby",,,,,,,0,,"rmcomm_RMSEG2RM.c",168, 2017-04-13 08:08:43.228383 PDT,,,p23310,th2028661024,,,,0,,,seg-10000,,,,,"LOG","00000","segment will send heart-beat to standby from now on",,,,,,,0,,"resourcemanager_RMSEG.c",285, 2017-04-13 08:09:13.280237 PDT,,,p23310,th2028661024,,,,0,,,seg-10000,,,,,"LOG","00000","Resource manager discovered local host IPv4 address 127.0.0.1",,,,,,,0,,"network_utils.c",210, 2017-04-13 08:09:13.280294 PDT,,,p23310,th2028661024,,,,0,,,seg-10000,,,,,"LOG","00000","Resource manager discovered local host IPv4 address 10.32.34.6",,,,,,,0,,"network_utils.c",210, ........................................... LOOP THESE 6 LINES ........................................... 2017-04-13 10:03:55.869252 PDT,,,p23310,th2028661024,,,,0,,,seg-10000,,,,,"WARNING","01000","FD 4 having errors raised. 
errno 111",,,,,,,0,,"rmcomm_AsyncComm.c",188, 2017-04-13 10:03:55.869277 PDT,,,p23310,th2028661024,,,,0,,,seg-10000,,,,,"WARNING","01000","Resource manager socket connect has error raised.",,,,,,,0,,"rmcomm_Connect.c",100, 2017-04-13 10:03:55.869293 PDT,,,p23310,th2028661024,,,,0,,,seg-10000,,,,,"WARNING","01000","Segment's resource manager sending IMAlive message switches from master to standby",,,,,,,0,,"rmcomm_RMSEG2RM.c",168, 2017-04-13 10:03:55.869323 PDT,,,p23310,th2028661024,,,,0,,,seg-10000,,,,,"LOG","00000","segment will send heart-beat to standby from now on",,,,,,,0,,"resourcemanager_RMSEG.c",285, 2017-04-13 10:04:01.249461 PDT,"hawqsuperuser","olap_winowerr",p177517,th2028661024,"10.32.35.251","45247",2017-04-13 08:04:00 PDT,0,con4354,,seg6,,,,,"LOG","08006","could not receive data from client: Connection reset by peer",,,,,,,0,,"pqcomm.c",842, 2017-04-13 10:04:01.249522 PDT,"hawqsuperuser","olap_winowerr",p177517,th2028661024,"10.32.35.251","45247",2017-04-13 08:04:00 PDT,0,con4354,,seg6,,,,,"LOG","08P01","unexpected EOF on client connection",,,,,,,0,,"postgres.c",443, 2017-04-13 10:04:01.252964 PDT,,,p23310,th2028661024,,,,0,,,seg-10000,,,,,"LOG","00000","Segment RM exits.",,,,,,,0,,"resourcemanager.c",347, 2017-04-13 10:04:01.253027 PDT,,,p23310,th2028661024,,,,0,,,seg-10000,,,,,"LOG","00000","Clean up handler in message server is called.",,,,,,,0,,"rmcomm_MessageServer.c",105, 2017-04-13 10:04:01.255779 PDT,,,p23308,th2028661024,,,,0,,,seg-10000,,,,,"LOG","00000","shutting down",,,,,,,0,,"xlog.c",7861, 2017-04-13 10:04:01.257902 PDT,,,p23308,th2028661024,,,,0,,,seg-10000,,,,,"LOG","00000","database system is shut down",,,,,,,0,,"xlog.c",7882, {code} > Postmaster process hung at recv () on segment > --------------------------------------------- > > Key: HAWQ-1448 > URL: https://issues.apache.org/jira/browse/HAWQ-1448 > Project: Apache HAWQ > Issue Type: Bug > Components: Dispatcher > Reporter: Ming LI > Assignee: Ming LI > Fix For: backlog > > > Some 
process hung for almost 2 hours before quitting. > 4/13/17 8:13:36 AM PDT: Thread 1 (Thread 0x7f9c78eae920 (LWP 177517)): > 4/13/17 8:13:36 AM PDT: #0 0x000000322180ec2c in recv () from > /lib64/libpthread.so.0 > 4/13/17 8:13:36 AM PDT: #1 0x00000000007847e8 in secure_read () > 4/13/17 8:13:36 AM PDT: #2 0x0000000000793735 in pq_recvbuf () > 4/13/17 8:13:36 AM PDT: #3 0x00000000007939b9 in pq_getbyte () > 4/13/17 8:13:36 AM PDT: #4 0x00000000008e39a4 in SocketBackend () > 4/13/17 8:13:36 AM PDT: #5 0x00000000008e3ddc in ReadCommand () > 4/13/17 8:13:36 AM PDT: #6 0x00000000008ea8c3 in PostgresMain () > 4/13/17 8:13:36 AM PDT: #7 0x00000000008944ff in BackendRun () > 4/13/17 8:13:36 AM PDT: #8 0x000000000089391e in BackendStartup () > 4/13/17 8:13:36 AM PDT: #9 0x000000000088d99a in ServerLoop () > 4/13/17 8:13:36 AM PDT: #10 0x000000000088c9a7 in PostmasterMain () > 4/13/17 8:13:36 AM PDT: #11 0x00000000007a9d63 in main () > 4/13/17 8:13:36 AM PDT: ------------------------------------- > All postgres processes on all hosts have quit; only the postmaster on seg3 hung. -- This message was sent by Atlassian JIRA (v6.3.15#6346)