TaoJIn created HAWQ-1640: ---------------------------- Summary: process not exit after query finished immediately while client connection lost Key: HAWQ-1640 URL: https://issues.apache.org/jira/browse/HAWQ-1640 Project: Apache HAWQ Issue Type: Bug Components: Core Reporter: TaoJIn Assignee: Radar Lei Fix For: 2.3.0.0-incubating
When a client (such as pgbouncer, JDBC, or Zeppelin) is connected to HAWQ and executes a long query, if the client connection is interrupted before the query finishes, the server process will not exit until an hour later. This issue happened in HAWQ 2.3.0.0-incubating. Setting the parameter gp_interconnect_transmit_timeout to 600 (default 3600) reduces the time to 10 minutes. While the query was running, we could see its status in pg_stat_activity, but after it finished we could only see the process ID in pg_locks and in the OS process list. We could see some error logs as below: $ tailf hawq-2018-07-04_063514.csv|grep p294 2018-07-04 08:13:29.595365 UTC,"dev","hdb",p294,th1628359104,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58896,con19,cmd32,seg-1,,,x58896,sx1,"LOG","00000","ConnID 5. Returned resource to resource manager.",,,,,,,0,,"rmcomm_QD2RM.c",951, 2018-07-04 08:13:29.595555 UTC,"dev","hdb",p294,th1628359104,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58896,con19,cmd32,seg-1,,,x58896,sx1,"LOG","00000","ConnID 5. Unregistered from HAWQ resource manager.",,,,,,,0,,"rmcomm_QD2RM.c",661, 2018-07-04 08:15:58.706458 UTC,"dev","hdb",p294,th1628359104,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd34,seg-1,,,x58903,sx1,"LOG","00000","ConnID 6. Registered in HAWQ resource manager (By OID)",,,,,,"select * from cppayorderproduct",0,,"rmcomm_QD2RM.c",609, 2018-07-04 08:15:58.706640 UTC,"dev","hdb",p294,th1628359104,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd34,seg-1,,,x58903,sx1,"LOG","00000","ConnID 6. 
Acquired resource from resource manager, (256 MB, 0.062500 CORE) x 18.",,,,,,"select * from cppayorderproduct",0,,"rmcomm_QD2RM.c",868, 2018-07-04 09:04:56.190873 UTC,"dev","hdb",p294,th1628359104,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","08006","could not send data to client: Connection reset by peer",,,,,,"select * from cppayorderproduct",0,,"pqcomm.c",1413, 2018-07-04 09:04:56.192347 UTC,"dev","hdb",p294,th1628359104,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"FATAL","08006","connection to client lost",,,,,,"select * from cppayorderproduct",0,,"postgres.c",3606, 2018-07-04 10:04:56.306412 UTC,"dev","hdb",p294,th1627412224,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","00000","function executormgr_consume meets error, connection is bad.",,,,,,,0,,,, 2018-07-04 10:04:56.306535 UTC,"dev","hdb",p294,th1627412224,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","00000","dispmgt_thread_func_run(): fail to consume data. 
Will exit and clean up.",,,,,,,0,,,, 2018-07-04 10:04:56.309663 UTC,"dev","hdb",p294,th1627412224,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","00000","function executormgr_cancel calling executormgr_catch_error",,,,,,,0,,,, 2018-07-04 10:04:56.312741 UTC,"dev","hdb",p294,th1627412224,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","00000","function executormgr_cancel calling executormgr_catch_error",,,,,,,0,,,, 2018-07-04 10:04:56.315364 UTC,"dev","hdb",p294,th1627412224,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","00000","function executormgr_cancel calling executormgr_catch_error",,,,,,,0,,,, 2018-07-04 10:04:56.317885 UTC,"dev","hdb",p294,th1627412224,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","00000","function executormgr_cancel calling executormgr_catch_error",,,,,,,0,,,, 2018-07-04 10:04:56.320411 UTC,"dev","hdb",p294,th1627412224,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","00000","function executormgr_cancel calling executormgr_catch_error",,,,,,,0,,,, 2018-07-04 10:04:56.322998 UTC,"dev","hdb",p294,th1627412224,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","00000","function executormgr_cancel calling executormgr_catch_error",,,,,,,0,,,, 2018-07-04 10:04:56.327342 UTC,"dev","hdb",p294,th1627412224,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","00000","function executormgr_cancel calling executormgr_catch_error",,,,,,,0,,,, 2018-07-04 10:04:56.330034 UTC,"dev","hdb",p294,th1627412224,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","00000","function executormgr_cancel calling executormgr_catch_error",,,,,,,0,,,, 2018-07-04 10:04:56.332656 
UTC,"dev","hdb",p294,th1627412224,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","00000","function executormgr_cancel calling executormgr_catch_error",,,,,,,0,,,, 2018-07-04 10:04:56.335257 UTC,"dev","hdb",p294,th1627412224,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","00000","function executormgr_cancel calling executormgr_catch_error",,,,,,,0,,,, 2018-07-04 10:04:56.337972 UTC,"dev","hdb",p294,th1627412224,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","00000","function executormgr_cancel calling executormgr_catch_error",,,,,,,0,,,, 2018-07-04 10:04:56.340634 UTC,"dev","hdb",p294,th1627412224,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","00000","function executormgr_cancel calling executormgr_catch_error",,,,,,,0,,,, 2018-07-04 10:04:56.343785 UTC,"dev","hdb",p294,th1627412224,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","00000","function executormgr_cancel calling executormgr_catch_error",,,,,,,0,,,, 2018-07-04 10:04:56.346309 UTC,"dev","hdb",p294,th1627412224,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","00000","function executormgr_cancel calling executormgr_catch_error",,,,,,,0,,,, 2018-07-04 10:04:56.350458 UTC,"dev","hdb",p294,th1627412224,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","00000","function executormgr_cancel calling executormgr_catch_error",,,,,,,0,,,, 2018-07-04 10:04:56.352760 UTC,"dev","hdb",p294,th1627412224,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","00000","function executormgr_cancel calling executormgr_catch_error",,,,,,,0,,,, 2018-07-04 10:04:56.354846 UTC,"dev","hdb",p294,th1627412224,"172.17.10.148","63974",2018-07-04 06:37:28 
UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","00000","function executormgr_cancel calling executormgr_catch_error",,,,,,,0,,,, 2018-07-04 10:04:56.354996 UTC,"dev","hdb",p294,th1628359104,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","00000","dispatcher thinks centos7-datanode3.centos7_hawq_network is down.",,,,,,,0,,"rmcomm_QD2RM.c",1209, 2018-07-04 10:04:56.355020 UTC,"dev","hdb",p294,th1628359104,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","00000","dispatcher sends 1 failed host(s) to resource manager.",,,,,,,0,,"rmcomm_QD2RM.c",1213, 2018-07-04 10:04:56.355398 UTC,"dev","hdb",p294,th1628359104,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","00000","succeed in sending failed host to resource manager.",,,,,,,0,,"rmcomm_QD2RM.c",1232, 2018-07-04 10:04:56.356559 UTC,"dev","hdb",p294,th1628359104,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","00000","ConnID 6. Returned resource to resource manager.",,,,,,,0,,"rmcomm_QD2RM.c",951, 2018-07-04 10:04:56.356869 UTC,"dev","hdb",p294,th1628359104,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,58903,con19,cmd35,seg-1,,,x58903,sx1,"LOG","00000","ConnID 6. Unregistered from HAWQ resource manager.",,,,,,,0,,"rmcomm_QD2RM.c",661, 2018-07-04 10:04:56.356892 UTC,"dev","hdb",p294,th1628359104,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,0,con19,cmd35,seg-1,,,,,"LOG","00000","clean up communication to resource manager now.",,,,,,,0,,"rmcomm_QD2RM.c",460, 2018-07-04 10:04:56.379198 UTC,"dev","hdb",p294,th998237952,"172.17.10.148","63974",2018-07-04 06:37:28 UTC,0,con19,cmd35,seg-1,,,,,"LOG","00000","generateResourceRefreshHeartBeat exits.",,,,,,,0,,,, [backtrace of the server process |https://imgur.com/Qm3QnDA] -- This message was sent by Atlassian JIRA (v7.6.3#76005)