[ https://issues.apache.org/jira/browse/HAWQ-568?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Chunling Wang updated HAWQ-568: ------------------------------- Affects Version/s: 2.0.0 > After query finished, kill a QE but can still recv() from this QE socket > ------------------------------------------------------------------------ > > Key: HAWQ-568 > URL: https://issues.apache.org/jira/browse/HAWQ-568 > Project: Apache HAWQ > Issue Type: Bug > Components: Dispatcher > Affects Versions: 2.0.0 > Reporter: Chunling Wang > Assignee: Lei Chang > > After a query finishes, we kill one QE while the other QEs remain in the QE pool. When > checking whether the connection to this QE is still alive, we call recv() on the QE's > socket, but it can still receive data. > 1. Run a query so that some QEs remain in the pool. > {code} > dispatch=# select count(*) from test_dispatch as t1, test_dispatch as t2, > test_dispatch as t3 where t1.id *2 = t2.id and t1.id < t3.id; > count > ------- > 3725 > (1 row) > {code} > {code} > $ ps -ef|grep postgres > 501 55701 1 0 5:38下午 ?? 0:00.38 /usr/local/hawq/bin/postgres > -D /Users/wangchunling/hawq-data-directory/masterdd -i -M master -p 5432 > --silent-mode=true > 501 55702 55701 0 5:38下午 ?? 0:00.01 postgres: port 5432, master > logger process > 501 55705 55701 0 5:38下午 ?? 0:00.00 postgres: port 5432, stats > collector process > 501 55706 55701 0 5:38下午 ?? 0:00.04 postgres: port 5432, writer > process > 501 55707 55701 0 5:38下午 ?? 0:00.01 postgres: port 5432, > checkpoint process > 501 55708 55701 0 5:38下午 ?? 0:00.00 postgres: port 5432, > seqserver process > 501 55709 55701 0 5:38下午 ?? 0:00.01 postgres: port 5432, WAL > Send Server process > 501 55710 55701 0 5:38下午 ?? 0:00.00 postgres: port 5432, DFS > Metadata Cache process > 501 55711 55701 0 5:38下午 ?? 0:00.26 postgres: port 5432, master > resource manager > 501 55727 1 0 5:38下午 ?? 0:00.52 /usr/local/hawq/bin/postgres > -D /Users/wangchunling/hawq-data-directory/segmentdd -i -M segment -p 40000 > --silent-mode=true > 501 55728 55727 0 5:38下午 ?? 0:00.06 postgres: port 40000, logger > process > 501 55731 55727 0 5:38下午 ?? 
0:00.00 postgres: port 40000, stats > collector process > 501 55732 55727 0 5:38下午 ?? 0:00.04 postgres: port 40000, writer > process > 501 55733 55727 0 5:38下午 ?? 0:00.01 postgres: port 40000, > checkpoint process > 501 55734 55727 0 5:38下午 ?? 0:00.09 postgres: port 40000, > segment resource manager > 501 55741 55748 0 5:38下午 ?? 0:00.05 postgres: port 5432, > wangchunling dispatch [local] con12 cmd6 idle [local] > 501 55743 55727 0 5:38下午 ?? 0:00.36 postgres: port 40000, > wangchunling dispatch 127.0.0.1(50800) con12 seg0 idle > 501 55770 55727 0 5:43下午 ?? 0:00.12 postgres: port 40000, > wangchunling dispatch 127.0.0.1(50853) con12 seg0 idle > 501 55771 55727 0 5:44下午 ?? 0:00.11 postgres: port 40000, > wangchunling dispatch 127.0.0.1(50855) con12 seg0 idle > 501 55774 26980 0 5:44下午 ttys008 0:00.00 grep postgres > {code} > 2. Kill one QE. > {code} > $ kill 55771 > $ ps -ef|grep postgres > 501 55701 1 0 5:38下午 ?? 0:00.38 /usr/local/hawq/bin/postgres > -D /Users/wangchunling/hawq-data-directory/masterdd -i -M master -p 5432 > --silent-mode=true > 501 55702 55701 0 5:38下午 ?? 0:00.01 postgres: port 5432, master > logger process > 501 55705 55701 0 5:38下午 ?? 0:00.00 postgres: port 5432, stats > collector process > 501 55706 55701 0 5:38下午 ?? 0:00.04 postgres: port 5432, writer > process > 501 55707 55701 0 5:38下午 ?? 0:00.01 postgres: port 5432, > checkpoint process > 501 55708 55701 0 5:38下午 ?? 0:00.00 postgres: port 5432, > seqserver process > 501 55709 55701 0 5:38下午 ?? 0:00.01 postgres: port 5432, WAL > Send Server process > 501 55710 55701 0 5:38下午 ?? 0:00.00 postgres: port 5432, DFS > Metadata Cache process > 501 55711 55701 0 5:38下午 ?? 0:00.27 postgres: port 5432, master > resource manager > 501 55727 1 0 5:38下午 ?? 0:00.52 /usr/local/hawq/bin/postgres > -D /Users/wangchunling/hawq-data-directory/segmentdd -i -M segment -p 40000 > --silent-mode=true > 501 55728 55727 0 5:38下午 ?? 0:00.06 postgres: port 40000, logger > process > 501 55731 55727 0 5:38下午 ?? 
0:00.00 postgres: port 40000, stats > collector process > 501 55732 55727 0 5:38下午 ?? 0:00.04 postgres: port 40000, writer > process > 501 55733 55727 0 5:38下午 ?? 0:00.01 postgres: port 40000, > checkpoint process > 501 55734 55727 0 5:38下午 ?? 0:00.09 postgres: port 40000, > segment resource manager > 501 55741 55748 0 5:38下午 ?? 0:00.05 postgres: port 5432, > wangchunling dispatch [local] con12 cmd6 idle [local] > 501 55743 55727 0 5:38下午 ?? 0:00.36 postgres: port 40000, > wangchunling dispatch 127.0.0.1(50800) con12 seg0 idle > 501 55770 55727 0 5:43下午 ?? 0:00.12 postgres: port 40000, > wangchunling dispatch 127.0.0.1(50853) con12 seg0 idle > 501 55776 26980 0 5:44下午 ttys008 0:00.00 grep postgres > {code} > 3. Attach to the QD and run the query again. > {code} > dispatch=# select count(*) from test_dispatch as t1, test_dispatch as t2, > test_dispatch as t3 where t1.id *2 = t2.id and t1.id < t3.id; > {code} > 4. In executormgr_allocate_executor_by_name(), we get the QE that we > have just killed and check whether it is alive in dispatch_validate_conn() > by calling recv() on its socket. 
> {code} > * thread #1: tid = 0x242340, 0x000000010f5f130a > postgres`executormgr_allocate_executor_by_name(name=0x00007fd2ea808320, > is_writer='\0') + 42 at executormgr.c:707, queue = 'com.apple.main-thread', > stop reason = step over > frame #0: 0x000000010f5f130a > postgres`executormgr_allocate_executor_by_name(name=0x00007fd2ea808320, > is_writer='\0') + 42 at executormgr.c:707 > 704 // running until finding a valid one or the pool becomes NULL > 705 SegmentDatabaseDescriptor *desc = > 706 poolmgr_get_item_by_name(executor_cache.pool, name); > -> 707 while (desc != NULL && > !executormgr_validate_conn(desc->conn)) { > 708 desc = poolmgr_get_item_by_name(executor_cache.pool, name); > 709 } > 710 return desc; > (lldb) p *desc > (SegmentDatabaseDescriptor) $11 = { > segment = 0x00007fd2e9884e60 > conn = 0x00007fd2e9701a30 > errcode = 0 > error_message = (data = "", len = 0, maxlen = 256) > motionListener = -773536088 > backendPid = 55771 > whoami = 0x00007fd2e95083d0 "seg0 localhost:40000 pid=55771" > } > (lldb) s > Process 55741 stopped > * thread #1: tid = 0x242340, 0x000000010f5f1cec > postgres`executormgr_validate_conn(conn=0x00007fd2e9701a30) + 12 at > executormgr.c:365, queue = 'com.apple.main-thread', stop reason = step in > frame #0: 0x000000010f5f1cec > postgres`executormgr_validate_conn(conn=0x00007fd2e9701a30) + 12 at > executormgr.c:365 > 362 static bool > 363 executormgr_validate_conn(PGconn *conn) > 364 { > -> 365 if (conn == NULL) > 366 return false; > 367 if (!dispatch_validate_conn(conn->sock)) { > 368 printfPQExpBuffer(&conn->errorMessage, > (lldb) n > Process 55741 stopped > * thread #1: tid = 0x242340, 0x000000010f5f1d03 > postgres`executormgr_validate_conn(conn=0x00007fd2e9701a30) + 35 at > executormgr.c:367, queue = 'com.apple.main-thread', stop reason = step over > frame #0: 0x000000010f5f1d03 > postgres`executormgr_validate_conn(conn=0x00007fd2e9701a30) + 35 at > executormgr.c:367 > 364 { > 365 if (conn == NULL) > 366 return false; > -> 
367 if (!dispatch_validate_conn(conn->sock)) { > 368 printfPQExpBuffer(&conn->errorMessage, > 369 libpq_gettext( > 370 "server closed > the connection unexpectedly\n" > (lldb) s > Process 55741 stopped > * thread #1: tid = 0x242340, 0x000000010f5ec2cb > postgres`dispatch_validate_conn(sock=61) + 11 at dispatcher.c:1830, queue = > 'com.apple.main-thread', stop reason = step in > frame #0: 0x000000010f5ec2cb postgres`dispatch_validate_conn(sock=61) + > 11 at dispatcher.c:1830 > 1827 ssize_t ret; > 1828 char buf; > 1829 > -> 1830 if (sock < 0) > 1831 return false; > 1832 > 1833 #ifndef WIN32 > (lldb) p sock > (pgsocket) $12 = 61 > (lldb) n > Process 55741 stopped > * thread #1: tid = 0x242340, 0x000000010f5ec2f1 > postgres`dispatch_validate_conn(sock=61) + 49 at dispatcher.c:1834, queue = > 'com.apple.main-thread', stop reason = step over > frame #0: 0x000000010f5ec2f1 postgres`dispatch_validate_conn(sock=61) + > 49 at dispatcher.c:1834 > 1831 return false; > 1832 > 1833 #ifndef WIN32 > -> 1834 ret = recv(sock, &buf, 1, MSG_PEEK|MSG_DONTWAIT); > 1835 #else > 1836 ret = recv(sock, &buf, 1, MSG_PEEK|MSG_PARTIAL); > 1837 #endif > (lldb) > Process 55741 stopped > * thread #1: tid = 0x242340, 0x000000010f5ec2fd > postgres`dispatch_validate_conn(sock=61) + 61 at dispatcher.c:1839, queue = > 'com.apple.main-thread', stop reason = step over > frame #0: 0x000000010f5ec2fd postgres`dispatch_validate_conn(sock=61) + > 61 at dispatcher.c:1839 > 1836 ret = recv(sock, &buf, 1, MSG_PEEK|MSG_PARTIAL); > 1837 #endif > 1838 > -> 1839 if (ret == 0) /* socket has been closed. EOF */ > 1840 return false; > 1841 > 1842 if (ret > 0) /* data waiting on socket, it must be OK. 
*/ > (lldb) p ret > (ssize_t) $13 = 1 > {code} > Because recv() returns 1 here, the connection to the killed QE is treated as valid and the QE is reused. So the result of this query is: > {code} > dispatch=# select count(*) from test_dispatch as t1, test_dispatch as t2, > test_dispatch as t3 where t1.id *2 = t2.id and t1.id < t3.id; > ERROR: terminating connection due to administrator command (seg0 > localhost:40000 pid=55771) > {code} -- This message was sent by Atlassian JIRA (v6.3.4#6332)