[
https://issues.apache.org/jira/browse/IMPALA-7931?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Tim Armstrong updated IMPALA-7931:
----------------------------------
Description:
On a recent S3 test run test_shutdown_executor hit a timeout waiting for a
query to reach state FINISHED. Instead the query stays at state 5 (EXCEPTION).
{noformat}
12:51:11 __________________ TestShutdownCommand.test_shutdown_executor
__________________
12:51:11 custom_cluster/test_restart_services.py:209: in test_shutdown_executor
12:51:11 assert self.__fetch_and_get_num_backends(QUERY,
before_shutdown_handle) == 3
12:51:11 custom_cluster/test_restart_services.py:356: in
__fetch_and_get_num_backends
12:51:11 self.client.QUERY_STATES['FINISHED'], timeout=20)
12:51:11 common/impala_service.py:267: in wait_for_query_state
12:51:11 target_state, query_state)
12:51:11 E AssertionError: Did not reach query state in time target=4 actual=5
{noformat}
From the logs I can see that the query fails because one of the executors
becomes unreachable:
{noformat}
I1204 12:31:39.954125 5609 impala-server.cc:1792] Query
a34c3a84775e5599:b2b25eb900000000: Failed due to unreachable impalad(s):
jenkins-worker:22001
{noformat}
The query was {{select count\(*) from functional_parquet.alltypes where
sleep(1) = bool_col}}.
It seems that the query took longer than expected and was still running when
the executor shut down.
I can reproduce by adding a sleep to the test:
{noformat}
diff --git a/tests/custom_cluster/test_restart_services.py
b/tests/custom_cluster/test_restart_services.py
index e441cbc..32bc8a1 100644
--- a/tests/custom_cluster/test_restart_services.py
+++ b/tests/custom_cluster/test_restart_services.py
@@ -206,7 +206,7 @@ class TestShutdownCommand(CustomClusterTestSuite,
HS2TestSuite):
after_shutdown_handle = self.__exec_and_wait_until_running(QUERY)
# Finish executing the first query before the backend exits.
- assert self.__fetch_and_get_num_backends(QUERY, before_shutdown_handle) ==
3
+ assert self.__fetch_and_get_num_backends(QUERY, before_shutdown_handle,
delay=5) == 3
# Wait for the impalad to exit, then start it back up and run another
query, which
# should be scheduled on it again.
@@ -349,11 +349,14 @@ class TestShutdownCommand(CustomClusterTestSuite,
HS2TestSuite):
self.client.QUERY_STATES['RUNNING'], timeout=20)
return handle
- def __fetch_and_get_num_backends(self, query, handle):
+ def __fetch_and_get_num_backends(self, query, handle, delay=0):
"""Fetch the results of 'query' from the beeswax handle 'handle', close the
query and return the number of backends obtained from the profile."""
self.impalad_test_service.wait_for_query_state(self.client, handle,
self.client.QUERY_STATES['FINISHED'], timeout=20)
+ if delay > 0:
+ LOG.info("sleeping for {0}".format(delay))
+ time.sleep(delay)
self.client.fetch(query, handle)
profile = self.client.get_runtime_profile(handle)
self.client.close_query(handle)
{noformat}
was:
On a recent S3 test run test_shutdown_executor hit a timeout waiting for a
query to reach state FINISHED. Instead the query stays at state 5 (EXCEPTION).
{noformat}
12:51:11 __________________ TestShutdownCommand.test_shutdown_executor
__________________
12:51:11 custom_cluster/test_restart_services.py:209: in test_shutdown_executor
12:51:11 assert self.__fetch_and_get_num_backends(QUERY,
before_shutdown_handle) == 3
12:51:11 custom_cluster/test_restart_services.py:356: in
__fetch_and_get_num_backends
12:51:11 self.client.QUERY_STATES['FINISHED'], timeout=20)
12:51:11 common/impala_service.py:267: in wait_for_query_state
12:51:11 target_state, query_state)
12:51:11 E AssertionError: Did not reach query state in time target=4 actual=5
{noformat}
From the logs I can see that the query fails because one of the executors
becomes unreachable:
{noformat}
I1204 12:31:39.954125 5609 impala-server.cc:1792] Query
a34c3a84775e5599:b2b25eb900000000: Failed due to unreachable impalad(s):
jenkins-worker:22001
{noformat}
The query was {{select count\(*) from functional_parquet.alltypes where
sleep(1) = bool_col}}.
It seems that the query took longer than expected and was still running when
the executor shut down.
> test_shutdown_executor fails with timeout waiting for query target state
> ------------------------------------------------------------------------
>
> Key: IMPALA-7931
> URL: https://issues.apache.org/jira/browse/IMPALA-7931
> Project: IMPALA
> Issue Type: Bug
> Components: Infrastructure
> Affects Versions: Impala 3.2.0
> Reporter: Lars Volker
> Assignee: Tim Armstrong
> Priority: Critical
> Labels: broken-build
> Attachments: impala-7931-impalad-logs.tar.gz
>
>
> On a recent S3 test run test_shutdown_executor hit a timeout waiting for a
> query to reach state FINISHED. Instead the query stays at state 5 (EXCEPTION).
> {noformat}
> 12:51:11 __________________ TestShutdownCommand.test_shutdown_executor
> __________________
> 12:51:11 custom_cluster/test_restart_services.py:209: in
> test_shutdown_executor
> 12:51:11 assert self.__fetch_and_get_num_backends(QUERY,
> before_shutdown_handle) == 3
> 12:51:11 custom_cluster/test_restart_services.py:356: in
> __fetch_and_get_num_backends
> 12:51:11 self.client.QUERY_STATES['FINISHED'], timeout=20)
> 12:51:11 common/impala_service.py:267: in wait_for_query_state
> 12:51:11 target_state, query_state)
> 12:51:11 E AssertionError: Did not reach query state in time target=4
> actual=5
> {noformat}
> From the logs I can see that the query fails because one of the executors
> becomes unreachable:
> {noformat}
> I1204 12:31:39.954125 5609 impala-server.cc:1792] Query
> a34c3a84775e5599:b2b25eb900000000: Failed due to unreachable impalad(s):
> jenkins-worker:22001
> {noformat}
> The query was {{select count\(*) from functional_parquet.alltypes where
> sleep(1) = bool_col}}.
> It seems that the query took longer than expected and was still running when
> the executor shut down.
> I can reproduce by adding a sleep to the test:
> {noformat}
> diff --git a/tests/custom_cluster/test_restart_services.py
> b/tests/custom_cluster/test_restart_services.py
> index e441cbc..32bc8a1 100644
> --- a/tests/custom_cluster/test_restart_services.py
> +++ b/tests/custom_cluster/test_restart_services.py
> @@ -206,7 +206,7 @@ class TestShutdownCommand(CustomClusterTestSuite,
> HS2TestSuite):
> after_shutdown_handle = self.__exec_and_wait_until_running(QUERY)
>
> # Finish executing the first query before the backend exits.
> - assert self.__fetch_and_get_num_backends(QUERY, before_shutdown_handle)
> == 3
> + assert self.__fetch_and_get_num_backends(QUERY, before_shutdown_handle,
> delay=5) == 3
>
> # Wait for the impalad to exit, then start it back up and run another
> query, which
> # should be scheduled on it again.
> @@ -349,11 +349,14 @@ class TestShutdownCommand(CustomClusterTestSuite,
> HS2TestSuite):
> self.client.QUERY_STATES['RUNNING'], timeout=20)
> return handle
>
> - def __fetch_and_get_num_backends(self, query, handle):
> + def __fetch_and_get_num_backends(self, query, handle, delay=0):
> """Fetch the results of 'query' from the beeswax handle 'handle', close
> the
> query and return the number of backends obtained from the profile."""
> self.impalad_test_service.wait_for_query_state(self.client, handle,
> self.client.QUERY_STATES['FINISHED'], timeout=20)
> + if delay > 0:
> + LOG.info("sleeping for {0}".format(delay))
> + time.sleep(delay)
> self.client.fetch(query, handle)
> profile = self.client.get_runtime_profile(handle)
> self.client.close_query(handle)
> {noformat}
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]