Repository: incubator-impala Updated Branches: refs/heads/master 4a79c9e7e -> 8146532b6
IMPALA-3794: Workaround for Breakpad ID conflicts Breakpad determines the ID of the minidump file to be written in case of a crash during startup of the process randomly, seeded with the current system time with second granularity. If two impalads start up within the same second, there is a chance for a name conflict. The one second delay between starting impalads in start-impala-cluster.py is not sufficient: I0407 22:34:52.018563 28473 minidump.cc:245] Setting minidump size limit to 20971520. I0407 22:34:52.997046 28749 minidump.cc:245] Setting minidump size limit to 20971520. When sending a signal to all of them, one process can overwrite the minidump of another one. This is an upstream issue and is tracked in Breakpad-681. I further confirmed my suspicion by tentatively making an own output folder for each running instance of impalad and was then unable to reproduce the issue. However, it is a more clear solution to fix the underlying issue than to change the folder locations for minidumps in impala. Until this is fixed upstream, we can make sure that we see at least one minidump for the group of impalads in the test cluster. It is not a product defect, since we don't support running multiple impalads on a single host, let alone starting them all at once. To test this I ran the following loop for about an hour on my dev machine without hitting the issue: while [ $? -eq 0 ]; do impala-py.test tests/custom_cluster/test_breakpad.py --exploration_strategy=exhaustive -k test_minidump_relative_path -x -s; done Change-Id: I4ae589f6eb5cbbfb860943214edc0e6415eeb862 Reviewed-on: http://gerrit.cloudera.org:8080/6588 Reviewed-by: Lars Volker <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/a827e9ed Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/a827e9ed Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/a827e9ed Branch: refs/heads/master Commit: a827e9edc1f93ec0ddb2184133b114182ebb2e02 Parents: 4a79c9e Author: Lars Volker <[email protected]> Authored: Fri Apr 7 13:12:34 2017 +0200 Committer: Impala Public Jenkins <[email protected]> Committed: Sat Apr 8 19:58:25 2017 +0000 ---------------------------------------------------------------------- tests/custom_cluster/test_breakpad.py | 37 ++++++++++++++++++------------ 1 file changed, 22 insertions(+), 15 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a827e9ed/tests/custom_cluster/test_breakpad.py ---------------------------------------------------------------------- diff --git a/tests/custom_cluster/test_breakpad.py b/tests/custom_cluster/test_breakpad.py index 834b969..bf480e9 100644 --- a/tests/custom_cluster/test_breakpad.py +++ b/tests/custom_cluster/test_breakpad.py @@ -37,6 +37,9 @@ class TestBreakpad(CustomClusterTestSuite): writing minidump files on unhandled signals and rotating old minidumps on startup. The tests kill the daemons by sending a SIGSEGV signal. """ + # Limit for the number of minidumps that gets passed to the daemons as a startup flag. + MAX_MINIDUMPS = 2 + @classmethod def get_workload(cls): return 'functional-query' @@ -77,10 +80,11 @@ class TestBreakpad(CustomClusterTestSuite): self._start_impala_cluster(cluster_options) def start_cluster(self): - self.start_cluster_with_args(minidump_path=self.tmp_dir, max_minidumps=2) + self.start_cluster_with_args(minidump_path=self.tmp_dir, + max_minidumps=self.MAX_MINIDUMPS) def start_cluster_without_minidumps(self): - self.start_cluster_with_args(minidump_path='', max_minidumps=2) + self.start_cluster_with_args(minidump_path='', max_minidumps=self.MAX_MINIDUMPS) def kill_cluster(self, signal): self.cluster.refresh() @@ -139,6 +143,18 @@ class TestBreakpad(CustomClusterTestSuite): def count_all_minidumps(self, base_dir=None): return sum((self.count_minidumps(daemon, base_dir) for daemon in DAEMONS)) + def assert_num_minidumps_for_all_daemons(self, base_dir=None): + self.assert_num_logfile_entries(1) + # IMPALA-3794 / Breakpad-681: Weak minidump ID generation can lead to name conflicts, + # so that one process overwrites the minidump of others. See IMPALA-3794 for more + # information. + # TODO: Change this here and elsewhere in this file to expect 'cluster_size' minidumps + # once Breakpad-681 has been fixed. + assert self.count_minidumps('impalad', base_dir) >= 1 + assert self.count_minidumps('statestored', base_dir) == 1 + assert self.count_minidumps('catalogd', base_dir) == 1 + + def assert_num_logfile_entries(self, expected_count): self.assert_impalad_log_contains('INFO', 'Wrote minidump to ', expected_count=expected_count) @@ -153,10 +169,7 @@ class TestBreakpad(CustomClusterTestSuite): assert self.count_all_minidumps() == 0 cluster_size = self.get_num_processes('impalad') self.kill_cluster(SIGSEGV) - self.assert_num_logfile_entries(1) - assert self.count_minidumps('impalad') == cluster_size - assert self.count_minidumps('statestored') == 1 - assert self.count_minidumps('catalogd') == 1 + self.assert_num_minidumps_for_all_daemons() @pytest.mark.execute_serially def test_sigusr1_writes_minidump(self): @@ -175,10 +188,7 @@ class TestBreakpad(CustomClusterTestSuite): self.execute_query_expect_success(client, "SELECT COUNT(*) FROM functional.alltypes") # Kill the cluster. Sending SIGKILL will not trigger minidumps to be written. self.kill_cluster(SIGKILL) - self.assert_num_logfile_entries(1) - assert self.count_minidumps('impalad') == cluster_size - assert self.count_minidumps('statestored') == 1 - assert self.count_minidumps('catalogd') == 1 + self.assert_num_minidumps_for_all_daemons() @pytest.mark.execute_serially def test_minidump_relative_path(self): @@ -193,10 +203,7 @@ class TestBreakpad(CustomClusterTestSuite): assert self.count_all_minidumps(minidump_base_dir) == 0 cluster_size = self.get_num_processes('impalad') self.kill_cluster(SIGSEGV) - self.assert_num_logfile_entries(1) - assert self.count_minidumps('impalad', minidump_base_dir) == cluster_size - assert self.count_minidumps('statestored', minidump_base_dir) == 1 - assert self.count_minidumps('catalogd', minidump_base_dir) == 1 + self.assert_num_minidumps_for_all_daemons(minidump_base_dir) shutil.rmtree(minidump_base_dir) @pytest.mark.execute_serially @@ -207,7 +214,7 @@ class TestBreakpad(CustomClusterTestSuite): self.kill_cluster(SIGSEGV) self.assert_num_logfile_entries(1) self.start_cluster() - expected_impalads = min(self.get_num_processes('impalad'), 2) + expected_impalads = min(self.get_num_processes('impalad'), self.MAX_MINIDUMPS) assert self.count_minidumps('impalad') == expected_impalads assert self.count_minidumps('statestored') == 1 assert self.count_minidumps('catalogd') == 1
