Repository: incubator-impala
Updated Branches:
  refs/heads/master 4a79c9e7e -> 8146532b6


IMPALA-3794: Workaround for Breakpad ID conflicts

During process startup, Breakpad randomly generates the ID of the
minidump file to be written in case of a crash, seeding the generator
with the current system time at second granularity. If two impalads start
up within the same second, there is a chance of a name conflict. The
one-second delay between starting impalads in start-impala-cluster.py is
not sufficient, as these log lines from two impalads show:

I0407 22:34:52.018563 28473 minidump.cc:245] Setting minidump size limit
to 20971520.
I0407 22:34:52.997046 28749 minidump.cc:245] Setting minidump size limit
to 20971520.

When a signal is sent to all of them, one process can overwrite the
minidump of another. This is an upstream issue and is tracked in
Breakpad-681. I further confirmed my suspicion by temporarily giving
each running impalad instance its own output folder, after which I was
unable to reproduce the issue. However, fixing the underlying issue is a
cleaner solution than changing the folder locations for minidumps in
Impala.

Until this is fixed upstream, we can make sure that we see at least one
minidump for the group of impalads in the test cluster. It is not a
product defect, since we don't support running multiple impalads on a
single host, let alone starting them all at once.

To test this I ran the following loop for about an hour on my dev
machine without hitting the issue:

while [ $? -eq 0 ]; do impala-py.test
tests/custom_cluster/test_breakpad.py --exploration_strategy=exhaustive
-k test_minidump_relative_path -x -s; done

Change-Id: I4ae589f6eb5cbbfb860943214edc0e6415eeb862
Reviewed-on: http://gerrit.cloudera.org:8080/6588
Reviewed-by: Lars Volker <[email protected]>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/a827e9ed
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/a827e9ed
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/a827e9ed

Branch: refs/heads/master
Commit: a827e9edc1f93ec0ddb2184133b114182ebb2e02
Parents: 4a79c9e
Author: Lars Volker <[email protected]>
Authored: Fri Apr 7 13:12:34 2017 +0200
Committer: Impala Public Jenkins <[email protected]>
Committed: Sat Apr 8 19:58:25 2017 +0000

----------------------------------------------------------------------
 tests/custom_cluster/test_breakpad.py | 37 ++++++++++++++++++------------
 1 file changed, 22 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a827e9ed/tests/custom_cluster/test_breakpad.py
----------------------------------------------------------------------
diff --git a/tests/custom_cluster/test_breakpad.py 
b/tests/custom_cluster/test_breakpad.py
index 834b969..bf480e9 100644
--- a/tests/custom_cluster/test_breakpad.py
+++ b/tests/custom_cluster/test_breakpad.py
@@ -37,6 +37,9 @@ class TestBreakpad(CustomClusterTestSuite):
   writing minidump files on unhandled signals and rotating old minidumps on 
startup. The
   tests kill the daemons by sending a SIGSEGV signal.
   """
+  # Limit for the number of minidumps that gets passed to the daemons as a 
startup flag.
+  MAX_MINIDUMPS = 2
+
   @classmethod
   def get_workload(cls):
     return 'functional-query'
@@ -77,10 +80,11 @@ class TestBreakpad(CustomClusterTestSuite):
     self._start_impala_cluster(cluster_options)
 
   def start_cluster(self):
-    self.start_cluster_with_args(minidump_path=self.tmp_dir, max_minidumps=2)
+    self.start_cluster_with_args(minidump_path=self.tmp_dir,
+                                 max_minidumps=self.MAX_MINIDUMPS)
 
   def start_cluster_without_minidumps(self):
-    self.start_cluster_with_args(minidump_path='', max_minidumps=2)
+    self.start_cluster_with_args(minidump_path='', 
max_minidumps=self.MAX_MINIDUMPS)
 
   def kill_cluster(self, signal):
     self.cluster.refresh()
@@ -139,6 +143,18 @@ class TestBreakpad(CustomClusterTestSuite):
   def count_all_minidumps(self, base_dir=None):
     return sum((self.count_minidumps(daemon, base_dir) for daemon in DAEMONS))
 
+  def assert_num_minidumps_for_all_daemons(self, base_dir=None):
+    self.assert_num_logfile_entries(1)
+    # IMPALA-3794 / Breakpad-681: Weak minidump ID generation can lead to name 
conflicts,
+    # so that one process overwrites the minidump of others. See IMPALA-3794 
for more
+    # information.
+    # TODO: Change this here and elsewhere in this file to expect 
'cluster_size' minidumps
+    # once Breakpad-681 has been fixed.
+    assert self.count_minidumps('impalad', base_dir) >= 1
+    assert self.count_minidumps('statestored', base_dir) == 1
+    assert self.count_minidumps('catalogd', base_dir) == 1
+
+
   def assert_num_logfile_entries(self, expected_count):
     self.assert_impalad_log_contains('INFO', 'Wrote minidump to ',
         expected_count=expected_count)
@@ -153,10 +169,7 @@ class TestBreakpad(CustomClusterTestSuite):
     assert self.count_all_minidumps() == 0
     cluster_size = self.get_num_processes('impalad')
     self.kill_cluster(SIGSEGV)
-    self.assert_num_logfile_entries(1)
-    assert self.count_minidumps('impalad') == cluster_size
-    assert self.count_minidumps('statestored') == 1
-    assert self.count_minidumps('catalogd') == 1
+    self.assert_num_minidumps_for_all_daemons()
 
   @pytest.mark.execute_serially
   def test_sigusr1_writes_minidump(self):
@@ -175,10 +188,7 @@ class TestBreakpad(CustomClusterTestSuite):
     self.execute_query_expect_success(client, "SELECT COUNT(*) FROM 
functional.alltypes")
     # Kill the cluster. Sending SIGKILL will not trigger minidumps to be 
written.
     self.kill_cluster(SIGKILL)
-    self.assert_num_logfile_entries(1)
-    assert self.count_minidumps('impalad') == cluster_size
-    assert self.count_minidumps('statestored') == 1
-    assert self.count_minidumps('catalogd') == 1
+    self.assert_num_minidumps_for_all_daemons()
 
   @pytest.mark.execute_serially
   def test_minidump_relative_path(self):
@@ -193,10 +203,7 @@ class TestBreakpad(CustomClusterTestSuite):
     assert self.count_all_minidumps(minidump_base_dir) == 0
     cluster_size = self.get_num_processes('impalad')
     self.kill_cluster(SIGSEGV)
-    self.assert_num_logfile_entries(1)
-    assert self.count_minidumps('impalad', minidump_base_dir) == cluster_size
-    assert self.count_minidumps('statestored', minidump_base_dir) == 1
-    assert self.count_minidumps('catalogd', minidump_base_dir) == 1
+    self.assert_num_minidumps_for_all_daemons(minidump_base_dir)
     shutil.rmtree(minidump_base_dir)
 
   @pytest.mark.execute_serially
@@ -207,7 +214,7 @@ class TestBreakpad(CustomClusterTestSuite):
     self.kill_cluster(SIGSEGV)
     self.assert_num_logfile_entries(1)
     self.start_cluster()
-    expected_impalads = min(self.get_num_processes('impalad'), 2)
+    expected_impalads = min(self.get_num_processes('impalad'), 
self.MAX_MINIDUMPS)
     assert self.count_minidumps('impalad') == expected_impalads
     assert self.count_minidumps('statestored') == 1
     assert self.count_minidumps('catalogd') == 1

Reply via email to