Repository: cassandra-dtest
Updated Branches:
  refs/heads/schema-migration-upgrade-bug [created] 777c9ab43


Verify that no migration storm happens during an upgrade from 3.0 to 3.11

patch by Robert Stupp; reviewed by Andrés de la Peña for CASSANDRA-14109


Project: http://git-wip-us.apache.org/repos/asf/cassandra-dtest/repo
Commit: http://git-wip-us.apache.org/repos/asf/cassandra-dtest/commit/777c9ab4
Tree: http://git-wip-us.apache.org/repos/asf/cassandra-dtest/tree/777c9ab4
Diff: http://git-wip-us.apache.org/repos/asf/cassandra-dtest/diff/777c9ab4

Branch: refs/heads/schema-migration-upgrade-bug
Commit: 777c9ab431e9fc2c344e7e4ee41a505a659df2d8
Parents: 3d2a6cc
Author: Robert Stupp <[email protected]>
Authored: Fri Dec 8 22:29:08 2017 +0100
Committer: Robert Stupp <[email protected]>
Committed: Wed Dec 13 14:10:44 2017 +0100

----------------------------------------------------------------------
 upgrade_tests/upgrade_schema_agreement_test.py | 258 ++++++++++++++++++++
 1 file changed, 258 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cassandra-dtest/blob/777c9ab4/upgrade_tests/upgrade_schema_agreement_test.py
----------------------------------------------------------------------
diff --git a/upgrade_tests/upgrade_schema_agreement_test.py b/upgrade_tests/upgrade_schema_agreement_test.py
new file mode 100644
index 0000000..8541411
--- /dev/null
+++ b/upgrade_tests/upgrade_schema_agreement_test.py
@@ -0,0 +1,258 @@
+import re
+import time
+
+from ccmlib.node import TimeoutError
+from dtest import Tester, debug
+
+
+class TestSchemaAgreementUpgrade(Tester):
+    """
+    Verifies that upgrades do not produce migration storms.
+
+    For upgrades from 3.0 to 3.x we have seen that the beloved 'cdc' column causes rolling
+    upgrades to produce migration storms, because the schema digest calculated by 3.11 differs
+    from the one calculated by 3.0: 3.11 includes the 'cdc' column in the digest calculation.
+
+    This test verifies that (after some grace time, i.e. nodes in status NORMAL) neither the
+    upgraded node nor the other nodes perform schema pulls (migration tasks) resulting in a
+    "migration storm".
+
+    A few schema migrations do, however, happen before all nodes are in status NORMAL. This is
+    not ideal, but legitimate. In an ideal world, those should not happen - but we would have to
+    apply the schema changes atomically, and currently there is no such mechanism for a
+    concurrent schema change.
+
+    The tests run for quite some time, about 5 minutes per test.
+    But there's some debug output for your entertainment - enjoy.
+
+    The node.start/stop calls pass wait_other_notice=False intentionally, because there seem to
+    be issues in 3.0 with reliably detecting and _logging_ the status change - i.e. the
+    wait_other_notice check does not appear to be reliable, at least not in this test.
+
+    # The number of seconds we watch for schema migration log entries during verification
+    migration_check_time = 30
+
+    def __init__(self, *args, **kwargs):
+        self.ignore_log_patterns = [
+            # This one occurs if we do a non-rolling upgrade, the node
+            # it's trying to send the migration to hasn't started yet,
+            # and when it does, it gets replayed and everything is fine.
+            r'Can\'t send migration request: node.*is down',
+        ]
+
+        Tester.__init__(self, *args, **kwargs)
+
+    def _prepare(self, version, num_nodes=3):
+        cluster = self.cluster
+
+        # Forcing cluster version on purpose
+        cluster.set_install_dir(version=version)
+        cluster.populate(num_nodes).start()
+
+        return cluster
+
+    def _set_verify_log_mark(self, nodes):
+        for node in nodes:
+            node.verify_log_mark = node.mark_log(filename='debug.log')
+
+    def _expect_no_schema_migrations(self, nodes):
+        """
+        Inspects the debug.log files of the given nodes to check whether any log entries
+        about schema migrations are present.
+        """
+
+        # Verify that there are _no_ log messages like:
+        expressions = [" - [pP]ulling schema from endpoint",
+                       " - [Ss]ubmitting migration task",
+                       " - [Pp]ulled schema from endpoint"]
+        debug("Inspecting log files of {}...".format([n.name for n in nodes]))
+        all_matchings = ""
+        for node in nodes:
+            try:
+                matchings = node.watch_log_for(expressions, from_mark=node.verify_log_mark, timeout=0, filename='debug.log')
+                all_matchings = all_matchings + "\n{}: {}".format(node.name, matchings)
+            except TimeoutError:
+                # good
+                debug("  {}: log files don't show schema migration messages (good)".format(node.name))
+        if all_matchings != "":
+            msg = "Expected no schema migration log entries, but got:{}".format(all_matchings)
+            debug(msg)  # debug message for the validation test case (3.0 vs 3.11.1)
+            self.fail(msg)
+
+    def _wait_for_status_normal(self, node, mark):
+        # Wait until the node is in state NORMAL (otherwise we can expect
+        # schema migrations for the first upgraded node).
+        node.watch_log_for("Node /{} state jump to NORMAL".format(node.address()),
+                           from_mark=mark, timeout=300, filename='debug.log')
+
+    def _bounce_node(self, node):
+        debug("Bouncing {}...".format(node.name))
+        debug("  Stopping...")
+        node.stop(wait_other_notice=False)  # intentionally set to wait_other_notice=False
+        mark = node.mark_log(filename='debug.log')
+        debug("  Starting...")
+        node.start(wait_other_notice=False)  # intentionally set to wait_other_notice=False
+        debug("  Waiting for status NORMAL...")
+        self._wait_for_status_normal(node, mark)
+
+    def _min_version(self, nodes):
+        """
+        Retrieve the minimum major version (x.y) the given nodes run.
+        """
+
+        min_version = 99.9
+        for node in nodes:
+            short_version = node.get_base_cassandra_version()
+            debug("{} is on {} ({})".format(node.name, short_version, node.get_cassandra_version()))
+            if short_version < min_version:
+                min_version = short_version
+        return min_version
+
+    def _upgrade_schema_agreement_test(self, upgrade_path):
+        """
+        Test the upgrade through the specified versions and verify that there is schema agreement.
+        In theory, upgrading a 2-node cluster and verifying that might be sufficient, but the
+        intention of this test is also to catch potential edge cases. Therefore it uses a 3-node
+        cluster and bounces the upgraded nodes for an additional verification.
+
+        The most effective way to check for schema migrations is to inspect the debug log file for
+        related messages. However, since schema migrations are normal during upgrades and may happen
+        in a lazy fashion (e.g. potentially delayed by migration-interval), this does not work
+        reliably - i.e. the test would depend heavily on the test environment (hardware).
+
+        Therefore, at most points, we can only efficiently check the schema versions as shown via
+        'nodetool describecluster'.
+
+        If these tests become flaky, tuning the (initial) migration-delay should help.
+
+        :param upgrade_path: list of [flag, version] pairs; the flag indicates whether that version
+                             logs a 'Gossiping my * schema version' message instead of the usual
+                             'Gossiping my schema version' message, which is only true for 3.11.2
+                             (but not 4.0).
+        """
+
+        # prepare the cluster with initial version from the upgrade path
+        debug('Starting upgrade test with {}'.format(upgrade_path[0][1]))
+        cluster = self._prepare(version=upgrade_path[0][1])
+
+        nodes = self.cluster.nodelist()
+
+        # perform _rolling_ upgrades from one version to another
+        for (gossip_log_with_product_version, version) in upgrade_path[1:]:
+            debug("")
+            debug("Upgrading cluster to {}".format(version))
+            cluster.set_install_dir(version=version)
+
+            for node in nodes:
+                other_nodes = [n for n in nodes if n != node]
+
+                debug("")
+                debug("Stopping {} for upgrade...".format(node.name))
+                # needed to "patch" the config file (especially since 4.0) and get the correct version number
+                node.set_install_dir(version=version)
+                node.stop(wait_other_notice=False)  # intentionally set to wait_other_notice=False
+
+                # remember the logfile-mark when the node was upgraded
+                upgrade_log_mark = node.mark_log(filename='debug.log')
+                debug("Starting upgraded {}...".format(node.name))
+                node.start(wait_other_notice=False)  # intentionally set to wait_other_notice=False
+
+                # wait until the upgraded node is in status NORMAL
+                self._wait_for_status_normal(node, upgrade_log_mark)
+
+                # If it's a 3.11.2 node, check that the correct schema version is announced
+                min_version = self._min_version(nodes)
+                debug("Minimum version: {}".format(min_version))
+                if gossip_log_with_product_version:
+                    # 3.11.2 nodes (and only 3.11.2) indicate whether they announce
+                    # a "3.0 compatible" or "real" "3.11" schema version.
+                    watch_part = "Gossiping my {} schema version".format("3.0 compatible" if min_version == 3.0 else "3.11")
+                    debug("Inspecting log for '{}'...".format(watch_part))
+                    matchings = node.watch_log_for(watch_part, from_mark=upgrade_log_mark, timeout=120, filename='debug.log')
+                    debug("  Found: {}".format(matchings))
+
+                # Only log the schema information for debug purposes here. Primarily want to catch the
+                # schema migration race.
+                for n in nodes:
+                    out, _, _ = n.nodetool("describecluster")
+                    debug("nodetool describecluster of {}:".format(n.name))
+                    debug(out)
+
+                # We expect no schema migrations at this point.
+                self._set_verify_log_mark(other_nodes)
+                debug("  Sleep for {} seconds...".format(self.migration_check_time))
+                time.sleep(self.migration_check_time)
+                self._expect_no_schema_migrations(other_nodes)
+
+                # Try to trigger the schema migration race by bouncing the upgraded node.
+                # Bouncing a node causes a new gossip-digest-'generation', which in turn causes
+                # the whole endpoint state to propagate - including the schema version, which,
+                # in theory, should trigger the race.
+                # It is expected that the _other_ nodes do not try to pull the schema.
+                debug("")
+                debug("Try to trigger schema migration race by bouncing the upgraded node")
+                self._bounce_node(node)
+                self._set_verify_log_mark(other_nodes)
+                debug("  Sleep for {} seconds...".format(self.migration_check_time))
+                time.sleep(self.migration_check_time)
+                self._expect_no_schema_migrations(other_nodes)
+
+                # Even if it was impossible to trigger a schema version race, compare the schema versions.
+                # This is kind of a last resort: sometimes it is difficult to trigger the race, but we
+                # know that we only want to see a single schema version.
+                for n in nodes:
+                    out, _, _ = n.nodetool("describecluster")
+                    debug("nodetool describecluster of {}:".format(n.name))
+                    debug(out)
+                    versions = out.split('Schema versions:')[1].strip()
+                    num_schemas = len(re.findall(r'\[.*?\]', versions))
+                    self.assertEqual(num_schemas, 1, "Multiple schema versions detected on {}: {}".format(n.name, out))
+
+    def upgrade_schema_agreement_30_3112_test(self):
+        """
+        Test the upgrade from 3.0.latest to 3.11.2 and
+        verify schema agreement and no further migrations.
+        """
+        self._upgrade_schema_agreement_test(upgrade_path=[[False, 'alias:apache/cassandra-3.0'],
+                                                          [True, 'alias:apache/cassandra-3.11']])
+
+    def upgrade_schema_agreement_3111_3112_test(self):
+        """
+        Test the upgrade from 3.11.1 to 3.11.2 and
+        verify schema agreement and no further migrations.
+        """
+        self._upgrade_schema_agreement_test(upgrade_path=[[False, 'alias:apache/cassandra-3.11.1'],
+                                                          [True, 'alias:apache/cassandra-3.11']])
+
+    def upgrade_schema_agreement_30latest_3111_test(self):
+        """
+        Cross-check that the dtest itself still works (i.e. it expects the assertion on the
+        log files to be raised).
+
+        If this test becomes flaky, check whether the time to verify the log files
+        (self.migration_check_time) is still long enough. However, as long as the vast majority
+        of runs of this test succeed, the functionality itself works.
+        """
+        with self.assertRaises(AssertionError, msg="Expected no schema migration log entries for the last {} seconds".format(self.migration_check_time)):
+            self._upgrade_schema_agreement_test(upgrade_path=[[False, 'alias:apache/cassandra-3.0'],
+                                                              [False, 'alias:apache/cassandra-3.11.1']])
+
+    # REVIEWER NOTICE:
+    # The following tests should not make it into the dtest repo. The above tests using
+    # cassandra-3.11 cannot work until the corresponding patch is committed.
+
+    def upgrade_schema_agreement_ok30_test(self):
+        """
+        Test the upgrade from 3.0 to 3.11.2 and verify that there is no migration storm.
+
+        TO BE CLEAR: THIS TEST IS INTENDED TO FAIL AND NOT INTENDED TO BE COMMITTED!
+        """
+        self._upgrade_schema_agreement_test(upgrade_path=[[False, 'alias:apache/cassandra-3.0'],
+                                                          [True, 'alias:snazy/schema-migration-upgrade-bug-trunk']])
+
+    def upgrade_schema_agreement_ok3112_test(self):
+        """
+        Test the upgrade from 3.11 to 3.11.2 and verify that there is no migration storm.
+
+        TO BE CLEAR: THIS TEST IS INTENDED TO FAIL AND NOT INTENDED TO BE COMMITTED!
+        """
+        self._upgrade_schema_agreement_test(upgrade_path=[[False, 'alias:apache/cassandra-3.11'],
+                                                          [True, 'alias:snazy/schema-migration-upgrade-bug-3.11']])

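For reviewers who want to see what the final describecluster assertion keys on, here is a minimal,
self-contained sketch (not part of the patch) of the same parsing idea, run against a hypothetical,
abbreviated 'nodetool describecluster' output. SAMPLE_OUT and count_schema_versions are illustrative
names assumed only for this sketch:

    import re

    # Hypothetical, abbreviated 'nodetool describecluster' output; real output
    # contains more sections, but the 'Schema versions:' block looks like this.
    SAMPLE_OUT = """Cluster Information:
            Name: test
            Snitch: org.apache.cassandra.locator.SimpleSnitch
            Schema versions:
                    2207c2a9-f598-3971-986b-2926e09e239d: [127.0.0.1, 127.0.0.2]
                    8af05d21-0001-3d21-9f68-4ed12ff9fd13: [127.0.0.3]
    """

    def count_schema_versions(describecluster_out):
        # Same approach as the test above: take everything after 'Schema versions:'
        # and count the '[...]' node lists, one per distinct schema version.
        versions = describecluster_out.split('Schema versions:')[1].strip()
        return len(re.findall(r'\[.*?\]', versions))

    print(count_schema_versions(SAMPLE_OUT))  # prints 2, i.e. schema disagreement

The test asserts that this count is exactly 1 on every node after each upgrade step.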
