Repository: cassandra-dtest Updated Branches: refs/heads/schema-migration-upgrade-bug [created] 777c9ab43
Verify that no migration storm happens during an upgrade from 3.0 to 3.11 patch by Robert Stupp; reviewed by Andrés de la Peña for CASSANDRA-14109 Project: http://git-wip-us.apache.org/repos/asf/cassandra-dtest/repo Commit: http://git-wip-us.apache.org/repos/asf/cassandra-dtest/commit/777c9ab4 Tree: http://git-wip-us.apache.org/repos/asf/cassandra-dtest/tree/777c9ab4 Diff: http://git-wip-us.apache.org/repos/asf/cassandra-dtest/diff/777c9ab4 Branch: refs/heads/schema-migration-upgrade-bug Commit: 777c9ab431e9fc2c344e7e4ee41a505a659df2d8 Parents: 3d2a6cc Author: Robert Stupp <[email protected]> Authored: Fri Dec 8 22:29:08 2017 +0100 Committer: Robert Stupp <[email protected]> Committed: Wed Dec 13 14:10:44 2017 +0100 ---------------------------------------------------------------------- upgrade_tests/upgrade_schema_agreement_test.py | 258 ++++++++++++++++++++ 1 file changed, 258 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/cassandra-dtest/blob/777c9ab4/upgrade_tests/upgrade_schema_agreement_test.py ---------------------------------------------------------------------- diff --git a/upgrade_tests/upgrade_schema_agreement_test.py b/upgrade_tests/upgrade_schema_agreement_test.py new file mode 100644 index 0000000..8541411 --- /dev/null +++ b/upgrade_tests/upgrade_schema_agreement_test.py @@ -0,0 +1,258 @@ +import re +import time + +from ccmlib.node import TimeoutError +from dtest import Tester, debug + + +class TestSchemaAgreementUpgrade(Tester): + """ + Verifies that upgrades do not produce migration storms. + + For upgrades from 3.0 to 3.x we have encountered that the beloved 'cdc' column + caused rolling upgrades to produce migration storms, because the schema digest calculated + by 3.11 differs from that in 3.0, because 3.11 included the 'cdc' column in the digest + calculation. + + This test verifies that (after some grace time, i.e. nodes in status NORMAL) neither the + upgraded node nor the other nodes perform schema pulls (migration tasks) resulting in a + "migration storm". + + A few schema migrations however happen before all nodes are in status NORMAL. This is not ideal, + but legit. In an ideal world, those should not happen - but we would have to do the schema + changes atomically - and currently there is just nothing like a concurrent schema change. + + The tests run for a quite some time, about 5 minutes per test. + But there's some debug output for your entertainment - enjoy. + + The node.start/stop calls pass wait_other_notice=False intentionally, because there seem to + be issues in 3.0 to thoroughly detect and _log_ the status change - i.e. the wait_other_notice + check doesn't seem to be reliable - at least not in this test. + """ + + # The number of seconds we wait for schema migration log entries to verify + migration_check_time = 30 + + def __init__(self, *args, **kwargs): + self.ignore_log_patterns = [ + # This one occurs if we do a non-rolling upgrade, the node + # it's trying to send the migration to hasn't started yet, + # and when it does, it gets replayed and everything is fine. + r'Can\'t send migration request: node.*is down', + ] + + Tester.__init__(self, *args, **kwargs) + + def _prepare(self, version, num_nodes=3): + cluster = self.cluster + + # Forcing cluster version on purpose + cluster.set_install_dir(version=version) + cluster.populate(num_nodes).start() + + return cluster + + def _set_verify_log_mark(self, nodes): + for node in nodes: + node.verify_log_mark = node.mark_log(filename='debug.log') + + def _expect_no_schema_migrations(self, nodes): + """ + Inspects the debug.log files from the given nodes whether any log about schema migration is present. + """ + + # Verify that there are _no_ log messages like: + expressions = [" - [pP]ulling schema from endpoint", + " - [Ss]ubmitting migration task", + " - [Pp]ulled schema from endpoint"] + debug("Inspecting log files of {}...".format([n.name for n in nodes])) + all_matchings = "" + for node in nodes: + try: + matchings = node.watch_log_for(expressions, from_mark=node.verify_log_mark, timeout=0, filename='debug.log') + all_matchings = all_matchings + "\n{}: {}".format(node.name, matchings) + except TimeoutError: + # good + debug(" {}: log files don't show schema migration messages (good)".format(node.name)) + if all_matchings != "": + msg = "Expected no schema migration log entries, but got:{}".format(all_matchings) + debug(msg) # debug message for the validation test case (3.0 vs 3.11.1) + self.fail(msg) + + def _wait_for_status_normal(self, node, mark): + # Wait until the node is in state NORMAL (otherwise we can expect + # schema migrations for the first upgraded node). + node.watch_log_for("Node /{} state jump to NORMAL".format(node.address()), + from_mark=mark, timeout=300, filename='debug.log') + + def _bounce_node(self, node): + debug("Bouncing {}...".format(node.name)) + debug(" Stopping...") + node.stop(wait_other_notice=False) # intentionally set to wait_other_notice=False + mark = node.mark_log(filename='debug.log') + debug(" Starting...") + node.start(wait_other_notice=False) # intentionally set to wait_other_notice=False + debug(" Waiting for status NORMAL...") + self._wait_for_status_normal(node, mark) + + def _min_version(self, nodes): + """ + Retrieve the minimum major version (x.y) the given nodes run. + """ + + min_version = 99.9 + for node in nodes: + short_version = node.get_base_cassandra_version() + debug("{} is on {} ({})".format(node.name, short_version, node.get_cassandra_version())) + if short_version < min_version: + min_version = short_version + return min_version + + def _upgrade_schema_agreement_test(self, upgrade_path): + """ + Test the upgrade though the specified versions and verify that there is schema agreement. + In theory, upgrading a 2-node cluster and verifying that might be sufficient, but the intention + of this test is also to catch potential edge cases. Therefore it uses a 3-node cluster and + bounces the upgraded nodes for an additional verification. + + The most effective way to check for schema migration is to inspect the debug log file for + related messages. However, since schema migrations are normal during upgrades and may happen + in a lazy fashion (e.g. potentially delayed by migration-interval), it does not work reliably - + i.e. the test would depend heavily on the test environment (hardware). + + Therefore, at most points, we can only efficiently check the schema versions as shown via + 'nodetool describecluster'. + + If these tests become flaky, tuning the (initial) migration-delay should help. + + :param upgrade_path: two-dimensional array containing the versions and whether each version + logs a '"Gossiping my * schema version' instead of the usual 'Gossiping my + schema version' message, which is only true for 3.11.2 (but not 4.0). + """ + + # prepare the cluster with initial version from the upgrade path + debug('Starting upgrade test with {}'.format(upgrade_path[0][1])) + cluster = self._prepare(version=upgrade_path[0][1]) + + nodes = self.cluster.nodelist() + + # perform _rolling_ upgrades from one version to another + for (gossip_log_with_product_version, version) in upgrade_path[1:]: + debug("") + debug("Upgrading cluster to {}".format(version)) + cluster.set_install_dir(version=version) + + for node in nodes: + other_nodes = [n for n in nodes if n != node] + + debug("") + debug("Stopping {} for upgrade...".format(node.name)) + # needed to "patch" the config file (especially since 4.0) and get the correct version number + node.set_install_dir(version=version) + node.stop(wait_other_notice=False) # intentionally set to wait_other_notice=False + + # remember the logfile-mark when the node was upgraded + upgrade_log_mark = node.mark_log(filename='debug.log') + debug("Starting upgraded {}...".format(node.name)) + node.start(wait_other_notice=False) # intentionally set to wait_other_notice=False + + # wait until the upgraded node is in status NORMAL + self._wait_for_status_normal(node, upgrade_log_mark) + + # If it's a 3.11.2 node, check that the correct schema version is announced + min_version = self._min_version(nodes) + debug("Minimum version: {}".format(min_version)) + if gossip_log_with_product_version: + # 3.11.2 nodes (and only 3.11.2) indicate whether they announce + # a "3.0 compatible" or "real" "3.11" schema version. + watch_part = "Gossiping my {} schema version".format("3.0 compatible" if min_version == 3.0 else "3.11") + debug("Inspecting log for '{}'...".format(watch_part)) + matchings = node.watch_log_for(watch_part, from_mark=upgrade_log_mark, timeout=120, filename='debug.log') + debug(" Found: {}".format(matchings)) + + # Only log the schema information for debug purposes here. Primarily want to catch the + # schema migration race. + for n in nodes: + out, _, _ = n.nodetool("describecluster") + debug("nodetool describecluster of {}:".format(n.name)) + debug(out) + + # We expect no schema migrations at this point. + self._set_verify_log_mark(other_nodes) + debug(" Sleep for {} seconds...".format(self.migration_check_time)) + time.sleep(self.migration_check_time) + self._expect_no_schema_migrations(other_nodes) + + # Try to trigger the schema migration race by bouncing the upgraded node. + # Bouncing a nodes causes a new gossip-digest-'generation', which in turn causes + # the whole endpoint state to propagate - including the schema version, which, in theory, + # should trigger the race. + # It is expected, that the _other_ nodes do not try to pull the schema. + debug("") + debug("Try to trigger schema migration race by bouncing the upgraded node") + self._bounce_node(node) + self._set_verify_log_mark(other_nodes) + debug(" Sleep for {} seconds...".format(self.migration_check_time)) + time.sleep(self.migration_check_time) + self._expect_no_schema_migrations(other_nodes) + + # Even if it was impossible to trigger a schema version race, compare the schema versions. + # Kind of a last resort. Sometimes it's difficult to trigger the race. But we know that we + # only want to have one schema version. + for n in nodes: + out, _, _ = n.nodetool("describecluster") + debug("nodetool describecluster of {}:".format(n.name)) + debug(out) + versions = out.split('Schema versions:')[1].strip() + num_schemas = len(re.findall('\[.*?\]', versions)) + self.assertEqual(num_schemas, 1, "Multiple schema versions detected on {}: {}".format(n.name, out)) + + def upgrade_schema_agreement_30_3112_test(self): + """ + Test the upgrade from 3.0.latest to 3.11.2 and + verify schema agreement and no further migrations. + """ + self._upgrade_schema_agreement_test(upgrade_path=[[False, 'alias:apache/cassandra-3.0'], + [True, 'alias:apache/cassandra-3.11']]) + + def upgrade_schema_agreement_3111_3112_test(self): + """ + Test the upgrade from 3.11.1 to 3.11.2 and + verify schema agreement and no further migrations. + """ + self._upgrade_schema_agreement_test(upgrade_path=[[False, 'alias:apache/cassandra-3.11.1'], + [True, 'alias:apache/cassandra-3.11']]) + + def upgrade_schema_agreement_30latest_3111_test(self): + """ + Cross-check that the dtest still works (it expects the assertion on the log files). + + If this test becomes flaky, check whether the time to verify (self.migration_check_time) the log files is + still good enough. However, as long as the vast majority of runs of this tests succeeds, the functionality + itself works. + """ + with self.assertRaises(AssertionError, msg="Expected no schema migration log entries for the last {} seconds".format(self.migration_check_time)): + self._upgrade_schema_agreement_test(upgrade_path=[[False, 'alias:apache/cassandra-3.0'], + [False, 'alias:apache/cassandra-3.11.1']]) + + # REVIEWER NOTICE: + # The following tests should not make it into the dtest repo. The above tests using cassandra-3.11 cannot work + # until the corresponding is committed. + + def upgrade_schema_agreement_ok30_test(self): + """ + Test the upgrade from 3.0 to 3.11.2 and verify that there is no migration storm. + + TO BE CLEAR: THIS TEST IS INTENDED TO FAIL AND NOT INTENDED TO BE COMMITTED! + """ + self._upgrade_schema_agreement_test(upgrade_path=[[False, 'alias:apache/cassandra-3.0'], + [True, 'alias:snazy/schema-migration-upgrade-bug-trunk']]) + + def upgrade_schema_agreement_ok3112_test(self): + """ + Test the upgrade from 3.0 to 3.11.2 and verify that there is no migration storm. + + TO BE CLEAR: THIS TEST IS INTENDED TO FAIL AND NOT INTENDED TO BE COMMITTED! + """ + self._upgrade_schema_agreement_test(upgrade_path=[[False, 'alias:apache/cassandra-3.11'], + [True, 'alias:snazy/schema-migration-upgrade-bug-3.11']]) --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
