Currently an unreachable node can make LURenewCrypto fail completely. This patch adds a unit test for it, and improves the error handling of unreachable nodes in a way, that the rest of the nodes are still handled properly.
Signed-off-by: Helga Velroyen <[email protected]> --- lib/cmdlib/cluster.py | 23 ++++++++++++++---- test/py/cmdlib/cluster_unittest.py | 50 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 5 deletions(-) diff --git a/lib/cmdlib/cluster.py b/lib/cmdlib/cluster.py index 3762489..43ced02 100644 --- a/lib/cmdlib/cluster.py +++ b/lib/cmdlib/cluster.py @@ -152,17 +152,30 @@ class LUClusterRenewCrypto(NoHooksLU): except IOError: pass + node_errors = {} nodes = self.cfg.GetAllNodesInfo() for (node_uuid, node_info) in nodes.items(): if node_info.offline: feedback_fn("* Skipping offline node %s" % node_info.name) continue if node_uuid != master_uuid: - new_digest = CreateNewClientCert(self, node_uuid) - if node_info.master_candidate: - utils.AddNodeToCandidateCerts(node_uuid, - new_digest, - cluster.candidate_certs) + try: + new_digest = CreateNewClientCert(self, node_uuid) + if node_info.master_candidate: + utils.AddNodeToCandidateCerts(node_uuid, + new_digest, + cluster.candidate_certs) + except errors.OpExecError as e: + node_errors[node_uuid] = e + + if node_errors: + msg = ("Some nodes' SSL client certificates could not be renewed." + " Please make sure those nodes are reachable and rerun" + " the operation. The affected nodes and their errors are:\n") + for uuid, e in node_errors.items(): + msg += "Node %s: %s\n" % (uuid, e) + feedback_fn(msg) + utils.RemoveNodeFromCandidateCerts("%s-SERVER" % master_uuid, cluster.candidate_certs) utils.RemoveNodeFromCandidateCerts("%s-OLDMASTER" % master_uuid, diff --git a/test/py/cmdlib/cluster_unittest.py b/test/py/cmdlib/cluster_unittest.py index 87fd9c7..34983d8 100644 --- a/test/py/cmdlib/cluster_unittest.py +++ b/test/py/cmdlib/cluster_unittest.py @@ -2351,6 +2351,56 @@ class TestLUClusterRenewCrypto(CmdlibTestCase): cluster = self.cfg.GetClusterInfo() self.assertFalse(cluster.candidate_certs) + def _partiallyFailingRpc(self, node_uuid, _): + if node_uuid == self._failed_node: + return self.RpcResultsBuilder() \ + .CreateFailedNodeResult(node_uuid) + else: + return self.RpcResultsBuilder() \ + .CreateSuccessfulNodeResult(node_uuid, + [(constants.CRYPTO_TYPE_SSL_DIGEST, self._GetFakeDigest(node_uuid))]) + + @patchPathutils("cluster") + def testNonMasterFails(self, pathutils): + + # patch pathutils to point to temporary files + pathutils.NODED_CERT_FILE = self._node_cert + pathutils.NODED_CLIENT_CERT_FILE = self._client_node_cert + pathutils.NODED_CLIENT_CERT_FILE_TMP = \ + self._client_node_cert_tmp + + # create a few non-master, online nodes + num_nodes = 3 + for _ in range(num_nodes): + self.cfg.AddNewNode() + nodes = self.cfg.GetAllNodesInfo() + + # pick one node as the failing one + master_uuid = self.cfg.GetMasterNode() + self._failed_node = [node_uuid for node_uuid in nodes + if node_uuid != master_uuid][1] + self.rpc.call_node_crypto_tokens = self._partiallyFailingRpc + + op = opcodes.OpClusterRenewCrypto() + self.ExecOpCode(op) + + # Check if the correct certificates exist and don't exist on the master + self.assertTrue(os.path.exists(pathutils.NODED_CERT_FILE)) + self.assertTrue(os.path.exists(pathutils.NODED_CLIENT_CERT_FILE)) + self.assertFalse(os.path.exists(pathutils.NODED_CLIENT_CERT_FILE_TMP)) + + # Check if we have the correct digests in the configuration + cluster = self.cfg.GetClusterInfo() + # There should be one digest missing. + self.assertEqual(num_nodes, len(cluster.candidate_certs)) + nodes = self.cfg.GetAllNodesInfo() + for (node_uuid, _) in nodes.items(): + if node_uuid == self._failed_node: + self.assertTrue(node_uuid not in cluster.candidate_certs) + else: + expected_digest = self._GetFakeDigest(node_uuid) + self.assertEqual(expected_digest, cluster.candidate_certs[node_uuid]) + if __name__ == "__main__": testutils.GanetiTestProgram() -- 2.2.0.rc0.207.ga3a616c
