Hi! On Thu, 23 Apr 2015 at 11:29 Hrvoje Ribicic <[email protected]> wrote:
> On Fri, Apr 17, 2015 at 4:37 PM, 'Helga Velroyen' via ganeti-devel < > [email protected]> wrote: > >> When removing an SSH key, RemoveNodeSshKey so far did only >> try to contact each node once. This made the entire >> procedure not very resilient towards temporary network >> failures. This patch adds a couple of retries for contacting >> each node and only gives up after those are exhausted. >> >> Signed-off-by: Helga Velroyen <[email protected]> >> --- >> lib/backend.py | 101 >> ++++++++++++++++++++++++------------- >> lib/cmdlib/node.py | 21 ++++++-- >> test/py/ganeti.backend_unittest.py | 65 ++++++++++++++++++++++++ >> 3 files changed, 148 insertions(+), 39 deletions(-) >> >> diff --git a/lib/backend.py b/lib/backend.py >> index 8ec22d9..c3190d2 100644 >> --- a/lib/backend.py >> +++ b/lib/backend.py >> @@ -1660,12 +1660,15 @@ def RemoveNodeSshKey(node_uuid, node_name, >> @returns: list of feedback messages >> >> """ >> - result_msgs = [] >> + # Non-disruptive error messages, mapped from node name to message >> + result_msgs = {} >> + # maximum number of retries for SSH connections >> + max_retries = 3 >> >> # Make sure at least one of these flags is true. >> if not (from_authorized_keys or from_public_keys or >> clear_authorized_keys >> or clear_public_keys): >> - result_msgs.append("No removal from any key file was requested.") >> + raise errors.SshUpdateError("No removal from any key file was >> requested.") >> >> if not ssconf_store: >> ssconf_store = ssconf.SimpleStore() >> @@ -1717,8 +1720,11 @@ def RemoveNodeSshKey(node_uuid, node_name, >> >> all_nodes = ssconf_store.GetNodeList() >> online_nodes = ssconf_store.GetOnlineNodeList() >> + logging.debug("Removing key of node '%s' from all nodes but itself >> and" >> + " master.", node_name) >> for node in all_nodes: >> if node == master_node: >> + logging.debug("Skipping master node '%s'.", master_node) >> continue >> if node not in online_nodes: >> logging.debug("Skipping offline node '%s'.", node) >> @@ -1729,24 +1735,48 @@ def RemoveNodeSshKey(node_uuid, node_name, >> " node '%s', map: %s." % >> (node, ssh_port_map)) >> if node in potential_master_candidates: >> - try: >> - run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE, >> - ssh_port, pot_mc_data, >> - debug=False, verbose=False, use_cluster_key=False, >> - ask_key=False, strict_host_check=False) >> - except errors.OpExecError as e: >> - result_msgs.append("Warning: the SSH setup of node '%s' >> could not" >> - " be adjusted." % node) >> - else: >> - if from_authorized_keys: >> + error_msg_try = ("The SSH setup of node '%s' could not" >> + " be adjusted in try no %s. Error: %s.") >> + error_msg_final = ("When removing the key of node '%s', >> updating the" >> + " SSH key files of node '%s' failed after >> %s" >> + " retries. Not trying again. Last error >> was: %s.") >> > > Looking at the code, it seems this has to go before the if as it is used > in the else as well. > Ack. > > >> + last_exception = None >> + for i in range(max_retries): >> try: >> run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE, >> - ssh_port, base_data, >> + ssh_port, pot_mc_data, >> debug=False, verbose=False, >> use_cluster_key=False, >> ask_key=False, strict_host_check=False) >> + break >> except errors.OpExecError as e: >> - result_msgs.append("Warning: the SSH setup of node '%s' >> could" >> - " not be adjusted." % node) >> + logging.error(error_msg_try, node, i, e) >> + last_exception = e >> + else: >> + if last_exception: >> + error_msg = error_msg_final % (node_name, node, >> max_retries, >> + last_exception) >> + result_msgs[node] = error_msg >> + logging.error(error_msg) >> + >> + else: >> + last_exception = None >> + if from_authorized_keys: >> + for i in range(max_retries): >> + try: >> + run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE, >> + ssh_port, base_data, >> + debug=False, verbose=False, >> use_cluster_key=False, >> + ask_key=False, strict_host_check=False) >> + break >> + except errors.OpExecError as e: >> + logging.error(error_msg_try, node, i, e) >> + last_exception = e >> + else: >> + if last_exception: >> + error_msg = error_msg_final % (node_name, node, >> max_retries, >> + last_exception) >> + result_msgs[node] = error_msg >> + logging.error(error_msg) > > >> if clear_authorized_keys or from_public_keys or clear_public_keys: >> data = {} >> @@ -1790,9 +1820,10 @@ def RemoveNodeSshKey(node_uuid, node_name, >> debug=False, verbose=False, use_cluster_key=False, >> ask_key=False, strict_host_check=False) >> except errors.OpExecError as e: >> - result_msgs.append("Removing SSH keys from node '%s' failed. This >> can" >> - " happen when the node is already unreachable." >> - " Error: %s" % (node_name, e)) >> + result_msgs[node_name] = \ >> + ("Removing SSH keys from node '%s' failed. This can" >> + " happen when the node is already unreachable." >> + " Error: %s" % (node_name, e)) >> >> return result_msgs >> >> @@ -1980,15 +2011,14 @@ def RenewSshKeys(node_uuids, node_names, >> ssh_port_map, >> # and that would terminate all communication from the master to >> the >> # node. >> logging.debug("Removing SSH key of node '%s'.", node_name) >> - RemoveNodeSshKey(node_uuid, node_name, >> - master_candidate_uuids, >> - potential_master_candidates, >> - ssh_port_map, >> - master_uuid=master_node_uuid, >> - from_authorized_keys=master_candidate, >> - from_public_keys=False, >> - clear_authorized_keys=False, >> - clear_public_keys=False) >> + node_errors = RemoveNodeSshKey( >> + node_uuid, node_name, master_candidate_uuids, >> + potential_master_candidates, ssh_port_map, >> + master_uuid=master_node_uuid, >> from_authorized_keys=master_candidate, >> + from_public_keys=False, clear_authorized_keys=False, >> + clear_public_keys=False) >> + if node_errors: >> + all_node_errors = all_node_errors + [node_errors.items()] >> else: >> logging.debug("Old key of node '%s' is the same as the current >> master" >> " key. Not deleting that key on the node.", >> node_name) >> @@ -2070,15 +2100,14 @@ def RenewSshKeys(node_uuids, node_names, >> ssh_port_map, >> >> # Remove the old key from all node's authorized keys file >> logging.debug("Remove the old master key from all nodes.") >> - RemoveNodeSshKey(master_node_uuid, master_node_name, >> - master_candidate_uuids, >> - potential_master_candidates, >> - ssh_port_map, >> - keys_to_remove=old_master_keys_by_uuid, >> - from_authorized_keys=True, >> - from_public_keys=False, >> - clear_authorized_keys=False, >> - clear_public_keys=False) >> + node_errors = RemoveNodeSshKey( >> + master_node_uuid, master_node_name, master_candidate_uuids, >> + potential_master_candidates, ssh_port_map, >> + keys_to_remove=old_master_keys_by_uuid, from_authorized_keys=True, >> + from_public_keys=False, clear_authorized_keys=False, >> + clear_public_keys=False) >> + if node_errors: >> + all_node_errors = all_node_errors + node_errors.items() >> >> return all_node_errors >> >> diff --git a/lib/cmdlib/node.py b/lib/cmdlib/node.py >> index 6a13d8a..8453b33 100644 >> --- a/lib/cmdlib/node.py >> +++ b/lib/cmdlib/node.py >> @@ -366,6 +366,12 @@ class LUNodeAdd(LogicalUnit): >> remove_result[master_node].Raise( >> "Could not remove SSH keys of node %s before readding," >> " (UUID: %s)." % (new_node_name, new_node_uuid)) >> > > As with previous patches, please insert space, use helper function, alter > message. > I added a helper function and used it here. I don't really get the request for spaces. It is calling the rpc, raising serious exceptions, then evaluating the other output. I think that pretty much belongs together as a block. > > >> + result_msgs = remove_result[master_node].payload >> + if result_msgs: >> + feedback_fn("Removing old SSH key of node '%s' failed on some >> nodes." >> + % new_node_name) >> + for node, error_msg in result_msgs.items(): >> + feedback_fn("%s: %s" % (node, error_msg)) >> >> result = rpcrunner.call_node_ssh_key_add( >> [master_node], new_node_uuid, new_node_name, >> @@ -885,12 +891,15 @@ class LUNodeSetParams(LogicalUnit): >> False, # currently, all nodes are potential master candidates >> False, # do not clear node's 'authorized_keys' >> False) # do not clear node's 'ganeti_pub_keys' >> - if not ssh_result[master_node].fail_msg: >> - for message in ssh_result[master_node].payload: >> - feedback_fn(message) >> ssh_result[master_node].Raise( >> "Could not adjust the SSH setup after demoting node '%s'" >> " (UUID: %s)." % (node.name, node.uuid)) >> > > As above. > See above. > > >> + if not ssh_result[master_node].fail_msg: >> + if ssh_result[master_node].payload: >> + feedback_fn("When demoting node '%s', updating SSH keys on" >> + " some nodes failed." % (node.name)) >> + for node, message in ssh_result[master_node].payload: >> + feedback_fn("%s: %s" % (node, message)) >> >> if self.new_role == self._ROLE_CANDIDATE: >> ssh_result = self.rpc.call_node_ssh_key_add( >> @@ -1603,6 +1612,12 @@ class LUNodeRemove(LogicalUnit): >> result[master_node].Raise( >> "Could not remove the SSH key of node '%s' (UUID: %s)." % >> (self.op.node_name, self.node.uuid)) >> + result_msgs = result[master_node].payload >> + if result_msgs: >> + feedback_fn("Removing old SSH key of node '%s' failed on some >> nodes." >> + % self.op.node_name) >> + for node, error_msg in result_msgs.items(): >> + feedback_fn("%s: %s" % (node, error_msg)) >> >> # Promote nodes to master candidate as needed >> AdjustCandidatePool(self, [self.node.uuid]) >> diff --git a/test/py/ganeti.backend_unittest.py b/test/py/ >> ganeti.backend_unittest.py >> index e05580d..3c207f8 100755 >> --- a/test/py/ganeti.backend_unittest.py >> +++ b/test/py/ganeti.backend_unittest.py >> @@ -1563,6 +1563,71 @@ class >> TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase): >> >> self.assertTrue(node_errors[other_node_name]) >> >> + def testRemoveKeySuccessfullyWithRetriesOnOtherNode(self): >> + """Test removing keys even if one of the old nodes needs retries. >> + >> + This tests checks whether a key can be removed successfully even >> + when one of the other nodes needs to be contacted with several >> + retries. >> + >> + """ >> + all_master_candidates = >> self._ssh_file_manager.GetAllMasterCandidates() >> + node_name, node_uuid, node_key, _, _, _ = all_master_candidates[0] >> + other_node_name, _, _, _, _, _ = all_master_candidates[1] >> + assert node_name != self._master_node >> + assert other_node_name != self._master_node >> + assert node_name != other_node_name >> + self._ssh_file_manager.SetMaxRetries(other_node_name, 3) >> + >> + backend.RemoveNodeSshKey(node_uuid, node_name, >> + self._master_candidate_uuids, >> + self._potential_master_candidates, >> + self._ssh_port_map, >> + from_authorized_keys=True, >> + from_public_keys=True, >> + clear_authorized_keys=True, >> + clear_public_keys=True, >> + pub_key_file=self._pub_key_file, >> + ssconf_store=self._ssconf_mock, >> + noded_cert_file=self.noded_cert_file, >> + run_cmd_fn=self._run_cmd_mock) >> + >> + self.assertTrue(self._ssh_file_manager.NoNodeHasPublicKey( >> + node_uuid, node_key)) >> + >> self.assertTrue(self._ssh_file_manager.NoNodeHasAuthorizedKey(node_key)) >> + >> + def testRemoveKeyFailedWithRetriesOnOtherNode(self): >> + """Test removing keys even if one of the old nodes fails even with >> retries. >> + >> + This tests checks whether the removal of a key finishes properly, >> even if >> + the update of the key files on one of the other nodes fails despite >> several >> + retries. >> + >> + """ >> + all_master_candidates = >> self._ssh_file_manager.GetAllMasterCandidates() >> + node_name, node_uuid, node_key, _, _, _ = all_master_candidates[0] >> + other_node_name, _, _, _, _, _ = all_master_candidates[1] >> + assert node_name != self._master_node >> + assert other_node_name != self._master_node >> + assert node_name != other_node_name >> + self._ssh_file_manager.SetMaxRetries(other_node_name, 4) >> + >> + error_msgs = backend.RemoveNodeSshKey( >> + node_uuid, node_name, self._master_candidate_uuids, >> + self._potential_master_candidates, self._ssh_port_map, >> + from_authorized_keys=True, from_public_keys=True, >> + clear_authorized_keys=True, clear_public_keys=True, >> + pub_key_file=self._pub_key_file, ssconf_store=self._ssconf_mock, >> + noded_cert_file=self.noded_cert_file, >> run_cmd_fn=self._run_cmd_mock) >> + >> + for node in self._all_nodes: >> + if node == other_node_name: >> + self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey( >> + node, node_key)) >> + else: >> + self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey( >> + node, node_key)) >> + self.assertTrue(error_msgs[other_node_name]) >> >> class TestVerifySshSetup(testutils.GanetiTestCase): >> >> -- >> 2.2.0.rc0.207.ga3a616c >> >> Interdiff: commit cdd4f00f57d5610dd467ba1c3562d3d89ff6eca0 Author: Helga Velroyen <[email protected]> Date: Thu Apr 23 12:29:06 2015 +0200 interdiff diff --git a/lib/backend.py b/lib/backend.py index 87c8c77..bdc7d3e 100644 --- a/lib/backend.py +++ b/lib/backend.py @@ -1659,8 +1659,8 @@ def RemoveNodeSshKey(node_uuid, node_name, @returns: list of feedback messages """ - # Non-disruptive error messages, mapped from node name to message - result_msgs = {} + # Non-disruptive error messages, list of (node, msg) pairs + result_msgs = [] # Make sure at least one of these flags is true. if not (from_authorized_keys or from_public_keys or clear_authorized_keys @@ -1731,13 +1731,12 @@ def RemoveNodeSshKey(node_uuid, node_name, raise errors.OpExecError("No SSH port information available for" " node '%s', map: %s." % (node, ssh_port_map)) + error_msg_try = ("The SSH setup of node '%s' could not" + " be adjusted in try no %s. Error: %s.") + error_msg_final = ("When removing the key of node '%s', updating the" + " SSH key files of node '%s' failed after %s" + " retries. Not trying again. Last error was: %s.") if node in potential_master_candidates: - error_msg_try = ("The SSH setup of node '%s' could not" - " be adjusted in try no %s. Error: %s.") - error_msg_final = ("When removing the key of node '%s', updating the" - " SSH key files of node '%s' failed after %s" - " retries. Not trying again. Last error was: %s.") - last_exception = None for i in range(constants.SSHS_MAX_RETRIES): try: run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE, @@ -1752,7 +1751,7 @@ def RemoveNodeSshKey(node_uuid, node_name, if last_exception: error_msg = error_msg_final % ( node_name, node, constants.SSHS_MAX_RETRIES, last_exception) - result_msgs[node] = error_msg + result_msgs.append((node, error_msg)) logging.error(error_msg) else: @@ -1772,7 +1771,7 @@ def RemoveNodeSshKey(node_uuid, node_name, if last_exception: error_msg = error_msg_final % ( node_name, node, constants.SSHS_MAX_RETRIES, last_exception) - result_msgs[node] = error_msg + result_msgs.append((node, error_msg)) logging.error(error_msg) if clear_authorized_keys or from_public_keys or clear_public_keys: @@ -2015,7 +2014,7 @@ def RenewSshKeys(node_uuids, node_names, ssh_port_map, from_public_keys=False, clear_authorized_keys=False, clear_public_keys=False) if node_errors: - all_node_errors = all_node_errors + [node_errors.items()] + all_node_errors = all_node_errors + node_errors else: logging.debug("Old key of node '%s' is the same as the current master" " key. Not deleting that key on the node.", node_name) @@ -2104,7 +2103,7 @@ def RenewSshKeys(node_uuids, node_names, ssh_port_map, from_public_keys=False, clear_authorized_keys=False, clear_public_keys=False) if node_errors: - all_node_errors = all_node_errors + node_errors.items() + all_node_errors = all_node_errors + node_errors return all_node_errors diff --git a/lib/cmdlib/node.py b/lib/cmdlib/node.py index 337290e..41451b3 100644 --- a/lib/cmdlib/node.py +++ b/lib/cmdlib/node.py @@ -366,12 +366,7 @@ class LUNodeAdd(LogicalUnit): remove_result[master_node].Raise( "Could not remove SSH keys of node %s before readding," " (UUID: %s)." % (new_node_name, new_node_uuid)) - result_msgs = remove_result[master_node].payload - if result_msgs: - feedback_fn("Removing old SSH key of node '%s' failed on some nodes." - % new_node_name) - for node, error_msg in result_msgs.items(): - feedback_fn("%s: %s" % (node, error_msg)) + EvaluateSshUpdateRPC(remove_result, master_node, feedback_fn) result = rpcrunner.call_node_ssh_key_add( [master_node], new_node_uuid, new_node_name, @@ -891,12 +886,7 @@ class LUNodeSetParams(LogicalUnit): ssh_result[master_node].Raise( "Could not adjust the SSH setup after demoting node '%s'" " (UUID: %s)." % (node.name, node.uuid)) - if not ssh_result[master_node].fail_msg: - if ssh_result[master_node].payload: - feedback_fn("When demoting node '%s', updating SSH keys on" - " some nodes failed." % (node.name)) - for node, message in ssh_result[master_node].payload: - feedback_fn("%s: %s" % (node, message)) + EvaluateSshUpdateRPC(ssh_result, master_node, feedback_fn) if self.new_role == self._ROLE_CANDIDATE: ssh_result = self.rpc.call_node_ssh_key_add( @@ -1605,12 +1595,7 @@ class LUNodeRemove(LogicalUnit): result[master_node].Raise( "Could not remove the SSH key of node '%s' (UUID: %s)." % (self.op.node_name, self.node.uuid)) - result_msgs = result[master_node].payload - if result_msgs: - feedback_fn("Removing old SSH key of node '%s' failed on some nodes." - % self.op.node_name) - for node, error_msg in result_msgs.items(): - feedback_fn("%s: %s" % (node, error_msg)) + EvaluateSshUpdateRPC(result, master_node, feedback_fn) # Promote nodes to master candidate as needed AdjustCandidatePool(self, [self.node.uuid]) diff --git a/test/py/ganeti.backend_unittest.py b/test/py/ ganeti.backend_unittest.py index 3e54c38..181f495 100755 --- a/test/py/ganeti.backend_unittest.py +++ b/test/py/ganeti.backend_unittest.py @@ -1634,7 +1634,8 @@ class TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase): else: self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey( node, node_key)) - self.assertTrue(error_msgs[other_node_name]) + self.assertTrue([error_msg for (node, error_msg) in error_msgs + if node == other_node_name]) class TestVerifySshSetup(testutils.GanetiTestCase): > > Hrvoje Ribicic > Ganeti Engineering > Google Germany GmbH > Dienerstr. 12, 80331, München > > Registergericht und -nummer: Hamburg, HRB 86891 > Sitz der Gesellschaft: Hamburg > Geschäftsführer: Graham Law, Christine Elizabeth Flores > Steuernummer: 48/725/00206 > Umsatzsteueridentifikationsnummer: DE813741370 >
