On Fri, Apr 17, 2015 at 4:37 PM, 'Helga Velroyen' via ganeti-devel <[email protected]> wrote:
> This patch introduces retries for updating non-master nodes > when a new SSH key is added to the cluster. If an SSH update > fails even after the maximum number of retries is exhausted, > this does not raise an exception, but an error message is > added to the result of the method, which can be displayed > in the LU operations so that the admin sees which nodes > were the problem and need further attention. > > Signed-off-by: Helga Velroyen <[email protected]> > --- > lib/backend.py | 73 +++++++++++++++++++-------- > lib/cmdlib/cluster.py | 9 +++- > lib/cmdlib/node.py | 14 ++++- > test/py/ganeti.backend_unittest.py | 101 > ++++++++++++++++++++++++++++++++++++- > 4 files changed, 169 insertions(+), 28 deletions(-) > > diff --git a/lib/backend.py b/lib/backend.py > index 818f3fb..3b17306 100644 > --- a/lib/backend.py > +++ b/lib/backend.py > @@ -1566,17 +1566,38 @@ def AddNodeSshKey(node_uuid, node_name, > master_node = ssconf_store.GetMasterNode() > online_nodes = ssconf_store.GetOnlineNodeList() > > + node_errors = {} for node in all_nodes: > if node == master_node: > + logging.debug("Skipping master node '%s'.", master_node) > continue > if node not in online_nodes: > logging.debug("Skipping offline node '%s'.", node) > continue > if node in potential_master_candidates: > - run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE, > - ssh_port_map.get(node), pot_mc_data, > - debug=False, verbose=False, use_cluster_key=False, > - ask_key=False, strict_host_check=False) > + logging.debug("Updating SSH key files of node '%s'.") > + for i in range(max_retries): > + try: > + run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE, > + ssh_port_map.get(node), pot_mc_data, > + debug=False, verbose=False, use_cluster_key=False, > + ask_key=False, strict_host_check=False) > + break > + except errors.OpExecError as e: > + logging.error("Updating SSH key files of node '%s' failed" > + " at try no %s. 
Error: %s.", node, i, e) > + last_exception = e > + else: > + if last_exception: > + error_msg = ("When adding the key of node '%s', updating SSH > key" > + " files of node '%s' failed after %s retries." > + " Not trying again. Last errors was: %s." % > Nitpick: s/errors/error/ > + (node, node_name, max_retries, last_exception)) + node_errors[node] = error_msg > + # We only log the error and don't throw an exception, because > + # one unreachable node shall not abort the entire procedure. > + logging.error(error_msg) > + > else: > if to_authorized_keys: > run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE, > @@ -1584,6 +1605,8 @@ def AddNodeSshKey(node_uuid, node_name, > debug=False, verbose=False, use_cluster_key=False, > ask_key=False, strict_host_check=False) > > + return node_errors > + > > def RemoveNodeSshKey(node_uuid, node_name, > master_candidate_uuids, > @@ -1925,6 +1948,10 @@ def RenewSshKeys(node_uuids, node_names, > ssh_port_map, > > master_node_name = ssconf_store.GetMasterNode() > master_node_uuid = _GetMasterNodeUUID(node_uuid_name_map, > master_node_name) > + # List of all node errors that happened, but which did not abort the > + # procedure as a whole. It is important that this is a list to have a > + # somewhat chronological history of events. 
> + all_node_errors = [] > > # process non-master nodes > for node_uuid, node_name in node_uuid_name_map: > @@ -1989,15 +2016,16 @@ def RenewSshKeys(node_uuids, node_names, > ssh_port_map, > ssh.AddPublicKey(node_uuid, pub_key, key_file=pub_key_file) > > logging.debug("Add ssh key of node '%s'.", node_name) > - AddNodeSshKey(node_uuid, node_name, > - potential_master_candidates, > - ssh_port_map, > - to_authorized_keys=master_candidate, > - to_public_keys=potential_master_candidate, > - get_public_keys=True, > - pub_key_file=pub_key_file, ssconf_store=ssconf_store, > - noded_cert_file=noded_cert_file, > - run_cmd_fn=run_cmd_fn) > + node_errors = AddNodeSshKey( > + node_uuid, node_name, potential_master_candidates, > + ssh_port_map, to_authorized_keys=master_candidate, > + to_public_keys=potential_master_candidate, > + get_public_keys=True, > + pub_key_file=pub_key_file, ssconf_store=ssconf_store, > + noded_cert_file=noded_cert_file, > + run_cmd_fn=run_cmd_fn) > + if node_errors: > + all_node_errors = all_node_errors + node_errors.items() > > # Renewing the master node's key > > @@ -2022,15 +2050,14 @@ def RenewSshKeys(node_uuids, node_names, > ssh_port_map, > > # Add new master key to all node's public and authorized keys > logging.debug("Add new master key to all nodes.") > - AddNodeSshKey(master_node_uuid, master_node_name, > - potential_master_candidates, > - ssh_port_map, > - to_authorized_keys=True, > - to_public_keys=True, > - get_public_keys=False, > - pub_key_file=pub_key_file, ssconf_store=ssconf_store, > - noded_cert_file=noded_cert_file, > - run_cmd_fn=run_cmd_fn) > + node_errors = AddNodeSshKey( > + master_node_uuid, master_node_name, potential_master_candidates, > + ssh_port_map, to_authorized_keys=True, to_public_keys=True, > + get_public_keys=False, pub_key_file=pub_key_file, > + ssconf_store=ssconf_store, noded_cert_file=noded_cert_file, > + run_cmd_fn=run_cmd_fn) > + if node_errors: > + all_node_errors = all_node_errors + node_errors.items() > > # 
Remove the old key file and rename the new key to the non-temporary > filename > _ReplaceMasterKeyOnMaster(root_keyfiles) > @@ -2053,6 +2080,8 @@ def RenewSshKeys(node_uuids, node_names, > ssh_port_map, > clear_authorized_keys=False, > clear_public_keys=False) > > + return all_node_errors > + > > def GetBlockDevSizes(devices): > """Return the size of the given block devices > diff --git a/lib/cmdlib/cluster.py b/lib/cmdlib/cluster.py > index c504d60..14c18da 100644 > --- a/lib/cmdlib/cluster.py > +++ b/lib/cmdlib/cluster.py > @@ -210,7 +210,7 @@ class LUClusterRenewCrypto(NoHooksLU): > self.cfg.RemoveNodeFromCandidateCerts("%s-SERVER" % master_uuid) > self.cfg.RemoveNodeFromCandidateCerts("%s-OLDMASTER" % master_uuid) > > - def _RenewSshKeys(self): > + def _RenewSshKeys(self, feedback_fn): > """Renew all nodes' SSH keys. > > """ > @@ -230,6 +230,11 @@ class LUClusterRenewCrypto(NoHooksLU): > master_candidate_uuids, > potential_master_candidates) > result[master_uuid].Raise("Could not renew the SSH keys of all nodes") > Just out of curiosity: under which conditions would this actually be raised? If my observation has merit, maybe you can change the message to reflect the unexpectedness of the failure. Also, a comment and/or an empty line would be useful here. Otherwise it looks like a bit of a bug. The same is true for the next two entries. 
> + node_errors = result[master_uuid].payload > + if node_errors: > + feedback_fn("Some nodes' SSH key files could not be updated:") > + for node_name, error_msg in node_errors: > + feedback_fn("%s: %s" % (node_name, error_msg)) > > def Exec(self, feedback_fn): > if self.op.node_certificates: > @@ -237,7 +242,7 @@ class LUClusterRenewCrypto(NoHooksLU): > self._RenewNodeSslCertificates(feedback_fn) > if self.op.ssh_keys and not self._ssh_renewal_suppressed: > feedback_fn("Renewing SSH keys") > - self._RenewSshKeys() > + self._RenewSshKeys(feedback_fn) > elif self._ssh_renewal_suppressed: > feedback_fn("Cannot renew SSH keys if the cluster is configured to > not" > " modify the SSH setup.") > diff --git a/lib/cmdlib/node.py b/lib/cmdlib/node.py > index 19b630a..6a13d8a 100644 > --- a/lib/cmdlib/node.py > +++ b/lib/cmdlib/node.py > @@ -339,7 +339,7 @@ class LUNodeAdd(LogicalUnit): > result.Raise("Failed to initialize OpenVSwitch on new node") > > def _SshUpdate(self, new_node_uuid, new_node_name, is_master_candidate, > - is_potential_master_candidate, rpcrunner, readd): > + is_potential_master_candidate, rpcrunner, readd, > feedback_fn): > """Update the SSH setup of all nodes after adding a new node. > > @type readd: boolean > @@ -373,6 +373,11 @@ class LUNodeAdd(LogicalUnit): > is_master_candidate, is_potential_master_candidate, > is_potential_master_candidate) > result[master_node].Raise("Could not update the node's SSH setup.") > + node_errors = result[master_node].payload > + if node_errors: > + feedback_fn("Some nodes' SSH key files could not be updated:") > + for node_name, error_msg in node_errors.items(): > + feedback_fn("%s: %s." % (node_name, error_msg)) > Between SSL and the multiple placements, this really is calling out to be a helper function. > > def Exec(self, feedback_fn): > """Adds the new node to the cluster. 
> @@ -481,7 +486,7 @@ class LUNodeAdd(LogicalUnit): > # FIXME: so far, all nodes are considered potential master > candidates > self._SshUpdate(self.new_node.uuid, self.new_node.name, > self.new_node.master_candidate, True, > - self.rpc, self.op.readd) > + self.rpc, self.op.readd, feedback_fn) > > > class LUNodeSetParams(LogicalUnit): > @@ -897,6 +902,11 @@ class LUNodeSetParams(LogicalUnit): > ssh_result[master_node].Raise( > "Could not update the SSH setup of node '%s' after promotion" > " (UUID: %s)." % (node.name, node.uuid)) + node_errors = ssh_result[master_node].payload > + if node_errors: > + feedback_fn("Some nodes' SSH key files could not be updated:") > + for node_name, error_msg in node_errors.items(): > + feedback_fn("%s: %s." % (node_name, error_msg)) > return result > > diff --git a/test/py/ganeti.backend_unittest.py b/test/py/ > ganeti.backend_unittest.py > index cc65501..e05580d 100755 > --- a/test/py/ganeti.backend_unittest.py > +++ b/test/py/ganeti.backend_unittest.py > @@ -1400,7 +1400,14 @@ class > TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase): > self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey( > node, node_key)) > > - def testAddKeySuccessfullyWithRetries(self): > + def testAddKeySuccessfullyOnNewNodeWithRetries(self): > + """Tests adding a new node's key when updating that node takes > retries. > + > + This test checks whether adding a new node's key successfully updates > + all node's SSH key files, even if updating the new node's key files > itself > Nit: s/node's/nodes'/ for the first one. The second one stays as-is. Possible better formulation: .. updates the SSH key files of all nodes, even if updating the new node's files ... > + takes a couple of retries to succeed. 
> + > + """ > new_node_name = "new_node_name" > new_node_uuid = "new_node_uuid" > new_node_key = "new_node_key" > @@ -1430,7 +1437,14 @@ class > TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase): > self.assertTrue(self._ssh_file_manager.AllNodesHaveAuthorizedKey( > new_node_key)) > > - def testAddKeyFailedWithRetries(self): > + def testAddKeyFailedOnNewNodeWithRetries(self): > + """Tests clean up if updating a new node's SSH setup fails. > + > + If adding a new node's keys fails, because the SSH key files of that > + new node fails, check whether already carried out operations are > Nit: Better formulation: If adding the keys of a new node fails because the SSH key files ... > + successfully rolled back and thus the state of the cluster is cleaned > up. > + > + """ > new_node_name = "new_node_name" > new_node_uuid = "new_node_uuid" > new_node_key = "new_node_key" > @@ -1466,6 +1480,89 @@ class > TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase): > self.assertTrue(self._ssh_file_manager.NoNodeHasPublicKey( > new_node_uuid, new_node_key)) > > + def testAddKeySuccessfullyOnOldNodeWithRetries(self): > + """Tests adding a new key even if updating nodes takes retries. > + > + This tests whether adding a new node's key successfully finishes, > + even if one of the other cluster nodes takes a couple of retries > + to succeed. > + > + """ > + new_node_name = "new_node_name" > + new_node_uuid = "new_node_uuid" > + new_node_key = "new_node_key" > + is_master_candidate = True > + is_potential_master_candidate = True > + is_master = False > + > + other_node_name, _, _, _, _, _ = self._ssh_file_manager \ > + .GetAllMasterCandidates()[0] > Afaik, this will be fixed in a follow-up patch, right? I second Petr's suggestion of named tuples. 
> + self._ssh_file_manager.SetMaxRetries(other_node_name, 3) > + assert other_node_name != new_node_name > + self._AddNewNodeToTestData( > + new_node_name, new_node_uuid, new_node_key, > + is_potential_master_candidate, is_master_candidate, > + is_master) > + > + backend.AddNodeSshKey(new_node_uuid, new_node_name, > + self._potential_master_candidates, > + self._ssh_port_map, > + to_authorized_keys=is_master_candidate, > + to_public_keys=is_potential_master_candidate, > + get_public_keys=is_potential_master_candidate, > + pub_key_file=self._pub_key_file, > + ssconf_store=self._ssconf_mock, > + noded_cert_file=self.noded_cert_file, > + run_cmd_fn=self._run_cmd_mock) > + > + self.assertTrue(self._ssh_file_manager.AllNodesHaveAuthorizedKey( > + new_node_key)) > + > + def testAddKeyFailedOnOldNodeWithRetries(self): > + """Tests adding keys when updating one node's SSH setup fails. > + > + This tests, whether when adding a new node's key and one node is > Nit: s/,// > + unreachable (but not marked as offline) the operation still finishes > + properly and only that unreachable node's SSH key setup did not get > + updated. 
> + > + """ > + new_node_name = "new_node_name" > + new_node_uuid = "new_node_uuid" > + new_node_key = "new_node_key" > + is_master_candidate = True > + is_potential_master_candidate = True > + is_master = False > + > + other_node_name, _, _, _, _, _ = self._ssh_file_manager \ > + .GetAllMasterCandidates()[0] > + self._ssh_file_manager.SetMaxRetries(other_node_name, 4) > + assert other_node_name != new_node_name > + self._AddNewNodeToTestData( > + new_node_name, new_node_uuid, new_node_key, > + is_potential_master_candidate, is_master_candidate, > + is_master) > + > + node_errors = backend.AddNodeSshKey( > + new_node_uuid, new_node_name, self._potential_master_candidates, > + self._ssh_port_map, to_authorized_keys=is_master_candidate, > + to_public_keys=is_potential_master_candidate, > + get_public_keys=is_potential_master_candidate, > + pub_key_file=self._pub_key_file, > + ssconf_store=self._ssconf_mock, > + noded_cert_file=self.noded_cert_file, > + run_cmd_fn=self._run_cmd_mock) > + > + for node in self._all_nodes: > + if node == other_node_name: > + self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey( > + node, new_node_key)) > + else: > + self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey( > + node, new_node_key)) > + > + self.assertTrue(node_errors[other_node_name]) > + > > class TestVerifySshSetup(testutils.GanetiTestCase): > > -- > 2.2.0.rc0.207.ga3a616c > > Hrvoje Ribicic Ganeti Engineering Google Germany GmbH Dienerstr. 12, 80331, München Registergericht und -nummer: Hamburg, HRB 86891 Sitz der Gesellschaft: Hamburg Geschäftsführer: Graham Law, Christine Elizabeth Flores Steuernummer: 48/725/00206 Umsatzsteueridentifikationsnummer: DE813741370
