On Wed, 22 Apr 2015 at 16:33 Hrvoje Ribicic <[email protected]> wrote:
> On Fri, Apr 17, 2015 at 4:37 PM, 'Helga Velroyen' via ganeti-devel <
> [email protected]> wrote:
>
>> So far, if a SSH key of a node is added to the cluster
>> and updating the SSH setup of the node itself fails,
>> the entire operation fails quite disgraceful. This
>> patch introduces a retry mechanism and a proper cleanup
>> in case of a failure.
>>
>> Note that there are more places where retries would
>> be a good idea. They'll be added in other patches.
>>
>> Signed-off-by: Helga Velroyen <[email protected]>
>> ---
>> lib/backend.py | 28 +++++++++++++---
>> test/py/ganeti.backend_unittest.py | 67
>> ++++++++++++++++++++++++++++++++++++++
>> test/py/testutils_ssh.py | 30 ++++++++++++++++-
>> 3 files changed, 120 insertions(+), 5 deletions(-)
>>
>> diff --git a/lib/backend.py b/lib/backend.py
>> index bc1a570..818f3fb 100644
>> --- a/lib/backend.py
>> +++ b/lib/backend.py
>> @@ -1520,6 +1520,9 @@ def AddNodeSshKey(node_uuid, node_name,
>> _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
>> cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
>>
>> + # number of retries for SSH connections
>> + max_retries = 3
>
> +
>> # Update the target node itself
>> if get_public_keys:
>> node_data = {}
>> @@ -1527,10 +1530,27 @@ def AddNodeSshKey(node_uuid, node_name,
>> all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file)
>> node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
>> (constants.SSHS_OVERRIDE, all_keys)
>> - run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
>> - ssh_port_map.get(node_name), node_data,
>> - debug=False, verbose=False, use_cluster_key=False,
>> - ask_key=False, strict_host_check=False)
>> +
>> + last_exception = None
>> + for _ in range(max_retries):
>> + try:
>> + run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
>> + ssh_port_map.get(node_name), node_data,
>> + debug=False, verbose=False, use_cluster_key=False,
>> + ask_key=False, strict_host_check=False)
>> + break
>> + except errors.OpExecError as e:
>> + logging.error("Updating SSH key files of node '%s' failed."
>> + " Error: %s", node_name, e)
>> + last_exception = e
>> + else:
>> + if last_exception:
>> + # Clean up the master's public key file if adding key fails
>> + if to_public_keys:
>> + ssh.RemovePublicKey(node_uuid)
>> + raise errors.SshUpdateError(
>> + "Could not update the SSH setup of node '%s' itself. Error:
>> %s."
>> + % (node_name, last_exception))
>>
>> # Update all nodes except master and the target node
>> if to_authorized_keys:
>> diff --git a/test/py/ganeti.backend_unittest.py b/test/py/
>> ganeti.backend_unittest.py
>> index 5305c15..cc65501 100755
>> --- a/test/py/ganeti.backend_unittest.py
>> +++ b/test/py/ganeti.backend_unittest.py
>> @@ -1400,6 +1400,73 @@ class
>> TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
>> self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
>> node, node_key))
>>
>> + def testAddKeySuccessfullyWithRetries(self):
>> + new_node_name = "new_node_name"
>> + new_node_uuid = "new_node_uuid"
>> + new_node_key = "new_node_key"
>> + is_master_candidate = True
>> + is_potential_master_candidate = True
>> + is_master = False
>> +
>> + self._AddNewNodeToTestData(
>> + new_node_name, new_node_uuid, new_node_key,
>> + is_potential_master_candidate, is_master_candidate,
>> + is_master)
>> + self._ssh_file_manager.SetMaxRetries(new_node_name, 3)
>> +
>> + backend.AddNodeSshKey(new_node_uuid, new_node_name,
>> + self._potential_master_candidates,
>> + self._ssh_port_map,
>> + to_authorized_keys=is_master_candidate,
>> + to_public_keys=is_potential_master_candidate,
>> + get_public_keys=is_potential_master_candidate,
>> + pub_key_file=self._pub_key_file,
>> + ssconf_store=self._ssconf_mock,
>> + noded_cert_file=self.noded_cert_file,
>> + run_cmd_fn=self._run_cmd_mock)
>> +
>> + self._ssh_file_manager.PotentialMasterCandidatesOnlyHavePublicKey(
>> + new_node_name)
>>
>
> This looks like it needs an assertTrue.
>
Actually, PotentialMasterCandidatesOnlyHavePublicKey is more an assertion
function than a boolean one, because it throws an exception instead of
returning false. I fixed that inconsistency for all occurrences in a later
patch.
>
>
>> + self.assertTrue(self._ssh_file_manager.AllNodesHaveAuthorizedKey(
>> + new_node_key))
>> +
>> + def testAddKeyFailedWithRetries(self):
>> + new_node_name = "new_node_name"
>> + new_node_uuid = "new_node_uuid"
>> + new_node_key = "new_node_key"
>> + is_master_candidate = True
>> + is_potential_master_candidate = True
>> + is_master = False
>> +
>> + self._AddNewNodeToTestData(
>> + new_node_name, new_node_uuid, new_node_key,
>> + is_potential_master_candidate, is_master_candidate,
>> + is_master)
>> + self._ssh_file_manager.SetMaxRetries(new_node_name, 4)
>> +
>> + self.assertRaises(
>> + errors.SshUpdateError, backend.AddNodeSshKey, new_node_uuid,
>> + new_node_name, self._potential_master_candidates,
>> self._ssh_port_map,
>> + to_authorized_keys=is_master_candidate,
>> + to_public_keys=is_potential_master_candidate,
>> + get_public_keys=is_potential_master_candidate,
>> + pub_key_file=self._pub_key_file,
>> + ssconf_store=self._ssconf_mock,
>> + noded_cert_file=self.noded_cert_file,
>> + run_cmd_fn=self._run_cmd_mock)
>> +
>> + for node in self._all_nodes:
>> + if node == new_node_name:
>> + self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
>> + node, new_node_key))
>> + else:
>> + self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey(
>> + node, new_node_key))
>> +
>> + self.assertTrue(self._ssh_file_manager.NoNodeHasPublicKey(
>> + new_node_uuid, new_node_key))
>> +
>> +
>> class TestVerifySshSetup(testutils.GanetiTestCase):
>>
>> _NODE1_UUID = "uuid1"
>> diff --git a/test/py/testutils_ssh.py b/test/py/testutils_ssh.py
>> index ff7db63..a180144 100644
>> --- a/test/py/testutils_ssh.py
>> +++ b/test/py/testutils_ssh.py
>> @@ -32,6 +32,7 @@
>>
>> from ganeti import constants
>> from ganeti import pathutils
>> +from ganeti import errors
>>
>>
>> class FakeSshFileManager(object):
>> @@ -66,6 +67,13 @@ class FakeSshFileManager(object):
>> self._public_keys = {} # dict of dicts
>> # Node name of the master node
>> self._master_node_name = None
>> + # Dictionary mapping nodes by name to number of retries where
>> 'RunCommand'
>> + # succeeds. For example if set to '3', RunCommand will fail two
>> times when
>> + # called for this node before it succeeds in the 3rd retry.
>> + self._max_retries = {}
>> + # Dictionary mapping nodes by name to number of retries which
>> + # 'RunCommand' has already carried out.
>> + self._retries = {}
>>
>> def _SetMasterNodeName(self):
>> self._master_node_name = [name for name, (_, _, _, _, master)
>> @@ -124,6 +132,17 @@ class FakeSshFileManager(object):
>> self._FillAuthorizedKeyOfOneNode(node)
>> self._SetMasterNodeName()
>>
>> + def SetMaxRetries(self, node_name, retries):
>> + """Set the number of unsuccessful retries of 'RunCommand' per node.
>> +
>> + @type node_name: string
>> + @param node_name: name of the node
>> + @type retries: integer
>> + @param retries: number of unsuccessful retries
>> +
>> + """
>> + self._max_retries[node_name] = retries
>> +
>> def GetSshPortMap(self, port):
>> """Creates a SSH port map with all nodes mapped to the given port.
>>
>> @@ -281,7 +300,8 @@ class FakeSshFileManager(object):
>> for name in self._all_node_data.keys():
>> node_pub_keys = self._public_keys[name]
>> if (uuid, key) in node_pub_keys.items():
>> - return False
>> + raise Exception("Node '%s' does have public key '%s' of node
>> '%s'"
>> + % (name, key, uuid))
>> return True
>>
>> def PotentialMasterCandidatesOnlyHavePublicKey(self, query_node_name):
>> @@ -326,6 +346,14 @@ class FakeSshFileManager(object):
>> of SSH keys. No actual key files of any node is touched.
>>
>> """
>> + if node in self._max_retries:
>> + if node not in self._retries:
>> + self._retries[node] = 0
>> + self._retries[node] += 1
>> + if self._retries[node] < self._max_retries[node]:
>> + raise errors.OpExecError("(Fake) SSH connection to node '%s'
>> failed."
>> + % node)
>> +
>> assert base_cmd == pathutils.SSH_UPDATE
>>
>> if constants.SSHS_SSH_AUTHORIZED_KEYS in data:
>> --
>> 2.2.0.rc0.207.ga3a616c
>>
>>
> Hrvoje Ribicic
> Ganeti Engineering
> Google Germany GmbH
> Dienerstr. 12, 80331, München
>
> Registergericht und -nummer: Hamburg, HRB 86891
> Sitz der Gesellschaft: Hamburg
> Geschäftsführer: Graham Law, Christine Elizabeth Flores
> Steuernummer: 48/725/00206
> Umsatzsteueridentifikationsnummer: DE813741370
>