Am 18. Februar 2011 15:14 schrieb René Nussbaumer <[email protected]>:
> This is a convinience command to do an automated EPO in the possible limits of
s/convinience/convenience/
> Ganeti.
> +def _InstanceStart(opts, inst_list, start):
> […]
> + bad_cnt = len([row for row in results if not row[0]])
What is row[0]? (hint: explicit unpacking)
> +class _RunWhenNodesReachableHelper:
> + """Helper class to make shared internal state sharing easier.
> +
> + @ivar success: Indicates if all action_cb calls were successful
> + """
Wrong indentation for docstring, missing empty line.
> + def Wait(self, secs):
> + start = time.time()
> + for node in self.down:
> + if self._ping_fn(self.node2ip[node], self.port,
> + timeout=self._EPO_PING_TIMEOUT,
> live_port_needed=True):
You didn't test or pylint this code, did you? Hint: there's no self._EPO_*.
> +def _RunWhenNodesReachable(node_list, action_cb, interval):
> + client = GetClient()
> + cluster_info = client.QueryClusterInfo()
> + if cluster_info["primary_ip_version"] == constants.IP4_VERSION:
> + family = netutils.IPAddress.family
> + else:
> + family = netutils.IP6Address.family
> +
> + node2ip = dict((node, netutils.GetHostname(node, family=family).ip)
> + for node in node_list)
> + for node in node_list:
> + host = netutils.GetHostname(node, family=family)
> + node2ip[node] = host.ip
\o/ No, you don't need that loop, the code before it does everything.
> + helper = _RunWhenNodesReachableHelper(node_list, action_cb, node2ip)
> +
> + try:
> + return utils.Retry(helper, interval, _EPO_REACHABLE_TIMEOUT,
> + wait_fn=helper.Wait)
> + except utils.RetryTimeout:
> + ToStderr("Time exceeded while waiting for nodes to become reachable"
> + " again:\n - %s", " - ".join(helper.down))
> + return False
Very nice!
> +def _EpoOn(opts, full_node_list, node_list, inst_map):
> + """Does the actual power on.
> +
> + @param opts: The command line options selected by the user
> + @param full_node_list: All nodes to operate on (includes nodes not
> supporting
> + OOB)
> + @param node_list: The list of nodes to operate on (all need to support OOB)
> + @param inst_map: A dict of inst -> nodes mapping
> + @return: The desired exit status
> +
> + """
> + if node_list and not _OobPower(opts, node_list, False):
> + ToStderr("Not all nodes seems to get back up, investigate and start"
s/seems/seem/
> + " manually if needed")
> +
> + # Wait for the nodes to be back up
> + action_cb = compat.partial(_MaybeInstanceStartup, opts, dict(inst_map))
> +
> + ToStdout("Waiting until all nodes are available again")
> + if not _RunWhenNodesReachable(full_node_list, action_cb,
> _EPO_PING_INTERVAL):
> + ToStderr("Please investigate and start not running instances manually")
s/not running/stopped/
> + return constants.EXIT_FAILURE
> +
> + return constants.EXIT_FAILURE
> +
> +
> +def _EpoOff(opts, node_list, inst_map):
> + if not _InstanceStart(opts, inst_map.keys(), False):
> + ToStderr("Please investigate and stop instances manually before
> continue")
s/continue/continuing/
> + return constants.EXIT_FAILURE
> +
> + if node_list and _OobPower(opts, node_list, False):
> + return constants.EXIT_SUCCESS
> + else:
> + return constants.EXIT_FAILURE
> +def Epo(opts, args):
> […]
> + for (idx, (node, master, pinsts, sinsts, powered,
> + offline)) in enumerate(result):
> + # Normalize the node_query_list as well
> + node_query_list[idx] = node
> + if not offline:
> + for inst in (pinsts + sinsts):
> + if inst not in inst_map:
Please swap the two branches, so that you don't need “not” here.
> + if master:
> + inst_map[inst] = set()
> + else:
> + inst_map[inst] = set([node])
> + else:
> + if not master:
elif
> + inst_map[inst].add(node)
> +
> + if master and opts.on:
> + # We ignore the master for turning on the machines, in fact we are
> + # already operating on the master at this point :)
> + continue
> + elif master and not opts.show_all:
> + ToStderr("%s is the master node, please do a master-failover to
> another"
> + " node not affected by the EPO or use --all if you intend to"
> + " shutdown the whole cluster", node)
> + return constants.EXIT_FAILURE
> + elif powered is None:
> + ToStdout("Node %s does not support out-of-band handling, it can not be"
> + " handled in a fully automated manner", node)
> + elif powered == opts.on:
> + ToStdout("Node %s is already in desired power state, skipping", node)
> + else:
> + if not offline or (offline and powered):
elif
> + node_list.append(node)
> + "epo": (
> + Epo, [ArgUnknown()],
> + [FORCE_OPT, ON_OPT, GROUPS_OPT, ALL_OPT, OOB_TIMEOUT_OPT],
> + "[opts...] [args]",
> + "Performs an EPO on given args"),
s/EPO/emergency power-off/ (maybe also in other places, don't assume
everyone knows abbreviations)
> --- a/man/gnt-cluster.rst
> +++ b/man/gnt-cluster.rst
> @@ -93,6 +93,23 @@ Remove all configuration files related to the cluster, so
> that a
> Since this is a dangerous command, you are required to pass the
> argument *--yes-do-it.*
>
> +EPO
> +~~~
> +
> +**epo** [--on] [--groups|--all] *arguments*
> +
> +Performans an EPO on nodes given as arguments. If ``--groups`` is given,
Here again.
> +arguments are node groups. If ``--all`` is provided, the whole cluster will
> be
> +shut down.
> +
> +The ``--on`` flag says, that the cluster should be recovered from a EPO.
“… flag restarts the cluster after an emergency power-off”
> +Please note that the master node will not be turned down or up automatically.
> +It will just be left in a state, where you can manully perform the shutdown
> of
s/manully/manually/
> +that one node. If the master is in the list of affected nodes and this is not
“is not _a_”
> +complete cluster EPO (e.g. using ``--all``), you're required to do a master
emergency power-off
> +failover to another node not affected.
Michael