On Wed, Oct 02, 2013 at 02:33:07PM +0200, Klaus Aehlig wrote:
> This command will coordinate the switching to a new
> Ganeti version across the cluster. This has become
> possible by the new layout that allows several Ganeti
> version to be present at the same time.
s/version/versions/
> 
> Signed-off-by: Klaus Aehlig <[email protected]>
> ---
>  lib/client/gnt_cluster.py | 279 
> ++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 279 insertions(+)
> 
> diff --git a/lib/client/gnt_cluster.py b/lib/client/gnt_cluster.py
> index 8d81519..f31d8d7 100644
> --- a/lib/client/gnt_cluster.py
> +++ b/lib/client/gnt_cluster.py
> @@ -43,7 +43,9 @@ from ganeti import objects
>  from ganeti import uidpool
>  from ganeti import compat
>  from ganeti import netutils
> +from ganeti import ssconf
>  from ganeti import pathutils
> +from ganeti import qlang
>  
>  
>  ON_OPT = cli_option("--on", default=False,
> @@ -64,6 +66,12 @@ FORCE_DISTRIBUTION = cli_option("--yes-do-it", 
> dest="yes_do_it",
>                                  " is drained",
>                                  default=False, action="store_true")
>  
> +TO_OPT = cli_option("--to", default=None, type="string",
> +                    help="The Ganeti version to upgrade to")
> +
> +RESUME_OPT = cli_option("--resume", default=False, action="store_true",
> +                        help="Resume any pending Ganeti upgrades")
> +
>  _EPO_PING_INTERVAL = 30 # 30 seconds between pings
>  _EPO_PING_TIMEOUT = 1 # 1 second
>  _EPO_REACHABLE_TIMEOUT = 15 * 60 # 15 minutes
> @@ -1617,6 +1625,274 @@ def ShowCreateCommand(opts, args):
>    ToStdout(_GetCreateCommand(result))
>  
>  
> +def _RunCommandAndReport(cmd):
> +  """Run a command and reports its output, iff it failed.

s/reports/report/

> +
> +  @param cmd: the command to execute
> +  @type cmd: list
> +  @rtype: bool
> +  @return: False, if the execution failed.
> +
> +  """
> +  result = utils.RunCmd(cmd)
> +  if result.failed:
> +    ToStderr("Command %s failed: %s; Output %s" %
> +             (cmd, result.fail_reason, result.output))
> +    return False
> +  return True
> +
> +
> +def _VerifyCommand(cmd):
> +  """Verifes that a given command succeeds on all online nodes.

s/Verifes/Verify/

> +
> +  As this function is intended to run during upgrades, it
> +  is implemented in such a way that it still works, if all Ganeti
> +  daemons are down.
> +
> +  @param cmd: the command to execute
> +  @type cmd: list
> +  @rtype: list
> +  @return: the list of node names that are online where
> +      the command failed.

Other docstrings seem to have a newline here.

> +  """
> +  command = utils.text.ShellQuoteArgs([str(val) for val in cmd])
> +
> +  nodes = ssconf.SimpleStore().GetOnlineNodeList()
> +  master_node = ssconf.SimpleStore().GetMasterNode()
> +  cluster_name = ssconf.SimpleStore().GetClusterName()
> +
> +  # Make sure master node is at list end
> +  if master_node in nodes:
> +    nodes.remove(master_node)
> +    nodes.append(master_node)

Why not move out of the if

  nodes.append(master_node)

for example,

  if master_node in nodes:
    nodes.remove(master_node)
  nodes.append(master_node)

unless the list does not necessarily have to contain master node, in
which case I update the comment to say

# If master node is in 'nodes', make sure master node is at list end

Whatever you decide to do here, don't forget to update the other
occurrence of this code, if it falls within the same situation.

> +
> +  failed = []
> +
> +  srun = ssh.SshRunner(cluster_name=cluster_name)
> +  for name in nodes:
> +    result = srun.Run(name, constants.SSH_LOGIN_USER, command)
> +    if result.exit_code != 0:
> +      failed.append(name)
> +
> +  return failed
> +
> +
> +def _VerifyVersionInstalled(versionstring):
> +  """Verify that the given version of ganeti is isntalled on all online 
> nodes.

s/isntalled/installed/
> +
> +  Do nothing, if this is the case, otherwise print an appropriate
> +  message to stderr.
> +
> +  @param versionstring: the version to check for
> +  @type versionstring: string
> +  @rtype: bool
> +  @return: True, if the version is installed on all online nodes.

s/.//

> +
> +  """
> +  badnodes = _VerifyCommand(["test", "-d",
> +                             os.path.join(pathutils.PKGLIBDIR, 
> versionstring)])
> +  if badnodes:
> +    ToStderr("Ganeti version %s not installed on nodes %s"
> +             % (versionstring, ", ".join(badnodes)))
> +    return False
> +
> +  return True
> +
> +
> +def _getRunning():

Why not

s/_getRunning/_GetRunning/

to be consistent, for example, with '_GetEnabledDiskTemplates' and '_GetVgName'

> +  """Determine the list of running jobs.
> +
> +  @rtype: list
> +  @return: empty list, if no jobs are running

Saying just 'empty list' sounds confusing. Perhaps a more careful
explanation ?

> +
> +  """
> +  cl = GetClient()
> +  qfilter = qlang.MakeSimpleFilter("status",
> +                                   frozenset([constants.JOB_STATUS_RUNNING]))
> +  return cl.Query(constants.QR_JOB, [], qfilter).data
> +
> +
> +def _setGanetiVersion(versionstring):

Same as above.

> +  """Set the active version of ganeti to the given versionstring
> +
> +  @type versionstring: string
> +  @rtype: list
> +  @return: the list of nodes where the version change failed.

s/.//

> +
> +  """
> +  failed = []
> +  failed.extend(_VerifyCommand(
> +      ["rm", "-f", os.path.join(pathutils.SYSCONFDIR, "ganeti/lib")]))
> +  failed.extend(_VerifyCommand(
> +      ["ln", "-s", "-f", os.path.join(pathutils.PKGLIBDIR, versionstring),
> +       os.path.join(pathutils.SYSCONFDIR, "ganeti/lib")]))
> +  failed.extend(_VerifyCommand(
> +      ["rm", "-f", os.path.join(pathutils.SYSCONFDIR, "ganeti/share")]))
> +  failed.extend(_VerifyCommand(
> +      ["ln", "-s", "-f", os.path.join(pathutils.SHAREDIR, versionstring),
> +       os.path.join(pathutils.SYSCONFDIR, "ganeti/share")]))
> +  return list(set(failed))
> +
> +
> +# pylint: disable=R0911
> +def UpgradeGanetiCommand(opts, args):
> +  """Upgrade a cluster to a new ganeti version.
> +
> +  @param opts: the command line options selected by the user
> +  @type args: list
> +  @param args: should be an empty list
> +  @rtype: int
> +  @return: the desired exit code
> +
> +  """
> +  if ((not opts.resume and opts.to is None)
> +      or (opts.resume and opts.to is not None)):
> +    ToStderr("Precisely one of the otpions --to and --resume"

s/otpions/options/

> +             " has to be given")
> +    return 1
> +
> +  if opts.resume:
> +    # TODO: implement
> +    ToStderr("The --resume mode is not yet implemented")
> +    return 1
> +
> +  versionstring = opts.to
> +  version = utils.version.ParseVersion(versionstring)
> +  if version is None:
> +    ToStderr("Could not parse version string %s" % versionstring)
> +    return 1

I guess now it would be nice to tell the user the correct format of
the 'versionstring'.  Perhaps we can reuse the regex ?

> +
> +  msg = utils.version.UpgradeRange(version)
> +  if msg is not None:
> +    ToStderr("Cannot upgrade to %s: %s" % (versionstring, msg))
> +    return 1
> +
> +  downgrade = utils.version.ShouldCfgdowngrade(version)
> +
> +  if not _VerifyVersionInstalled(versionstring):
> +    return 1
> +
> +  # TODO: write intent-to-upgrade file
> +
> +  ToStdout("Draining queue")
> +  client = GetClient()
> +  client.SetQueueDrainFlag(True)
> +
> +  if utils.SimpleRetry([], _getRunning,
> +                       constants.UPGRADE_QUEUE_POLL_INTERVALL,
> +                       constants.UPGRADE_QUEUE_DRAIN_TAIMEOUT):
> +    ToStderr("Failed to completely empty the queue.")
> +    client = GetClient()
> +    client.SetQueueDrainFlag(False)
> +    return 1
> +
> +  ToStdout("Stopping daemons on master node.")
> +  if not _RunCommandAndReport([pathutils.DAEMON_UTIL, "stop-all"]):
> +    client = GetClient()
> +    client.SetQueueDrainFlag(False)
> +    return 1
> +
> +  if not _VerifyVersionInstalled(versionstring):
> +    utils.RunCmd([pathutils.DAEMON_UTIL, "start-all"])
> +    client = GetClient()
> +    client.SetQueueDrainFlag(False)
> +    return 1
> +
> +  ToStdout("Stopping daemons everywhere.")
> +  badnodes = _VerifyCommand([pathutils.DAEMON_UTIL, "stop-all"])
> +  if badnodes:
> +    ToStderr("Failed to stop daemons on %s." % (", ".join(badnodes),))
> +    _VerifyCommand([pathutils.DAEMON_UTIL, "start-all"])
> +    client = GetClient()
> +    client.SetQueueDrainFlag(False)

There is a lot of duplication.  For example, this sequence appears
quite often and could be factored out.

> +    return 1
> +
> +  backuptar = os.path.join(pathutils.LOCALSTATEDIR,
> +                           "lib/ganeti%d.tar" % time.time())
> +  ToStdout("Backing up configuration as %s" % backuptar)
> +  if not _RunCommandAndReport(["tar", "cf", backuptar,
> +                               pathutils.DATA_DIR]):
> +    _VerifyCommand([pathutils.DAEMON_UTIL, "start-all"])
> +    client = GetClient()
> +    client.SetQueueDrainFlag(False)
> +    return 1
> +
> +  if downgrade:
> +    ToStdout("Downgrading configuration")
> +    if not _RunCommandAndReport([pathutils.CFGUPGRADE, "--downgrade", "-f"]):
> +      _VerifyCommand([pathutils.DAEMON_UTIL, "start-all"])
> +      client = GetClient()
> +      client.SetQueueDrainFlag(False)
> +      return 1
> +
> +  # Configuration change is the point of no return. From then onwards, it is
> +  # safer to push trough the up/dowgrade than to try to roll it back.

s/trough/through/

> +
> +  returnvalue = 0
> +
> +  ToStdout("Switching to version %s everywhere" % versionstring)

I am not sure if the user can understand the concept of 'everywhere'.
On all nodes perhaps ?

> +  badnodes = _setGanetiVersion(versionstring)
> +  if badnodes:
> +    ToStderr("Failed to switch to Ganeti version %s on nodes %s"
> +             % (versionstring, ", ".join(badnodes)))
> +    _setGanetiVersion(constants.DIR_VERSION)
> +    _VerifyCommand([pathutils.DAEMON_UTIL, "start-all"])
> +    client = GetClient()
> +    client.SetQueueDrainFlag(False)
> +    return 1
> +
> +  # Now that we have changed to the new version of Ganeti we should
> +  # not communicate over luxi any more, as luxi might have changed in
> +  # incompatible ways. Therefore, manually call the corresponding ganeti
> +  # commands using their canonical (version independent) path.
> +
> +  if not downgrade:
> +    ToStdout("Upgrading configuration")
> +    if not _RunCommandAndReport([pathutils.CFGUPGRADE, "-f"]):
> +      _setGanetiVersion(constants.DIR_VERSION)
> +      _VerifyCommand([pathutils.DAEMON_UTIL, "start-all"])
> +      client = GetClient()
> +      client.SetQueueDrainFlag(False)
> +      return 1
> +
> +  ToStdout("Starting daemons everywhere.")
> +  badnodes = _VerifyCommand([pathutils.DAEMON_UTIL, "start-all"])
> +  if badnodes:
> +    ToStderr("Warning: failed to start daemons on %s." % (", 
> ".join(badnodes),))
> +    returnvalue = 1
> +
> +  ToStdout("Ensuring directories everywhere.")
> +  badnodes = _VerifyCommand([pathutils.ENSURE_DIRS])
> +  if badnodes:
> +    ToStderr("Warning: failed to ensure directories on %s." %
> +             (", ".join(badnodes)))
> +    returnvalue = 1
> +
> +  ToStdout("Redistributing the configuration.")
> +  if not _RunCommandAndReport(["gnt-cluster", "redist-conf", "--yes-do-it"]):
> +    returnvalue = 1
> +
> +  ToStdout("Restarting daemons everywhere.")
> +  badnodes = _VerifyCommand([pathutils.DAEMON_UTIL, "stop-all"])
> +  badnodes.extend(_VerifyCommand([pathutils.DAEMON_UTIL, "start-all"]))
> +  if badnodes:
> +    ToStderr("Warning: failed to start daemons on %s." %
> +             (", ".join(list(set(badnodes))),))
> +    returnvalue = 1
> +
> +  ToStdout("Undraining the queue.")
> +  if not _RunCommandAndReport(["gnt-cluster", "queue", "undrain"]):
> +    returnvalue = 1
> +
> +  # TODO: write intent-to-upgrade file
> +
> +  ToStdout("Verifying cluster.")
> +  if not _RunCommandAndReport(["gnt-cluster", "verify"]):
> +    returnvalue = 1
> +
> +  return returnvalue
> +
> +
>  commands = {
>    "init": (
>      InitCluster, [ArgHost(min=1, max=1)],
> @@ -1735,6 +2011,9 @@ commands = {
>    "show-ispecs-cmd": (
>      ShowCreateCommand, ARGS_NONE, [], "",
>      "Show the command line to re-create the cluster"),
> +  "upgrade": (
> +    UpgradeGanetiCommand, ARGS_NONE, [TO_OPT, RESUME_OPT], "",
> +    "Upgrade (or downgrade) to a new Ganeti version"),
>    }
>  
>  
> -- 
> 1.8.4
> 

Thanks,
Jose

-- 
Jose Antonio Lopes
Ganeti Engineering
Google Germany GmbH
Dienerstr. 12, 80331, München

Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg
Geschäftsführer: Graham Law, Christine Elizabeth Flores
Steuernummer: 48/725/00206
Umsatzsteueridentifikationsnummer: DE813741370

Reply via email to