On Tue, Feb 18, 2014 at 3:39 PM, Hrvoje Ribicic <[email protected]> wrote:
> This patch adds a QA utility function that acquires a set of locks, and > attempts to run a given function with the locks in place. Should the > given function block, this function does not detect this - later > patches will address the issue. > > An example of its use is provided by having the move-instance test > modified to use it. > > Signed-off-by: Hrvoje Ribicic <[email protected]> > --- > Makefile.am | 1 + > qa/qa_job_utils.py | 149 > +++++++++++++++++++++++++++++++++++++++++++++++++++++ > qa/qa_rapi.py | 18 +++++-- > 3 files changed, 163 insertions(+), 5 deletions(-) > create mode 100644 qa/qa_job_utils.py > > diff --git a/Makefile.am b/Makefile.am > index 8cc443e..4009eae 100644 > --- a/Makefile.am > +++ b/Makefile.am > @@ -1010,6 +1010,7 @@ qa_scripts = \ > qa/qa_instance.py \ > qa/qa_instance_utils.py \ > qa/qa_job.py \ > + qa/qa_job_utils.py \ > qa/qa_monitoring.py \ > qa/qa_node.py \ > qa/qa_os.py \ > diff --git a/qa/qa_job_utils.py b/qa/qa_job_utils.py > new file mode 100644 > index 0000000..bc4733c > --- /dev/null > +++ b/qa/qa_job_utils.py > @@ -0,0 +1,149 @@ > +# > +# > + > +# Copyright (C) 2014 Google Inc. > +# > +# This program is free software; you can redistribute it and/or modify > +# it under the terms of the GNU General Public License as published by > +# the Free Software Foundation; either version 2 of the License, or > +# (at your option) any later version. > +# > +# This program is distributed in the hope that it will be useful, but > +# WITHOUT ANY WARRANTY; without even the implied warranty of > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +# General Public License for more details. > +# > +# You should have received a copy of the GNU General Public License > +# along with this program; if not, write to the Free Software > +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > +# 02110-1301, USA. > + > + > +"""QA utility functions for testing jobs > + > +""" > + > +import re > + > +from ganeti import constants > +from ganeti import locking > +from ganeti import utils > + > +import qa_config > +import qa_error > + > +from qa_utils import AssertCommand, GetCommandOutput, GetObjectInfo > + > + > +AVAILABLE_LOCKS = [locking.LEVEL_NODE, ] > + > + > +def _GetOutputFromMaster(cmd): > + """ Gets the output of a command executed on master. > + > + """ > + if isinstance(cmd, basestring): > + cmdstr = cmd > + else: > + cmdstr = utils.ShellQuoteArgs(cmd) > + > + # Necessary due to the stderr stream not being captured properly on the > + # buildbot > + cmdstr += " 2>&1" > + > + return GetCommandOutput(qa_config.GetMasterNode().primary, cmdstr) > + > + > +def ExecuteJobProducingCommand(cmd): > + """ Executes a --submit-using command producing a job, and returns an > id. > This is somewhat confusing, I'd suggest to reword it to something like "a command using --submit". > + > + @type cmd: list of string > + @param cmd: The command to execute, broken into constituent components. > + > + """ > + job_id_output = _GetOutputFromMaster(cmd) > + > + possible_job_ids = re.findall("JobID: ([0-9]+)", job_id_output) > + if len(possible_job_ids) != 1: > + raise qa_error.Error("Cannot parse command output to find job id: > output " > + "is %s" % job_id_output) > + > + return int(possible_job_ids[0]) > + > + > +def _StartDelayFunction(locks, timeout): > + """ Starts the gnt-debug delay option with the given locks and timeout. > + > + """ > + # The interruptible switch must be used > + cmd = ["gnt-debug", "delay", "-i", "--submit", "--no-master"] > + > + for node in locks.get(locking.LEVEL_NODE, []): > + cmd.append("-n%s" % node) > + > + cmd.append(str(timeout)) > + > + job_id = ExecuteJobProducingCommand(cmd) > + job_info = GetObjectInfo(["gnt-job", "info", str(job_id)]) > + execution_logs = job_info[0]["Opcodes"][0]["Execution log"] > + > + is_termination_info_fn = \ > + lambda e: e["Content"][1] == constants.ELOG_DELAY_TEST > + filtered_logs = filter(is_termination_info_fn, execution_logs) > + > + if len(filtered_logs) != 1: > + raise qa_error.Error("Failure when trying to retrieve delay > termination " > + "information") > + > + _, _, (socket_path, ) = filtered_logs[0]["Content"] > + > + return socket_path > + > + > +def _TerminateDelayFunction(termination_socket): > + """ Terminates the delay function by communicating with the domain > socket. > + > + """ > + AssertCommand("echo a | socat -u stdin UNIX-CLIENT:%s" % > termination_socket) > For the QA it's probably completely fine to call an external command. You might also want to have a look at Transport in lib/rpc/transport.py, which is used in Luxi and WConfd to send a EOM-terminated messages (0x03) over UD sockets. > + > + > +# TODO: Can this be done as a decorator? Implement as needed. > IDK if it makes sense here, but perhaps creating a ContextManager could be one option: http://docs.python.org/2/reference/compound_stmts.html#the-with-statement/ > +def RunWithLocks(fn, locks, timeout, *args, **kwargs): > + """ Runs the given function, acquiring a set of locks beforehand. > + > + @type fn: function > + @param fn: The function to invoke. > + @type locks: dict of string to list of string > + @param locks: The locks to acquire, per lock category. > + @type timeout: number > + @param timeout: The number of seconds the locks should be held before > + expiring. > + > + This function allows a set of locks to be acquired in preparation for a > QA > + test, to try and see if the function can run in parallel with other > + operations. > + > + The current version simply creates the locks, which expire after a given > + timeout, and attempts to invoke the provided function. > + > + This will probably block the QA, and future versions will address this. > + > + A default timeout is not provided by design - the test creator must > make a > + good conservative estimate. > + > + """ > + if filter(lambda l_type: l_type not in AVAILABLE_LOCKS, locks): > + raise qa_error.Error("Attempted to acquire locks that cannot yet be " > + "acquired in the course of a QA test.") > + > + # The watcher may interfere by issuing its own jobs - therefore pause it > + AssertCommand(["gnt-cluster", "watcher", "pause", "12h"]) > + > + termination_socket = _StartDelayFunction(locks, timeout) > + > + fn(*args, **kwargs) > + > + _TerminateDelayFunction(termination_socket) > + > + # Revive the watcher > + AssertCommand(["gnt-cluster", "watcher", "continue"]) > diff --git a/qa/qa_rapi.py b/qa/qa_rapi.py > index 3164cb5..9ad8afb 100644 > --- a/qa/qa_rapi.py > +++ b/qa/qa_rapi.py > @@ -52,6 +52,7 @@ import qa_utils > from qa_instance import IsDiskReplacingSupported > from qa_instance import IsFailoverSupported > from qa_instance import IsMigrationSupported > +from qa_job_utils import RunWithLocks > from qa_utils import (AssertEqual, AssertIn, AssertMatch, > StartLocalCommand) > from qa_utils import InstanceCheck, INST_DOWN, INST_UP, FIRST_ARG > > @@ -917,7 +918,10 @@ def _InvokeMoveInstance(current_dest_inst, > current_src_inst, rapi_pw_filename, > "--dest-secondary-node=%s" % snode, > ]) > else: > - cmd.append("--iallocator=%s" % constants.IALLOC_HAIL) > + cmd.extend([ > + "--iallocator=%s" % constants.IALLOC_HAIL, > + "--opportunistic-tries=1", > + ]) > > cmd.extend([ > "--net=0:mac=%s" % constants.VALUE_GENERATE, > @@ -959,10 +963,14 @@ def TestInterClusterInstanceMove(src_instance, > dest_instance, > snode = tnode > pnode = inodes[0] > > - # pnode:snode are the *current* nodes, so we move it first to > tnode:pnode > - _InvokeMoveInstance(dest_instance.name, src_instance.name, > rapi_pw_file.name, > - master.primary, perform_checks, > - target_nodes=(tnode.primary, pnode.primary)) > + # pnode:snode are the *current* nodes, and the first move is an > + # iallocator-guided move outside of pnode. The node lock for the pnode > + # assures that this happens, and while we cannot be sure where the > instance > + # will land, it is a real move. > + locks = {locking.LEVEL_NODE: [pnode.primary]} > + RunWithLocks(_InvokeMoveInstance, locks, 600.0, > + dest_instance.name, src_instance.name, rapi_pw_file.name, > + master.primary, perform_checks) > > # And then back to pnode:snode > _InvokeMoveInstance(src_instance.name, dest_instance.name, > rapi_pw_file.name, > -- > 1.9.0.rc1.175.g0b1dcb5 > > LGTM, thanks.
