On Mon, Feb 24, 2014 at 12:47 PM, Petr Pudlák <[email protected]> wrote:
> > > > On Tue, Feb 18, 2014 at 3:39 PM, Hrvoje Ribicic <[email protected]> wrote: > >> This patch adds a QA utility function that acquires a set of locks, and >> attempts to run a given function with the locks in place. Should the >> given function block, this function does not detect this - later >> patches will address the issue. >> >> An example of its use is provided by having the move-instance test >> modified to use it. >> >> Signed-off-by: Hrvoje Ribicic <[email protected]> >> --- >> Makefile.am | 1 + >> qa/qa_job_utils.py | 149 >> +++++++++++++++++++++++++++++++++++++++++++++++++++++ >> qa/qa_rapi.py | 18 +++++-- >> 3 files changed, 163 insertions(+), 5 deletions(-) >> create mode 100644 qa/qa_job_utils.py >> >> diff --git a/Makefile.am b/Makefile.am >> index 8cc443e..4009eae 100644 >> --- a/Makefile.am >> +++ b/Makefile.am >> @@ -1010,6 +1010,7 @@ qa_scripts = \ >> qa/qa_instance.py \ >> qa/qa_instance_utils.py \ >> qa/qa_job.py \ >> + qa/qa_job_utils.py \ >> qa/qa_monitoring.py \ >> qa/qa_node.py \ >> qa/qa_os.py \ >> diff --git a/qa/qa_job_utils.py b/qa/qa_job_utils.py >> new file mode 100644 >> index 0000000..bc4733c >> --- /dev/null >> +++ b/qa/qa_job_utils.py >> @@ -0,0 +1,149 @@ >> +# >> +# >> + >> +# Copyright (C) 2014 Google Inc. >> +# >> +# This program is free software; you can redistribute it and/or modify >> +# it under the terms of the GNU General Public License as published by >> +# the Free Software Foundation; either version 2 of the License, or >> +# (at your option) any later version. >> +# >> +# This program is distributed in the hope that it will be useful, but >> +# WITHOUT ANY WARRANTY; without even the implied warranty of >> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> +# General Public License for more details. >> +# >> +# You should have received a copy of the GNU General Public License >> +# along with this program; if not, write to the Free Software >> +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA >> +# 02110-1301, USA. >> + >> + >> +"""QA utility functions for testing jobs >> + >> +""" >> + >> +import re >> + >> +from ganeti import constants >> +from ganeti import locking >> +from ganeti import utils >> + >> +import qa_config >> +import qa_error >> + >> +from qa_utils import AssertCommand, GetCommandOutput, GetObjectInfo >> + >> + >> +AVAILABLE_LOCKS = [locking.LEVEL_NODE, ] >> + >> + >> +def _GetOutputFromMaster(cmd): >> + """ Gets the output of a command executed on master. >> + >> + """ >> + if isinstance(cmd, basestring): >> + cmdstr = cmd >> + else: >> + cmdstr = utils.ShellQuoteArgs(cmd) >> + >> + # Necessary due to the stderr stream not being captured properly on the >> + # buildbot >> + cmdstr += " 2>&1" >> + >> + return GetCommandOutput(qa_config.GetMasterNode().primary, cmdstr) >> + >> + >> +def ExecuteJobProducingCommand(cmd): >> + """ Executes a --submit-using command producing a job, and returns an >> id. >> > This is somewhat confusing, I'd suggest to reword it to something like "a > command using --submit". > > Oh, good point, will change this. I will also reword the line to indicate the job has to be present. > + >> + @type cmd: list of string >> + @param cmd: The command to execute, broken into constituent components. >> + >> + """ >> + job_id_output = _GetOutputFromMaster(cmd) >> + >> + possible_job_ids = re.findall("JobID: ([0-9]+)", job_id_output) >> + if len(possible_job_ids) != 1: >> + raise qa_error.Error("Cannot parse command output to find job id: >> output " >> + "is %s" % job_id_output) >> + >> + return int(possible_job_ids[0]) >> + >> + >> +def _StartDelayFunction(locks, timeout): >> + """ Starts the gnt-debug delay option with the given locks and timeout. >> + >> + """ >> + # The interruptible switch must be used >> + cmd = ["gnt-debug", "delay", "-i", "--submit", "--no-master"] >> + >> + for node in locks.get(locking.LEVEL_NODE, []): >> + cmd.append("-n%s" % node) >> + >> + cmd.append(str(timeout)) >> + >> + job_id = ExecuteJobProducingCommand(cmd) >> + job_info = GetObjectInfo(["gnt-job", "info", str(job_id)]) >> + execution_logs = job_info[0]["Opcodes"][0]["Execution log"] >> + >> + is_termination_info_fn = \ >> + lambda e: e["Content"][1] == constants.ELOG_DELAY_TEST >> + filtered_logs = filter(is_termination_info_fn, execution_logs) >> + >> + if len(filtered_logs) != 1: >> + raise qa_error.Error("Failure when trying to retrieve delay >> termination " >> + "information") >> + >> + _, _, (socket_path, ) = filtered_logs[0]["Content"] >> + >> + return socket_path >> + >> + >> +def _TerminateDelayFunction(termination_socket): >> + """ Terminates the delay function by communicating with the domain >> socket. >> + >> + """ >> + AssertCommand("echo a | socat -u stdin UNIX-CLIENT:%s" % >> termination_socket) >> > > For the QA it's probably completely fine to call an external command. You > might also want to have a look at Transport in lib/rpc/transport.py, which > is used in Luxi and WConfd to send a EOM-terminated messages (0x03) over UD > sockets. > > I have looked at the class, but using it here confers no significant benefits, while introducing a more complex syntax and tying the QA to the LUXI protocol. Maybe it's best to keep this line as-is. > + >> + >> +# TODO: Can this be done as a decorator? Implement as needed. >> > > IDK if it makes sense here, but perhaps creating a ContextManager could be > one option: > http://docs.python.org/2/reference/compound_stmts.html#the-with-statement/ > As discussed offline, due to the need to run the code within the block in another thread, it is probably not an option, unfortunately. > > > >> +def RunWithLocks(fn, locks, timeout, *args, **kwargs): >> + """ Runs the given function, acquiring a set of locks beforehand. >> + >> + @type fn: function >> + @param fn: The function to invoke. >> + @type locks: dict of string to list of string >> + @param locks: The locks to acquire, per lock category. >> + @type timeout: number >> + @param timeout: The number of seconds the locks should be held before >> + expiring. >> + >> + This function allows a set of locks to be acquired in preparation for >> a QA >> + test, to try and see if the function can run in parallel with other >> + operations. >> + >> + The current version simply creates the locks, which expire after a >> given >> + timeout, and attempts to invoke the provided function. >> + >> + This will probably block the QA, and future versions will address this. >> + >> + A default timeout is not provided by design - the test creator must >> make a >> + good conservative estimate. >> + >> + """ >> + if filter(lambda l_type: l_type not in AVAILABLE_LOCKS, locks): >> + raise qa_error.Error("Attempted to acquire locks that cannot yet be " >> + "acquired in the course of a QA test.") >> + >> + # The watcher may interfere by issuing its own jobs - therefore pause >> it >> + AssertCommand(["gnt-cluster", "watcher", "pause", "12h"]) >> + >> + termination_socket = _StartDelayFunction(locks, timeout) >> + >> + fn(*args, **kwargs) >> + >> + _TerminateDelayFunction(termination_socket) >> + >> + # Revive the watcher >> + AssertCommand(["gnt-cluster", "watcher", "continue"]) >> diff --git a/qa/qa_rapi.py b/qa/qa_rapi.py >> index 3164cb5..9ad8afb 100644 >> --- a/qa/qa_rapi.py >> +++ b/qa/qa_rapi.py >> @@ -52,6 +52,7 @@ import qa_utils >> from qa_instance import IsDiskReplacingSupported >> from qa_instance import IsFailoverSupported >> from qa_instance import IsMigrationSupported >> +from qa_job_utils import RunWithLocks >> from qa_utils import (AssertEqual, AssertIn, AssertMatch, >> StartLocalCommand) >> from qa_utils import InstanceCheck, INST_DOWN, INST_UP, FIRST_ARG >> >> @@ -917,7 +918,10 @@ def _InvokeMoveInstance(current_dest_inst, >> current_src_inst, rapi_pw_filename, >> "--dest-secondary-node=%s" % snode, >> ]) >> else: >> - cmd.append("--iallocator=%s" % constants.IALLOC_HAIL) >> + cmd.extend([ >> + "--iallocator=%s" % constants.IALLOC_HAIL, >> + "--opportunistic-tries=1", >> + ]) >> >> cmd.extend([ >> "--net=0:mac=%s" % constants.VALUE_GENERATE, >> @@ -959,10 +963,14 @@ def TestInterClusterInstanceMove(src_instance, >> dest_instance, >> snode = tnode >> pnode = inodes[0] >> >> - # pnode:snode are the *current* nodes, so we move it first to >> tnode:pnode >> - _InvokeMoveInstance(dest_instance.name, src_instance.name, >> rapi_pw_file.name, >> - master.primary, perform_checks, >> - target_nodes=(tnode.primary, pnode.primary)) >> + # pnode:snode are the *current* nodes, and the first move is an >> + # iallocator-guided move outside of pnode. The node lock for the pnode >> + # assures that this happens, and while we cannot be sure where the >> instance >> + # will land, it is a real move. >> + locks = {locking.LEVEL_NODE: [pnode.primary]} >> + RunWithLocks(_InvokeMoveInstance, locks, 600.0, >> + dest_instance.name, src_instance.name, rapi_pw_file.name, >> + master.primary, perform_checks) >> >> # And then back to pnode:snode >> _InvokeMoveInstance(src_instance.name, dest_instance.name, >> rapi_pw_file.name, >> -- >> 1.9.0.rc1.175.g0b1dcb5 >> >> > LGTM, thanks. > > Thanks for the review, micro-interdiff is: diff --git a/qa/qa_job_utils.py b/qa/qa_job_utils.py index bc4733c..56e4b2e 100644 --- a/qa/qa_job_utils.py +++ b/qa/qa_job_utils.py @@ -55,7 +55,7 @@ def _GetOutputFromMaster(cmd): def ExecuteJobProducingCommand(cmd): - """ Executes a --submit-using command producing a job, and returns an id. + """ Executes a command that contains the --submit flag, and returns a job id. @type cmd: list of string @param cmd: The command to execute, broken into constituent components.
