Hi, 16.02.2012 02:02, Jake Smith wrote: > When using upstart jobs in Pacemaker I haven't been able to find > much of anything for documentation. After reading a post a few minutes ago by > Andreas I wanted to verify... > > Are upstart jobs expected to conform to the LSB spec with regards to exit > codes, etc? > Is there any reference documentation using upstart resources in Pacemaker? > Or any good advice :-)
Newer versions of pacemaker and lrmd are able to deal with upstart resources via dbus. However, I do not like this approach, so please find attached a resource agent which is able to manage an arbitrary upstart job (just like the Anything agent, but for upstart resources). It has already saved me much time and nerves managing libvirtd (with my own upstart job), which, as you probably already know, always wants to SIGABRT (btw, I even know the main reason for that and am now testing a patch which I will hopefully send to the libvirt ml). Best, Vladislav
#!/bin/bash
#
# OCF resource agent which manages upstart jobs.
#
# Copyright (c) 2011 Vladislav Bogdanov <bub...@hoster-ok.com>
#
# OCF instance parameters:
#   OCF_RESKEY_job_name:      name of upstart job (optionally followed by
#                             instance arguments, e.g. "my_job INSTANCE=1")
#   OCF_RESKEY_process_name:  name of process
#   OCF_RESKEY_check_command: additional command to run on monitor
#   OCF_RESKEY_check_timeout: seconds to wait for check_command (default 5)
#   OCF_RESKEY_check_action:  command to run when check_command fails
#

# Initialization:
: "${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}"
. "${OCF_FUNCTIONS_DIR}/ocf-shellfuncs"

#######################################
# Print the resource agent metadata (XML) to stdout.
#######################################
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="upstart-resource">
<version>1.0</version>
<longdesc lang="en">
This RA manages upstart jobs as HA resources.
</longdesc>
<shortdesc lang="en">Manage upstart job</shortdesc>
<parameters>
<parameter name="job_name" unique="1" required="1">
<longdesc lang="en">
The name of the upstart job. Can also contain job instance appended after space. Example: job_name="my_job INSTANCE=1"
</longdesc>
<shortdesc lang="en">Job name</shortdesc>
<content type="string"/>
</parameter>
<parameter name="process_name" unique="0" required="1">
<longdesc lang="en">
The name of the process which is to be launched by upstart job.
</longdesc>
<shortdesc lang="en">Process name</shortdesc>
<content type="string"/>
</parameter>
<parameter name="check_command" unique="0" required="0">
<longdesc lang="en">
Additional command to run on mointor.
</longdesc>
<shortdesc lang="en">Additional monitor command</shortdesc>
<content type="string"/>
</parameter>
<parameter name="check_timeout" unique="0" required="0">
<longdesc lang="en">
How many seconds to wait for check command to finish.
</longdesc>
<shortdesc lang="en">Monitor command timeout</shortdesc>
<content type="integer" default="5" />
</parameter>
<parameter name="check_action" unique="0" required="0">
<longdesc lang="en">
What to run if monitor command fails or times out.
</longdesc>
<shortdesc lang="en">Monitor failure action</shortdesc>
<content type="string"/>
</parameter>
</parameters>
<actions>
<action name="start" timeout="30" />
<action name="stop" timeout="30" />
<action name="reload" timeout="30" />
<action name="monitor" depth="0" timeout="30" interval="10"/>
<action name="meta-data" timeout="5" />
<action name="validate-all" timeout="30" />
</actions>
</resource-agent>
END
}

#######################################
# Print a short usage message to stdout.
#######################################
usage() {
    cat <<END
usage: $0 {start|stop|reload|monitor|validate-all|meta-data}

Expects to have a fully populated OCF RA-compliant environment set.
END
}

#######################################
# Start the upstart job unless monitor already reports it as running.
# Arguments: $1 - last known "initctl status" line for the job
# Returns:   OCF_SUCCESS if the job is (or becomes) running,
#            OCF_ERR_GENERIC otherwise
#######################################
start() {
    local status=$1
    local ret

    monitor "${status}"
    if [ $? -eq $OCF_SUCCESS ]; then
        return $OCF_SUCCESS
    fi

    # job_name is left unquoted on purpose: it may carry instance
    # arguments ("my_job INSTANCE=1") that must become separate words.
    # shellcheck disable=SC2086
    status=$( initctl start ${OCF_RESKEY_job_name} 2>&1 )

    monitor "${status}"
    case $? in
        $OCF_SUCCESS)
            ocf_log info "Upstart job ${OCF_RESKEY_job_name} started successfully."
            ret=$OCF_SUCCESS
            ;;
        *)
            ocf_log err "Failed to start upstart job ${OCF_RESKEY_job_name}."
            ret=$OCF_ERR_GENERIC
            ;;
    esac
    return ${ret}
}

#######################################
# Stop the upstart job unless monitor already reports it as not running.
# Arguments: $1 - last known "initctl status" line for the job
# Returns:   OCF_SUCCESS if the job is (or becomes) stopped,
#            OCF_ERR_GENERIC otherwise
#######################################
stop() {
    local status=$1
    local ret

    monitor "${status}"
    if [ $? -eq $OCF_NOT_RUNNING ]; then
        return $OCF_SUCCESS
    fi

    # Unquoted on purpose, see start().
    # shellcheck disable=SC2086
    status=$( initctl stop ${OCF_RESKEY_job_name} 2>&1 )

    monitor "${status}"
    case $? in
        $OCF_NOT_RUNNING)
            ocf_log info "Upstart job ${OCF_RESKEY_job_name} stopped successfully."
            ret=$OCF_SUCCESS
            ;;
        *)
            ocf_log err "Failed to stop upstart job ${OCF_RESKEY_job_name}."
            ret=$OCF_ERR_GENERIC
            ;;
    esac
    return ${ret}
}

#######################################
# Query "initctl status" for the job and store the first output line
# in the caller's variable.
# Arguments: $1 - NAME of the variable that receives the status line
#                 (must not be "_output", which is local here)
# Exits:     OCF_ERR_INSTALLED when initctl reports "Unknown job"
#######################################
get_status() {
    local _output

    # Unquoted on purpose, see start().
    # shellcheck disable=SC2086
    _output=$( initctl status ${OCF_RESKEY_job_name} 2>&1 )

    if [[ "${_output}" == *"Unknown job"* ]]; then
        ocf_log err "Unknown upstart job ${OCF_RESKEY_job_name}"
        exit $OCF_ERR_INSTALLED
    fi

    # Leave only first line (main process)
    _output=${_output%%$'\n'*}

    # Store job status for later consumption (no eval needed)
    printf -v "$1" '%s' "${_output}"
}

#######################################
# Decide whether the job's main process is running.
#
# The status line passed in is examined first; while the job is in a
# "start/..." state but the expected process is not there yet, the status
# is re-polled once a second until the attempts budget is used up.
# When the process is found and check_command is configured, that command
# is run under "timeout"; on failure check_action (if any) is run. The
# outcome of check_command does not change the return code.
#
# Arguments: $1 - "initctl status" (or start/stop) output line
# Returns:   OCF_SUCCESS or OCF_NOT_RUNNING
#######################################
monitor() {
    local status=$1
    local pid
    local ret=$OCF_NOT_RUNNING
    local process
    local state
    local rc
    local i=0

    # initctl prints only the bare job name, followed by the instance in
    # parentheses, so drop any "KEY=value" instance arguments before
    # building the pattern.
    local job=${OCF_RESKEY_job_name%% *}

    # upstart can report:
    # <job_name> (instance) start/[running|pre-start], process (item0) pid
    # Groups: 1 = instance, 2 = state, 3 = item, 4 = pid
    local re="^${job}( \(.+\))? start/([a-z-]+), process (\(.+\) )?([0-9]+)"

    # Operation timeout minus 5 seconds. Fall back to the 30s advertised in
    # the metadata when the cluster manager did not export a timeout, so the
    # arithmetic cannot fail on an empty value.
    local attempts=$(( ${OCF_RESKEY_CRM_meta_timeout:-30000} / 1000 - 5 ))

    if ocf_is_decimal "${OCF_RESKEY_check_timeout}" ; then
        attempts=$(( attempts - OCF_RESKEY_check_timeout ))
    fi
    if [ ${attempts} -le 0 ] ; then
        attempts=0
    fi

    # We first receive output from outside, then re-poll for it
    while [ ${ret} -eq $OCF_NOT_RUNNING ] ; do
        if [[ ! "${status}" =~ ${re} ]] ; then
            # Job is not launched
            ocf_log info "${status}"
            ret=$OCF_NOT_RUNNING
            break
        fi

        state=${BASH_REMATCH[2]}
        if [[ "${state}" == "running" ]] ; then
            pid=${BASH_REMATCH[4]}
            if kill -0 "${pid}" 2>/dev/null ; then
                process=$( awk '/^Name:/ {print $2}' "/proc/${pid}/status" 2>/dev/null )
                if [[ "${process}" == "${OCF_RESKEY_process_name}" ]] ; then
                    ret=$OCF_SUCCESS
                elif (( i == 0 )) ; then
                    # job is started, but it did not yet launch the process itself
                    ocf_log info "pid ${pid} corresponds to process ${process} instead of ${OCF_RESKEY_process_name}, waiting."
                fi
            elif (( i == 0 )) ; then
                # This will cause resource to be marked as 'Started FAILED'
                # with subsequent stop and start
                ocf_log info "upstart reports process ${pid} is running, but it really isn't, waiting."
            fi
        fi
        # Any other state (pre-start, ...): just keep waiting.

        if [ ${ret} -eq $OCF_NOT_RUNNING ] ; then
            # Wait for upstart to recover started job
            if (( i++ >= attempts )) ; then
                ocf_log err "Timed out waiting for process ${OCF_RESKEY_process_name} pid ${pid} to appear."
                break
            fi
            sleep 1
            get_status status
        fi
    done

    if [ "${ret}" = "$OCF_SUCCESS" ] && [ -n "${OCF_RESKEY_check_command}" ] ; then
        if ! ocf_is_decimal "${OCF_RESKEY_check_timeout}" ; then
            OCF_RESKEY_check_timeout=5
        fi
        # check_command / check_action are command lines and are split into
        # words on purpose.
        # shellcheck disable=SC2086
        timeout -s KILL "${OCF_RESKEY_check_timeout}" ${OCF_RESKEY_check_command} >/dev/null 2>&1
        rc=$?
        if [ ${rc} -ne 0 ] ; then
            ocf_log warn "check_command '${OCF_RESKEY_check_command}' exited with status ${rc}."
            if [ -n "${OCF_RESKEY_check_action}" ] ; then
                ocf_log warn "Running repair command '${OCF_RESKEY_check_action}'."
                # shellcheck disable=SC2086
                ${OCF_RESKEY_check_action} >/dev/null 2>&1
            fi
        fi
    fi

    return ${ret}
}

#######################################
# Verify that initctl is available and the mandatory parameters are set.
# Exits:   OCF_ERR_CONFIGURED when job_name or process_name is empty
# Returns: OCF_SUCCESS otherwise
#######################################
validate() {
    check_binary initctl

    # Check the mandatory parameters
    if [ -z "${OCF_RESKEY_job_name}" ]; then
        ocf_log err "Empty job_name parameter."
        exit $OCF_ERR_CONFIGURED
    fi
    if [ -z "${OCF_RESKEY_process_name}" ]; then
        ocf_log err "Empty process_name parameter."
        exit $OCF_ERR_CONFIGURED
    fi
    return $OCF_SUCCESS
}

# Actions which need neither a valid configuration nor a running upstart.
case "${__OCF_ACTION}" in
    meta-data)
        meta_data
        exit $OCF_SUCCESS
        ;;
    usage|help)
        usage
        exit $OCF_SUCCESS
        ;;
esac

if [ "$(uname)" != "Linux" ] ; then
    ocf_log err "This RA works only on linux."
    exit $OCF_ERR_INSTALLED
fi

validate || exit $?

status=""
get_status status

case "${__OCF_ACTION}" in
    start)
        start "${status}"
        ;;
    stop)
        stop "${status}"
        ;;
    monitor)
        monitor "${status}"
        ;;
    reload)
        if monitor "${status}" ; then
            # Hand the current status to stop(); with an empty status its
            # initial monitor call reports "not running" and the job would
            # never actually be stopped.
            if stop "${status}" ; then
                # Re-poll job status
                get_status status
                start "${status}"
            else
                exit $OCF_ERR_GENERIC
            fi
        else
            start "${status}"
        fi
        ;;
    validate-all)
        ;;
    *)
        usage
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac
exit $?
_______________________________________________ Pacemaker mailing list: Pacemaker@oss.clusterlabs.org http://oss.clusterlabs.org/mailman/listinfo/pacemaker Project Home: http://www.clusterlabs.org Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf Bugs: http://bugs.clusterlabs.org