Hello all,

I have brought up a test Pacemaker cluster with the following components:

Pacemaker 1.1.8-7.el6 (Build: 394e906)
DRBD 8.4.4
CMAN 3.0.12

I am trying to make KVM guests highly available using 2 server hosts. KVM
guests are not included in pacemaker cluster. It is only KVM hosts that are
included in pacemaker cluster.

The Dummy service and DRBD seem to migrate cleanly when I shut down the
service on one of the physical nodes. However, the same thing does not
happen for the VirtualDomain resource.

I am using the ocf:heartbeat:VirtualDomain resource to manage the KVM guest
(rsc_lvpvm01), and tried to include colocation and order constraints
with the DRBD block device (VmData2Clone), as below:



pcs -f fs_cfg constraint colocation add rsc_lvpvm01 VmData2Clone
INFINITY with-rsc-role=Master

pcs -f fs_cfg constraint order promote VmData2Clone then start rsc_lvpvm01


I have created the VirtualDomain resource as below :

pcs -f kvm_cfg resource create rsc_lvpvm01 ocf:heartbeat:VirtualDomain
hypervisor="qemu:///system" config="/etc/libvirt/qemu/lvpvm01.xml"
meta allow-migrate="true" op monitor timeout="30" interval="10" op
start timeout="120s" op stop timeout="120s"


I have just included IMM fencing, for the two physical nodes.


These are the steps that i observed :

1. When I shut down the pacemaker service on a physical node, it waits indefinitely at
 " waiting for managed resources to shutdown...."

2. As soon as I stop the resource using "pcs resource
rsc_lvpvm01 stop", the above service shutdown completes
cleanly and I see that my services are migrated, but the
VirtualDomain resource does not start on the other
physical node unless I manually start it there using
"pcs resource rsc_lvpvm01 start".

3. If I forcefully shut down my KVM physical host, VirtualDomain does not
start on the other node as I would expect it to.

4. I am able to start VirtualDomain on the other node only if I
manually stop the resource as in step 2, instead
of abruptly powering off the physical host.


It might be that I need to add better constraints, delays and fencing
for VirtualDomain, but I do not understand where
exactly the problem is.

May I please ask for some help with this issue?

Please find my CIB dump attached.

Thanks,

Lohit
<cib admin_epoch="0" cib-last-written="Sat Nov  9 16:37:19 2013" crm_feature_set="3.0.7" epoch="57" have-quorum="0" num_updates="3" update-client="crm_resource" update-origin="scl-p01" validate-with="pacemaker-1.2" dc-uuid="scl-p01">
  <configuration>
    <!-- Cluster-wide properties: CMAN is the cluster infrastructure, fencing is switched on
         (stonith-enabled=true) and no-quorum-policy is set to "ignore". -->
    <crm_config>
      <cluster_property_set id="cib-bootstrap-options">
        <nvpair id="cib-bootstrap-options-dc-version" name="dc-version" value="1.1.8-7.el6-394e906"/>
        <nvpair id="cib-bootstrap-options-cluster-infrastructure" name="cluster-infrastructure" value="cman"/>
        <nvpair id="cib-bootstrap-options-stonith-enabled" name="stonith-enabled" value="true"/>
        <nvpair id="cib-bootstrap-options-no-quorum-policy" name="no-quorum-policy" value="ignore"/>
      </cluster_property_set>
    </crm_config>
    <!-- The two cluster members (the KVM hosts): scl-p01 and scl-p03. -->
    <nodes>
      <node id="scl-p01" uname="scl-p01"/>
      <node id="scl-p03" uname="scl-p03"/>
    </nodes>
    <resources>
      <!-- Test resource using the ocf:pacemaker:Dummy agent; no agent parameters are set and
           it is monitored every 120s. -->
      <primitive class="ocf" id="my_first_svc" provider="pacemaker" type="Dummy">
        <instance_attributes id="my_first_svc-instance_attributes"/>
        <operations>
          <op id="my_first_svc-interval-120s" interval="120s" name="monitor"/>
        </operations>
      </primitive>
      <!-- Fence device (fence_imm agent) whose pcmk_host_list names scl-p01; it talks to
           ipaddr 172.18.239.199 and is monitored every 60s.
           NOTE(review): login and passwd are stored here in clear text; treat them as
           disclosed now that this dump has been posted.
           NOTE(review): the status section of this dump records a failed start of this
           device on scl-p01 together with fail-count INFINITY on that node. -->
      <primitive class="stonith" id="immfencing_sclp01" type="fence_imm">
        <instance_attributes id="immfencing_sclp01-instance_attributes">
          <nvpair id="immfencing_sclp01-instance_attributes-pcmk_host_list" name="pcmk_host_list" value="scl-p01"/>
          <nvpair id="immfencing_sclp01-instance_attributes-ipaddr" name="ipaddr" value="172.18.239.199"/>
          <nvpair id="immfencing_sclp01-instance_attributes-login" name="login" value="USERID"/>
          <nvpair id="immfencing_sclp01-instance_attributes-passwd" name="passwd" value="PASSW0RD"/>
        </instance_attributes>
        <operations>
          <op id="immfencing_sclp01-interval-60s" interval="60s" name="monitor"/>
        </operations>
      </primitive>
      <!-- Fence device (fence_imm agent) whose pcmk_host_list names scl-p03; it talks to
           ipaddr 172.18.239.197 and is monitored every 60s.
           NOTE(review): login and passwd are stored here in clear text; treat them as
           disclosed now that this dump has been posted. -->
      <primitive class="stonith" id="immfencing_sclp03" type="fence_imm">
        <instance_attributes id="immfencing_sclp03-instance_attributes">
          <nvpair id="immfencing_sclp03-instance_attributes-pcmk_host_list" name="pcmk_host_list" value="scl-p03"/>
          <nvpair id="immfencing_sclp03-instance_attributes-ipaddr" name="ipaddr" value="172.18.239.197"/>
          <nvpair id="immfencing_sclp03-instance_attributes-login" name="login" value="USERID"/>
          <nvpair id="immfencing_sclp03-instance_attributes-passwd" name="passwd" value="PASSW0RD"/>
        </instance_attributes>
        <operations>
          <op id="immfencing_sclp03-interval-60s" interval="60s" name="monitor"/>
        </operations>
      </primitive>
      <!-- Master/slave wrapper around the ocf:linbit:drbd agent for DRBD resource kvmdata2:
           two instances in total (clone-max=2), one per node (clone-node-max=1), at most one
           promoted to Master (master-max=1), with notifications enabled.
           NOTE(review): only a single monitor op (60s, no role given) is defined; confirm
           whether separate per-role monitors are wanted. -->
      <master id="VmData2Clone">
        <primitive class="ocf" id="VmData2" provider="linbit" type="drbd">
          <instance_attributes id="VmData2-instance_attributes">
            <nvpair id="VmData2-instance_attributes-drbd_resource" name="drbd_resource" value="kvmdata2"/>
          </instance_attributes>
          <operations>
            <op id="VmData2-interval-60s" interval="60s" name="monitor"/>
          </operations>
        </primitive>
        <meta_attributes id="VmData2Clone-meta_attributes">
          <nvpair id="VmData2Clone-meta_attributes-master-max" name="master-max" value="1"/>
          <nvpair id="VmData2Clone-meta_attributes-master-node-max" name="master-node-max" value="1"/>
          <nvpair id="VmData2Clone-meta_attributes-clone-max" name="clone-max" value="2"/>
          <nvpair id="VmData2Clone-meta_attributes-clone-node-max" name="clone-node-max" value="1"/>
          <nvpair id="VmData2Clone-meta_attributes-notify" name="notify" value="true"/>
        </meta_attributes>
      </master>
      <!-- KVM guest managed through the ocf:heartbeat:VirtualDomain agent, using the libvirt
           definition /etc/libvirt/qemu/lvpvm01.xml on hypervisor qemu:///system.
           NOTE(review): allow-migrate is stored below as an instance attribute (an agent
           parameter) while meta_attributes is empty. Pacemaker documents allow-migrate as a
           resource meta attribute, so live migration is presumably not enabled; confirm.
           NOTE(review): only a monitor op (interval 10, timeout 120s) is defined here; the
           start/stop timeouts from the pcs command quoted in the mail body are absent. -->
      <primitive class="ocf" id="rsc_lvpvm01" provider="heartbeat" type="VirtualDomain">
        <instance_attributes id="rsc_lvpvm01-instance_attributes">
          <nvpair id="rsc_lvpvm01-instance_attributes-hypervisor" name="hypervisor" value="qemu:///system"/>
          <nvpair id="rsc_lvpvm01-instance_attributes-config" name="config" value="/etc/libvirt/qemu/lvpvm01.xml"/>
          <nvpair id="rsc_lvpvm01-instance_attributes-allow-migrate" name="allow-migrate" value="true"/>
        </instance_attributes>
        <operations>
          <op id="rsc_lvpvm01-timeout-120s" interval="10" name="monitor" timeout="120s"/>
        </operations>
        <meta_attributes id="rsc_lvpvm01-meta_attributes"/>
      </primitive>
    </resources>
    <constraints>
      <!-- rsc_lvpvm01 must run on the node where VmData2Clone is in the Master role. -->
      <rsc_colocation id="colocation-rsc_lvpvm01-VmData2Clone-INFINITY" rsc="rsc_lvpvm01" score="INFINITY" with-rsc="VmData2Clone" with-rsc-role="Master"/>
      <!-- VmData2Clone is promoted first, then rsc_lvpvm01 is started. -->
      <rsc_order first="VmData2Clone" first-action="promote" id="order-VmData2Clone-rsc_lvpvm01-mandatory" then="rsc_lvpvm01" then-action="start"/>
      <!-- Location rule: the Master role of VmData2Clone scores -INFINITY on every node whose
           #uname is not scl-p01, so promotion is only possible on scl-p01. Combined with the
           colocation above, rsc_lvpvm01 can therefore only run on scl-p01 while this
           constraint exists.
           NOTE(review): the id suggests this was inserted by a DRBD fence-peer handler rather
           than by hand; confirm, and check whether it should have been removed after resync. -->
      <rsc_location rsc="VmData2Clone" id="drbd-fence-by-handler-kvmdata2-VmData2Clone">
        <rule role="Master" score="-INFINITY" id="drbd-fence-by-handler-kvmdata2-rule-VmData2Clone">
          <expression attribute="#uname" operation="ne" value="scl-p01" id="drbd-fence-by-handler-kvmdata2-expr-VmData2Clone"/>
        </rule>
      </rsc_location>
    </constraints>
    <!-- Resource defaults: migration-threshold=1 applies to every resource that does not
         override it. NOTE(review): per the Pacemaker documentation this means a single
         failure is enough to move a resource away from a node; confirm that is intended. -->
    <rsc_defaults>
      <meta_attributes id="rsc_defaults-options">
        <nvpair id="rsc_defaults-options-migration-threshold" name="migration-threshold" value="1"/>
      </meta_attributes>
    </rsc_defaults>
  </configuration>
  <status>
    <node_state id="scl-p01" uname="scl-p01" in_ccm="true" crmd="online" crm-debug-origin="do_update_resource" join="member" expected="member">
      <lrm id="scl-p01">
        <lrm_resources>
          <lrm_resource id="immfencing_sclp01" type="fence_imm" class="stonith">
            <lrm_rsc_op id="immfencing_sclp01_last_0" operation_key="immfencing_sclp01_stop_0" operation="stop" crm-debug-origin="do_update_resource" crm_feature_set="3.0.7" transition-key="1:8:0:7ea244c5-ad9c-42b3-8d90-af304164733e" transition-magic="0:0;1:8:0:7ea244c5-ad9c-42b3-8d90-af304164733e" call-id="68" rc-code="0" op-status="0" interval="0" last-run="1384032602" last-rc-change="0" exec-time="1" queue-time="0" op-digest="04ca4248a94e11f564b39ea42fe425ed"/>
            <lrm_rsc_op id="immfencing_sclp01_last_failure_0" operation_key="immfencing_sclp01_start_0" operation="start" crm-debug-origin="do_update_resource" crm_feature_set="3.0.7" transition-key="10:5:0:7ea244c5-ad9c-42b3-8d90-af304164733e" transition-magic="4:1;10:5:0:7ea244c5-ad9c-42b3-8d90-af304164733e" call-id="62" rc-code="1" op-status="4" interval="0" last-run="1384032581" last-rc-change="0" exec-time="19070" queue-time="0" op-digest="04ca4248a94e11f564b39ea42fe425ed"/>
          </lrm_resource>
          <lrm_resource id="immfencing_sclp03" type="fence_imm" class="stonith">
            <lrm_rsc_op id="immfencing_sclp03_last_0" operation_key="immfencing_sclp03_start_0" operation="start" crm-debug-origin="do_update_resource" crm_feature_set="3.0.7" transition-key="19:0:0:7ea244c5-ad9c-42b3-8d90-af304164733e" transition-magic="0:0;19:0:0:7ea244c5-ad9c-42b3-8d90-af304164733e" call-id="31" rc-code="0" op-status="0" interval="0" last-run="1384032511" last-rc-change="0" exec-time="1427" queue-time="0" op-digest="6f9ab2ca5b9c555d7cf579742032cf12"/>
            <lrm_rsc_op id="immfencing_sclp03_monitor_60000" operation_key="immfencing_sclp03_monitor_60000" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.7" transition-key="13:1:0:7ea244c5-ad9c-42b3-8d90-af304164733e" transition-magic="0:0;13:1:0:7ea244c5-ad9c-42b3-8d90-af304164733e" call-id="44" rc-code="0" op-status="0" interval="60000" last-rc-change="0" exec-time="425" queue-time="0" op-digest="30034225d48a9ec43741e5b189b54e82"/>
          </lrm_resource>
          <lrm_resource id="my_first_svc" type="Dummy" class="ocf" provider="pacemaker">
            <lrm_rsc_op id="my_first_svc_last_0" operation_key="my_first_svc_start_0" operation="start" crm-debug-origin="do_update_resource" crm_feature_set="3.0.7" transition-key="15:0:0:7ea244c5-ad9c-42b3-8d90-af304164733e" transition-magic="0:0;15:0:0:7ea244c5-ad9c-42b3-8d90-af304164733e" call-id="29" rc-code="0" op-status="0" interval="0" last-run="1384032511" last-rc-change="0" exec-time="1003" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8" op-force-restart=" state  op_sleep " op-restart-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
            <lrm_rsc_op id="my_first_svc_monitor_120000" operation_key="my_first_svc_monitor_120000" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.7" transition-key="16:0:0:7ea244c5-ad9c-42b3-8d90-af304164733e" transition-magic="0:0;16:0:0:7ea244c5-ad9c-42b3-8d90-af304164733e" call-id="36" rc-code="0" op-status="0" interval="120000" last-rc-change="0" exec-time="13" queue-time="0" op-digest="4811cef7f7f94e3a35a70be7916cb2fd"/>
          </lrm_resource>
          <lrm_resource id="VmData2" type="drbd" class="ocf" provider="linbit">
            <lrm_rsc_op id="VmData2_last_0" operation_key="VmData2_promote_0" operation="promote" crm-debug-origin="do_update_resource" crm_feature_set="3.0.7" transition-key="13:10:0:7ea244c5-ad9c-42b3-8d90-af304164733e" transition-magic="0:0;13:10:0:7ea244c5-ad9c-42b3-8d90-af304164733e" call-id="89" rc-code="0" op-status="0" interval="0" last-run="1384032991" last-rc-change="0" exec-time="164" queue-time="0" op-digest="fac6bfaba1e8ca402cba91e04bbdce78"/>
          </lrm_resource>
          <lrm_resource id="rsc_lvpvm01" type="VirtualDomain" class="ocf" provider="heartbeat">
            <lrm_rsc_op id="rsc_lvpvm01_last_0" operation_key="rsc_lvpvm01_start_0" operation="start" crm-debug-origin="do_update_resource" crm_feature_set="3.0.7" transition-key="38:13:0:7ea244c5-ad9c-42b3-8d90-af304164733e" transition-magic="0:0;38:13:0:7ea244c5-ad9c-42b3-8d90-af304164733e" call-id="95" rc-code="0" op-status="0" interval="0" last-run="1384033039" last-rc-change="0" exec-time="524" queue-time="0" op-digest="7ef4b6ea866625f317c52028c534c02e"/>
            <lrm_rsc_op id="rsc_lvpvm01_monitor_10000" operation_key="rsc_lvpvm01_monitor_10000" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.7" transition-key="39:13:0:7ea244c5-ad9c-42b3-8d90-af304164733e" transition-magic="0:0;39:13:0:7ea244c5-ad9c-42b3-8d90-af304164733e" call-id="98" rc-code="0" op-status="0" interval="10000" last-rc-change="0" exec-time="61" queue-time="0" op-digest="2ea83e4c02f5646ff17a39f2c64bb885"/>
          </lrm_resource>
        </lrm_resources>
      </lrm>
      <transient_attributes id="scl-p01">
        <instance_attributes id="status-scl-p01">
          <nvpair id="status-scl-p01-probe_complete" name="probe_complete" value="true"/>
          <nvpair id="status-scl-p01-master-VmData2" name="master-VmData2" value="10000"/>
          <nvpair id="status-scl-p01-fail-count-immfencing_sclp01" name="fail-count-immfencing_sclp01" value="INFINITY"/>
          <nvpair id="status-scl-p01-last-failure-immfencing_sclp01" name="last-failure-immfencing_sclp01" value="1384032602"/>
        </instance_attributes>
      </transient_attributes>
    </node_state>
    <node_state id="scl-p03" uname="scl-p03" in_ccm="false" crmd="offline" crm-debug-origin="post_cache_update" join="member" expected="down">
      <lrm id="scl-p03">
        <lrm_resources>
          <lrm_resource id="immfencing_sclp01" type="fence_imm" class="stonith">
            <lrm_rsc_op id="immfencing_sclp01_last_0" operation_key="immfencing_sclp01_stop_0" operation="stop" crm-debug-origin="do_update_resource" crm_feature_set="3.0.7" transition-key="12:4:0:7ea244c5-ad9c-42b3-8d90-af304164733e" transition-magic="0:0;12:4:0:7ea244c5-ad9c-42b3-8d90-af304164733e" call-id="57" rc-code="0" op-status="0" interval="0" last-run="1384032581" last-rc-change="0" exec-time="1" queue-time="0" op-digest="04ca4248a94e11f564b39ea42fe425ed"/>
            <lrm_rsc_op id="immfencing_sclp01_monitor_60000" operation_key="immfencing_sclp01_monitor_60000" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.7" transition-key="10:1:0:7ea244c5-ad9c-42b3-8d90-af304164733e" transition-magic="0:0;10:1:0:7ea244c5-ad9c-42b3-8d90-af304164733e" call-id="38" rc-code="0" op-status="0" interval="60000" last-rc-change="0" exec-time="552" queue-time="0" op-digest="524053ffd574d0949d48fbdbcf2cefed"/>
          </lrm_resource>
          <lrm_resource id="immfencing_sclp03" type="fence_imm" class="stonith">
            <lrm_rsc_op id="immfencing_sclp03_last_0" operation_key="immfencing_sclp03_monitor_0" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.7" transition-key="12:0:7:7ea244c5-ad9c-42b3-8d90-af304164733e" transition-magic="0:7;12:0:7:7ea244c5-ad9c-42b3-8d90-af304164733e" call-id="13" rc-code="7" op-status="0" interval="0" last-run="1384032507" last-rc-change="0" exec-time="0" queue-time="0" op-digest="6f9ab2ca5b9c555d7cf579742032cf12"/>
          </lrm_resource>
          <lrm_resource id="my_first_svc" type="Dummy" class="ocf" provider="pacemaker">
            <lrm_rsc_op id="my_first_svc_last_0" operation_key="my_first_svc_monitor_0" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.7" transition-key="10:0:7:7ea244c5-ad9c-42b3-8d90-af304164733e" transition-magic="0:7;10:0:7:7ea244c5-ad9c-42b3-8d90-af304164733e" call-id="5" rc-code="7" op-status="0" interval="0" last-run="1384032507" last-rc-change="1384032507" exec-time="13" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8" op-force-restart=" state  op_sleep " op-restart-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
          </lrm_resource>
          <lrm_resource id="VmData2" type="drbd" class="ocf" provider="linbit">
            <lrm_rsc_op id="VmData2_last_0" operation_key="VmData2_stop_0" operation="stop" crm-debug-origin="do_update_resource" crm_feature_set="3.0.7" transition-key="16:9:0:7ea244c5-ad9c-42b3-8d90-af304164733e" transition-magic="0:0;16:9:0:7ea244c5-ad9c-42b3-8d90-af304164733e" call-id="87" rc-code="0" op-status="0" interval="0" last-run="1384032990" last-rc-change="0" exec-time="165" queue-time="0" op-digest="fac6bfaba1e8ca402cba91e04bbdce78"/>
          </lrm_resource>
          <lrm_resource id="rsc_lvpvm01" type="VirtualDomain" class="ocf" provider="heartbeat">
            <lrm_rsc_op id="rsc_lvpvm01_last_0" operation_key="rsc_lvpvm01_stop_0" operation="stop" crm-debug-origin="do_update_resource" crm_feature_set="3.0.7" transition-key="41:9:0:7ea244c5-ad9c-42b3-8d90-af304164733e" transition-magic="0:0;41:9:0:7ea244c5-ad9c-42b3-8d90-af304164733e" call-id="71" rc-code="0" op-status="0" interval="0" last-run="1384032984" last-rc-change="0" exec-time="6290" queue-time="0" op-digest="7ef4b6ea866625f317c52028c534c02e"/>
            <lrm_rsc_op id="rsc_lvpvm01_monitor_10000" operation_key="rsc_lvpvm01_monitor_10000" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.7" transition-key="47:2:0:7ea244c5-ad9c-42b3-8d90-af304164733e" transition-magic="0:0;47:2:0:7ea244c5-ad9c-42b3-8d90-af304164733e" call-id="53" rc-code="0" op-status="0" interval="10000" last-rc-change="0" exec-time="62" queue-time="1" op-digest="2ea83e4c02f5646ff17a39f2c64bb885"/>
          </lrm_resource>
        </lrm_resources>
      </lrm>
      <transient_attributes id="scl-p03">
        <instance_attributes id="status-scl-p03">
          <nvpair id="status-scl-p03-probe_complete" name="probe_complete" value="true"/>
          <nvpair id="status-scl-p03-standby" name="standby" value="true"/>
          <nvpair id="status-scl-p03-shutdown" name="shutdown" value="1384032992"/>
        </instance_attributes>
      </transient_attributes>
    </node_state>
  </status>
</cib>
_______________________________________________
Pacemaker mailing list: Pacemaker@oss.clusterlabs.org
http://oss.clusterlabs.org/mailman/listinfo/pacemaker

Project Home: http://www.clusterlabs.org
Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
Bugs: http://bugs.clusterlabs.org

Reply via email to