[Linux-HA] 2.1.2 and failover of colocated resources

James Oakley Mon, 13 Aug 2007 10:02:35 -0700

I am having some trouble getting resources to fail over in my two-node 
Heartbeat 2.1.2 cluster.


I have attached my cib.

I created a number of resources and colocation rules. All of the resources 
need to run on the same node that has the drbd partition mounted.

If I kill the daemon process of one of the dependent resources, it is simply 
restarted, which is good. Unfortunately, if I make the failure more severe by 
renaming the daemon binary and then killing it, heartbeat simply calls the 
stop action and leaves it at that. It doesn't bother migrating the resources 
to the other node.

I understand that if all of the resources were in a group, the whole group 
would fail over, but that also means that if one resource in the group failed 
on both nodes, the whole group would be shut down, and I *definitely* don't 
want that.

Also, if a resource fails and I later fix the problem, I can't seem to get 
heartbeat to take it over again. I tried "crm_failcount -G -r <resource>" and 
starting the process manually, then calling "crm_resource -P -H <node> -r 
<resource>", but crm_mon still shows "(unmanaged) FAILED." The only thing 
that works is stopping and restarting heartbeat on both nodes.

-- 
James Oakley
[EMAIL PROTECTED]

<cib admin_epoch="0" ccm_transition="2" cib_feature_revision="1.3" dc_uuid="1f7cf7cc-7d90-43fb-a7bd-f13fb5c203c1" epoch="6" generated="true" have_quorum="true" ignore_dtd="false" num_peers="2" num_updates="288">
  <configuration>
    <!-- Cluster-wide options, held as name/value pairs in the single
         property set "cib-bootstrap-options". -->
    <crm_config>
      <cluster_property_set id="cib-bootstrap-options">
        <attributes>
          <nvpair id="cib-bootstrap-options-short-resource-names" name="short-resource-names" value="true" />
          <nvpair id="cib-bootstrap-options-startup-fencing" name="startup-fencing" value="true" />
          <!-- STONITH is switched off, although startup-fencing is true and a
               stonith-action of "reboot" is configured a few lines below. -->
          <nvpair id="cib-bootstrap-options-stonith-enabled" name="stonith-enabled" value="false" />
          <nvpair id="cib-bootstrap-options-symmetric-cluster" name="symmetric-cluster" value="true" />
          <nvpair id="cib-bootstrap-options-stop-orphan-actions" name="stop-orphan-actions" value="true" />
          <nvpair id="cib-bootstrap-options-stonith-action" name="stonith-action" value="reboot" />
          <nvpair id="cib-bootstrap-options-is-managed-default" name="is-managed-default" value="true" />
          <nvpair id="cib-bootstrap-options-stop-orphan-resources" name="stop-orphan-resources" value="true" />
          <nvpair id="cib-bootstrap-options-no-quorum-policy" name="no-quorum-policy" value="stop" />
          <!-- Default failure stickiness is -100; default stickiness (set below) is 0.
               NOTE(review): presumably intended to make a failed resource prefer the
               other node. Confirm how this interacts with the INFINITY colocation
               constraints in the constraints section. -->
          <nvpair id="cib-bootstrap-options-default-resource-failure-stickiness" name="default-resource-failure-stickiness" value="-100" />
          <nvpair id="cib-bootstrap-options-probe_complete" name="probe_complete" value="true" />
          <nvpair id="cib-bootstrap-options-transition-idle-timeout" name="transition-idle-timeout" value="60s" />
          <nvpair id="cib-bootstrap-options-default-resource-stickiness" name="default-resource-stickiness" value="0" />
        </attributes>
      </cluster_property_set>
    </crm_config>
    <!-- The two cluster members, redun2 and redun1. The id of redun2 is the
         same value as dc_uuid on the cib root element.
         NOTE(review): so redun2 was presumably the DC when this dump was taken. -->
    <nodes>
      <node id="1f7cf7cc-7d90-43fb-a7bd-f13fb5c203c1" type="normal" uname="redun2" />
      <node id="cd34ed93-fa5d-4092-a618-ac6349351d13" type="normal" uname="redun1" />
    </nodes>
    <resources>
      <!-- Master/slave set wrapping the heartbeat "drbd" OCF primitive for the
           DRBD resource "sip-shared". The filesystem resource is tied to the
           master role of this set by the first colocation constraint. -->
      <master_slave id="shared_storage">
        <meta_attributes id="ma_shared_storage">
          <attributes>
            <!-- Configured limits: clone_max=2, clone_node_max=1, master_max=1,
                 master_node_max=1.
                 NOTE(review): presumably one instance per node with a single
                 master cluster-wide; confirm against the master_slave docs. -->
            <nvpair id="shared_storage_ma_master_node_max" name="master_node_max" value="1" />
            <nvpair id="shared_storage_ma_master_max" name="master_max" value="1" />
            <nvpair id="shared_storage_ma_notify" name="notify" value="yes" />
            <nvpair id="shared_storage_ma_clone_max" name="clone_max" value="2" />
            <nvpair id="shared_storage_ma_clone_node_max" name="clone_node_max" value="1" />
            <nvpair id="shared_storage_ma_globally_unique" name="globally_unique" value="false" />
          </attributes>
        </meta_attributes>
        <primitive class="ocf" id="prim_shared_storage" provider="heartbeat" type="drbd">
          <operations>
            <!-- The only operation defined: a monitor every 120s with a 60s timeout. -->
            <op id="shared_storage_op_monitor" interval="120s" name="monitor" timeout="60s" />
          </operations>
          <instance_attributes id="ias_prim_shared_storage">
            <attributes>
              <nvpair id="shared_storage_attr_drbd_resource" name="drbd_resource" value="sip-shared" />
            </attributes>
          </instance_attributes>
        </primitive>
      </master_slave>
      <!-- Filesystem resource (heartbeat OCF agent "Filesystem"): device
           /dev/drbd0, directory /shared, fstype reiserfs. -->
      <primitive class="ocf" id="shared_filesystem" provider="heartbeat" type="Filesystem">
        <operations>
          <op id="shared_filesystem_op_monitor" interval="120s" name="monitor" timeout="60s" />
          <!-- The start operation carries a 10s start_delay in addition to its 60s timeout. -->
          <op id="shared_filesystem_op_start" name="start" start_delay="10s" timeout="60s" />
        </operations>
        <instance_attributes id="ias_shared_filesystem">
          <attributes>
            <nvpair id="shared_filesystem_attr_device" name="device" value="/dev/drbd0" />
            <nvpair id="shared_filesystem_attr_directory" name="directory" value="/shared" />
            <nvpair id="shared_filesystem_attr_fstype" name="fstype" value="reiserfs" />
          </attributes>
        </instance_attributes>
      </primitive>
      <!-- drbdlinks resource, declared with class "heartbeat" and no instance
           attributes. Four of the service resources are colocated with and
           ordered relative to this resource in the constraints section.
           NOTE(review): provider is set although the class is "heartbeat"
           rather than "ocf"; presumably it is ignored for this class. Confirm. -->
      <primitive class="heartbeat" id="drbdlinks" provider="heartbeat" type="drbdlinks">
        <operations>
          <op id="drbdlinks_op_monitor" interval="30s" name="monitor" timeout="30s" />
        </operations>
        <instance_attributes id="ias_drbdlinks">
          <attributes />
        </instance_attributes>
      </primitive>
      <!-- Seven LSB-class service resources: postgresql, exim, inetd, dhcpd,
           atftpd, apache and cups. Each defines a monitor operation with a 30s
           interval and 30s timeout and has an empty attribute set. Only
           postgresql also defines a start operation (120s timeout). -->
      <primitive class="lsb" id="postgresql" type="postgresql">
        <operations>
          <op id="postgresql_op_monitor" interval="30s" name="monitor" timeout="30s" />
          <op id="postgresql_op_start" name="start" timeout="120s" />
        </operations>
        <instance_attributes id="ias_postgresql">
          <attributes />
        </instance_attributes>
      </primitive>
      <primitive class="lsb" id="exim" type="exim">
        <operations>
          <op id="exim_op_monitor" interval="30s" name="monitor" timeout="30s" />
        </operations>
        <instance_attributes id="ias_exim">
          <attributes />
        </instance_attributes>
      </primitive>
      <primitive class="lsb" id="inetd" type="inetd">
        <operations>
          <op id="inetd_op_monitor" interval="30s" name="monitor" timeout="30s" />
        </operations>
        <instance_attributes id="ias_inetd">
          <attributes />
        </instance_attributes>
      </primitive>
      <primitive class="lsb" id="dhcpd" type="dhcpd">
        <operations>
          <op id="dhcpd_op_monitor" interval="30s" name="monitor" timeout="30s" />
        </operations>
        <instance_attributes id="ias_dhcpd">
          <attributes />
        </instance_attributes>
      </primitive>
      <primitive class="lsb" id="atftpd" type="atftpd">
        <operations>
          <op id="atftpd_op_monitor" interval="30s" name="monitor" timeout="30s" />
        </operations>
        <instance_attributes id="ias_atftpd">
          <attributes />
        </instance_attributes>
      </primitive>
      <primitive class="lsb" id="apache" type="apache">
        <operations>
          <op id="apache_op_monitor" interval="30s" name="monitor" timeout="30s" />
        </operations>
        <instance_attributes id="ias_apache">
          <attributes />
        </instance_attributes>
      </primitive>
      <primitive class="lsb" id="cups" type="cups">
        <operations>
          <op id="cups_op_monitor" interval="30s" name="monitor" timeout="30s" />
        </operations>
        <instance_attributes id="ias_cups">
          <attributes />
        </instance_attributes>
      </primitive>
    </resources>
    <!-- Placement and ordering constraints. Every colocation uses score INFINITY.
         Chain as written:
           shared_filesystem : colocated with shared_storage (to_role master),
                               ordered against its promote action
           drbdlinks         : colocated with and ordered against shared_filesystem
           postgresql, exim, atftpd, apache
                             : colocated with and ordered against drbdlinks
           inetd, dhcpd, cups: colocated with shared_filesystem, no rsc_order
         NOTE(review): inetd, dhcpd and cups have no ordering constraint, unlike
         the other services. Confirm whether that omission is intentional.
         NOTE(review): every colocation points from the dependent service towards
         drbdlinks or shared_filesystem. Presumably a failed dependent therefore
         cannot pull the storage stack over to the other node, which may explain
         the missing failover described in the message. Confirm against the
         Heartbeat 2 colocation semantics. -->
    <constraints>
      <rsc_colocation from="shared_filesystem" id="colocation_shared_filesystem_shared_storage:master:master" score="INFINITY" to="shared_storage" to_role="master" />
      <rsc_order from="shared_filesystem" id="order_shared_filesystem_after_shared_storage:promote" to="shared_storage" to_action="promote" />
      <rsc_colocation from="drbdlinks" id="colocation_drbdlinks_shared_filesystem" score="INFINITY" to="shared_filesystem" />
      <rsc_order from="drbdlinks" id="order_drbdlinks_after_shared_filesystem" to="shared_filesystem" />
      <rsc_colocation from="postgresql" id="colocation_postgresql_drbdlinks" score="INFINITY" to="drbdlinks" />
      <rsc_order from="postgresql" id="order_postgresql_after_drbdlinks" to="drbdlinks" />
      <rsc_colocation from="exim" id="colocation_exim_drbdlinks" score="INFINITY" to="drbdlinks" />
      <rsc_order from="exim" id="order_exim_after_drbdlinks" to="drbdlinks" />
      <rsc_colocation from="inetd" id="colocation_inetd_shared_filesystem" score="INFINITY" to="shared_filesystem" />
      <rsc_colocation from="dhcpd" id="colocation_dhcpd_shared_filesystem" score="INFINITY" to="shared_filesystem" />
      <rsc_colocation from="atftpd" id="colocation_atftpd_drbdlinks" score="INFINITY" to="drbdlinks" />
      <rsc_order from="atftpd" id="order_atftpd_after_drbdlinks" to="drbdlinks" />
      <rsc_colocation from="apache" id="colocation_apache_drbdlinks" score="INFINITY" to="drbdlinks" />
      <rsc_order from="apache" id="order_apache_after_drbdlinks" to="drbdlinks" />
      <rsc_colocation from="cups" id="colocation_cups_shared_filesystem" score="INFINITY" to="shared_filesystem" />
    </constraints>
  </configuration>
</cib>

_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems

[Linux-HA] 2.1.2 and failover of colocated resources

Reply via email to