Re: [Pacemaker] [PATCH 1/8 ] cl_log: Make functions static and remove CircularBuffer

2010-09-24 Thread Dejan Muhamedagic
Hi Bernd,

On Thu, Sep 16, 2010 at 12:11:39AM +0200, Bernd Schubert wrote:
 cl_log: Make functions static and remove CircularBuffer
 
 CircularBuffer was added more than 5 years ago and still it is not used.
 So remove dead code, it can be retrieved from the repository history
 if required.
 Also make functions static that are only used within cl_log.c.

Really can't go with more than one mutually independent change
in a patch.  So, this needs to be split into two (at least).
There's also some funny, and unrelated, change in cl_opensyslog.
Please make sure that the changes are minimal.
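
One possible way to split a combined patch like this into two commits, sketched
with git on an imported tree (the upstream repository is Mercurial, where the
record/mq extensions offer the same hunk-by-hunk selection; the file name
combined.patch is only a placeholder):

  # apply the combined change to a clean working tree
  git checkout -b cl_log-cleanup
  git apply combined.patch
  # stage only the hunks that remove the CircularBuffer code and commit them
  git add -p include/clplumbing/cl_log.h lib/clplumbing/cl_log.c
  git commit -m "cl_log: remove unused CircularBuffer code"
  # everything left over is the make-functions-static change
  git add -u
  git commit -m "cl_log: make functions only used within cl_log.c static"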

Cheers,

Dejan

 Signed-off-by: Bernd Schubert bschub...@ddn.com
 
 diff --git a/include/clplumbing/cl_log.h b/include/clplumbing/cl_log.h
 --- a/include/clplumbing/cl_log.h
 +++ b/include/clplumbing/cl_log.h
 @@ -39,7 +39,6 @@ void cl_log(int priority, con
  voidcl_perror(const char * fmt, ...) G_GNUC_PRINTF(1,2);
  void cl_log_enable_stderr(int truefalse);
  void cl_log_enable_stdout(int truefalse);
 -int  cl_set_logging_wqueue_maxlen(int);
  gboolean cl_log_test_logd(void);
  void cl_log_set_uselogd(int truefalse);
  void cl_log_enable_syslog_filefmt(int truefalse);
 @@ -48,7 +47,6 @@ voidcl_log_set_facility(int facility);
  void cl_log_set_entity(const char *  entity);
  void cl_log_set_logfile(const char * path);
  void cl_log_set_debugfile(const char * path);
 -void inherit_compress(void);
  void cl_inherit_logging_environment(int maxqlen);
 int  cl_log_set_logd_channel_source( void (*create_callback)(struct IPC_CHANNEL* chan),
  GDestroyNotify destroy_callback);
 @@ -64,31 +62,4 @@ void   cl_flush_logs(void);
  void cl_log_args(int argc, char **argv);
  int cl_log_is_logd_fd(int fd);
  
 -
 -typedef struct CircularBuffer_s 
 -{
 - const char* name;
 - size_t  size;
 - gbooleanempty_after_dump;
 - GQueue* queue;
 - 
 -} CircularBuffer_t;
 -
 -typedef struct CircularBufferEntry_s 
 -{
 - int level;
 - char *buf;
 - 
 -} CircularBufferEntry_t;
 -
 -CircularBuffer_t *NewCircularBuffer(
 - const char *name, unsigned int size, gboolean empty_after_dump);
 -void LogToCircularBuffer(
 - CircularBuffer_t *buffer, int level, const char *fmt, ...) G_GNUC_PRINTF(3,4);
 -
 -void EmptyCircularBuffer(CircularBuffer_t *buffer);
 -
 -/* the prototype is designed to be easy to give to G_main_add_SignalHandler() */
 -gboolean DumpCircularBuffer(int nsig, gpointer buffer);
 -
  #endif
 diff --git a/lib/clplumbing/cl_log.c b/lib/clplumbing/cl_log.c
 --- a/lib/clplumbing/cl_log.c
 +++ b/lib/clplumbing/cl_log.c
 @@ -69,12 +69,12 @@ static gboolean   syslogformatfile = TRUE
  int LogToDaemon(int priority, const char * buf, int bstrlen, gboolean use_pri_str);
  
  static int LogToLoggingDaemon(int priority, const char * buf, int bstrlen, gboolean use_pri_str);
 -IPC_Message* ChildLogIPCMessage(int priority, const char *buf, int bstrlen, 
 +static IPC_Message* ChildLogIPCMessage(int priority, const char *buf, int bstrlen,
   gboolean use_priority_str, IPC_Channel* ch);
 -void FreeChildLogIPCMessage(IPC_Message* msg);
 -gboolean send_dropped_message(gboolean use_pri_str, IPC_Channel *chan);
 +static void  FreeChildLogIPCMessage(IPC_Message* msg);
 +static gboolean send_dropped_message(gboolean use_pri_str, IPC_Channel *chan);
 +static int cl_set_logging_wqueue_maxlen(int qlen);
  
 -const char * prio2str(int priority);
  static int   use_logging_daemon =  FALSE;
  static int   conn_logd_time = 0;
  static char  cl_log_entity[MAXENTITY]= DFLT_ENTITY;
 @@ -173,7 +173,7 @@ cl_log_set_logdtime(int logdtime)
  #define TRADITIONAL_COMPRESSION HA_traditional_compression
  #define COMPRESSION   HA_compression
  
 -void
 +static void
  inherit_compress(void)
  {
   char* inherit_env = NULL;
 @@ -366,6 +366,17 @@ cl_log_test_logd(void)
   
  }
  
 +static void
 +cl_opensyslog(void)
 +{
 + if (*cl_log_entity == '\0' || cl_log_facility < 0) {
 + return;
 + }
 + strncpy(common_log_entity, cl_log_entity, MAXENTITY);
 + openlog(common_log_entity, LOG_CONS, cl_log_facility);
 + syslog_enabled = 1;
 +}
 +
  /* FIXME: This is way too ugly to bear */
  
  void
 @@ -445,7 +456,7 @@ cl_log_set_logd_channel_source( void (*c
   return 0;
  }
  
 -const char *
 +static const char *
  prio2str(int priority)
  {
   static const char *log_prio[8] = {
 @@ -734,7 +745,7 @@ ha_timestamp(TIME_T t)
  }
  
  
 -int
 +static int
  cl_set_logging_wqueue_maxlen(int qlen)
  {
   int sendrc;
 @@ -897,7 +908,7 @@ LogToLoggingDaemon(int priority, const c
  }
  
  
 -gboolean
 +static gboolean
  send_dropped_message(gboolean use_pri_str, IPC_Channel *chan)
  {
   int sendrc;
 @@ -924,27 +935,7 @@ 

Re: [Pacemaker] Timeout after nodejoin

2010-09-24 Thread Dan Frincu

Hi,

Steven Dake wrote:

On 09/22/2010 05:43 AM, Dan Frincu wrote:

Hi all,

I have the following packages:

# rpm -qa | grep -Ei "(openais|cluster|heartbeat|pacemaker|resource)"
openais-0.80.5-15.2
cluster-glue-1.0-12.2
pacemaker-1.0.5-4.2
cluster-glue-libs-1.0-12.2
resource-agents-1.0-31.5
pacemaker-libs-1.0.5-4.2
pacemaker-mgmt-1.99.2-7.2
libopenais2-0.80.5-15.2
heartbeat-3.0.0-33.3
pacemaker-mgmt-client-1.99.2-7.2

When I start openais, I get nodejoin immediately, as seen in the logs
below. However, it takes some time before the nodes are visible in
crm_mon output. Any idea how to minimize this delay?

Sep 22 15:27:24 bench1 openais[12935]: [crm ] info:
send_member_notification: Sending membership update 8 to 1 children
Sep 22 15:27:24 bench1 openais[12935]: [CLM ] got nodejoin message
192.168.165.33
Sep 22 15:27:24 bench1 openais[12935]: [CLM ] got nodejoin message
192.168.165.35
Sep 22 15:27:24 bench1 mgmtd: [12947]: info: Started.
Sep 22 15:27:24 bench1 openais[12935]: [crm ] WARN: route_ais_message:
Sending message to local.crmd failed: unknown (rc=-2)
Sep 22 15:27:24 bench1 openais[12935]: [crm ] WARN: route_ais_message:
Sending message to local.crmd failed: unknown (rc=-2)
Sep 22 15:27:24 bench1 openais[12935]: [crm ] info: pcmk_ipc: Recorded
connection 0x174840d0 for crmd/12946
Sep 22 15:27:24 bench1 openais[12935]: [crm ] info: pcmk_ipc: Sending
membership update 8 to crmd
Sep 22 15:27:24 bench1 openais[12935]: [crm ] info:
update_expected_votes: Expected quorum votes 1024 -> 2
Sep 22 15:27:25 bench1 crmd: [12946]: notice: ais_dispatch: Membership
8: quorum aquired
Sep 22 15:28:15 bench1 crmd: [12946]: info: do_election_count_vote:
Election 2 (owner: bench2) pass: vote from bench2 (Host name)
Sep 22 15:28:15 bench1 crmd: [12946]: info: do_state_transition: State
transition S_PENDING -> S_ELECTION [ input=I_ELECTION
cause=C_FSA_INTERNAL origin=do_election_count_vote ]
Sep 22 15:28:15 bench1 crmd: [12946]: info: do_state_transition: State
transition S_ELECTION -> S_INTEGRATION [ input=I_ELECTION_DC
cause=C_FSA_INTERNAL origin=do_election_check ]
Sep 22 15:28:15 bench1 crmd: [12946]: info: do_te_control: Registering
TE UUID: 87c28ab8-ba93-4111-a26a-67e88dd927fb
Sep 22 15:28:15 bench1 crmd: [12946]: WARN:
cib_client_add_notify_callback: Callback already present
Sep 22 15:28:15 bench1 crmd: [12946]: info: set_graph_functions: Setting
custom graph functions
Sep 22 15:28:15 bench1 crmd: [12946]: info: unpack_graph: Unpacked
transition -1: 0 actions in 0 synapses
Sep 22 15:28:15 bench1 crmd: [12946]: info: do_dc_takeover: Taking over
DC status for this partition
Sep 22 15:28:15 bench1 cib: [12942]: info: cib_process_readwrite: We are
now in R/W mode

Regards,

Dan



Where did you get that version of openais?  openais 0.80.x is 
deprecated in the community (and hence, no support).  We recommend 
using corosync instead which has improved testing with pacemaker.


From the SUSE repositories for Red Hat, last year, when we began working 
with this cluster stack. I also pushed for corosync going forward, for obvious 
reasons; however, for existing installations an upgrade is an option that 
will require some testing, because the platforms cannot be taken offline.


Anyway, thank you all for your input. I've done some research, and 
fiddling with the dc-timeout parameter did the trick.
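
The parameter in question is presumably the dc-deadtime cluster property, which
controls how long the crmd waits for contact from other nodes at startup;
something along these lines with the crm shell (the value is purely illustrative):

  crm configure property dc-deadtime="20s"
  crm configure show | grep dc-deadtime    # confirm the property landed in the CIB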


Regards,

Dan  


--
Dan FRINCU
Systems Engineer
CCNA, RHCE
Streamwide Romania


___
Pacemaker mailing list: Pacemaker@oss.clusterlabs.org
http://oss.clusterlabs.org/mailman/listinfo/pacemaker

Project Home: http://www.clusterlabs.org
Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
Bugs: http://developerbugs.linux-foundation.org/enter_bug.cgi?product=Pacemaker


Re: [Pacemaker] About behavior in Action Lost.

2010-09-24 Thread Andrew Beekhof
Pushed as:
   http://hg.clusterlabs.org/pacemaker/1.1/rev/8433015faf18

Not sure about applying it to 1.0 though, it's a dramatic change in behavior.

On Wed, Sep 22, 2010 at 11:18 AM,  renayama19661...@ybb.ne.jp wrote:
 Hi Andrew,

 Thank you for comment.

 A long time ago in a galaxy far away, some messaging layers used to
 lose quite a few actions, including stops.
 About the same time, we decided that fencing because a stop action was
 lost wasn't a good idea.

 The rationale was that if the operation eventually completed, it would
 end up in the CIB anyway.
 And even if it didn't, the PE would continue to try the operation
 again until the whole node fell over at which point it would get shot
 anyway.

 Sorry...
 I did not know the fact that there was such an argument in old days.


 Now, having said that, things have improved since then and perhaps, in
 the interest of speeding up recovery in these situations, it is time
 to stop treating stop operations differently.
 Would you agree?

 That means, this time you will change it so that a lost stop action
 also carries out STONITH?
 If my recognition is right, I agree too.

 if(timer->action->type != action_type_rsc) {
 send_update = FALSE;
 } else if(safe_str_eq(task, "cancel")) {
 /* we don't need to update the CIB with these */
 send_update = FALSE;
 }
 --- delete the else if(safe_str_eq(task, "stop")){..} branch ?

 if(send_update) {
 /* cib_action_update(timer->action, LRM_OP_PENDING, EXECRA_STATUS_UNKNOWN); */
 cib_action_update(timer->action, LRM_OP_TIMEOUT, EXECRA_UNKNOWN_ERROR);
 }

 Best Regards,
 Hideo Yamauchi.

 --- Andrew Beekhof and...@beekhof.net wrote:

 On Tue, Sep 21, 2010 at 8:59 AM,  renayama19661...@ybb.ne.jp wrote:
  Hi,
 
  The node was in a state where the load was very high, and we confirmed the monitor 
  behaviour of Pacemaker.
  An Action Lost occurred for the stop operation after the monitor error 
  occurred.
 
  Sep  8 20:02:22 cgl54 crmd: [3507]: ERROR: print_elem: Aborting transition, action lost: [Action 9]:
  In-flight (id: prmApPostgreSQLDB1_stop_0, loc: cgl49, priority: 0)
  Sep  8 20:02:22 cgl54 crmd: [3507]: info: abort_transition_graph: action_timer_callback:486 - Triggered transition abort (complete=0) : Action lost
 
 
  Because of the load on the node, we think that the stop operation did not go well.
  But the node did not get fenced by STONITH.

 A long time ago in a galaxy far away, some messaging layers used to
 lose quite a few actions, including stops.
 About the same time, we decided that fencing because a stop action was
 lost wasn't a good idea.

 The rationale was that if the operation eventually completed, it would
 end up in the CIB anyway.
 And even if it didn't, the PE would continue to try the operation
 again until the whole node fell over at which point it would get shot
 anyway.

 Now, having said that, things have improved since then and perhaps, in
 the interest of speeding up recovery in these situations, it is time
 to stop treating stop operations differently.
 Would you agree?

___
Pacemaker mailing list: Pacemaker@oss.clusterlabs.org
http://oss.clusterlabs.org/mailman/listinfo/pacemaker

Project Home: http://www.clusterlabs.org
Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
Bugs: http://developerbugs.linux-foundation.org/enter_bug.cgi?product=Pacemaker


Re: [Pacemaker] Can somebody please explain pengine's urge to move all resources?

2010-09-24 Thread Andrew Beekhof
On Wed, Sep 22, 2010 at 12:37 PM, Raoul Bhatia [IPAX] r.bha...@ipax.at wrote:
 hi,

 i have a 2node cluster with drbd+nfs+webservices(clones)

 basically, i have some rules:

 1. promote drbd before starting fs+nfs-server (group_www_data)
 order drbd_before_group_www_data : ms_drbd_www:promote group_www_data:start

 2. start nfs-server (group_www_data) before nfsclient+apache
 (clone_webservice)
 order group_www_data_before_webservices : group_www_data:start 
 clone_webservice:start

 3. start ftp-server after everything is up:
 order fs_www_before_pure-ftpd 0: clone_webservice:start pure-ftpd:start
 order webservices_before_group_ftpd 0: clone_webservice:start 
 group_ftpd:start

 (actually, from what i see now, these two rules are redundant, right?)

well, one of them is. probably want the other one though :-)



 i also colocate clone_webservice (nfs client) with the ftp server, so
 that the ftp server can actually serve the user's folders ;)
 colocation colo_webservices_group_ftpd inf: group_ftpd 
 clone_webservice:Started


 crm status:
  Resource Group: group_www_data
      fs_www_data        (ocf::heartbeat:Filesystem):    Started wc01
      nfs-kernel-server  (lsb:nfs-kernel-server):        Started wc01
      intip_nfs  (ocf::heartbeat:IPaddr2):       Started wc01
      backupip_nfs       (ocf::heartbeat:IPaddr2):       Started wc01
 ...
  Master/Slave Set: ms_drbd_www
      Masters: [ wc02 ]
      Slaves: [ wc01 ]
  Clone Set: clone_nfs-common
      Started: [ wc01 wc02 ]
  Clone Set: clone_webservice
      Started: [ wc02 wc01 ]
  Resource Group: group_ftpd
      intip_ftp  (ocf::heartbeat:IPaddr2):       Started wc01
      pure-ftpd  (ocf::heartbeat:Pure-FTPd):     Started wc01


 now i want to move pure-ftpd from wc01 to wc02:
 crm resource migrate pure-ftpd

 imho, as clone_webservice is running on both wc01 and wc02, only
 group_ftpd should be stopped and (re-)started.

 but pengine thinks:
 Sep 22 11:24:06 wc01 pengine: [4083]: notice: LogActions: Move resource fs_www_data#011(Started wc01 -> wc02)
 Sep 22 11:24:06 wc01 pengine: [4083]: notice: LogActions: Move resource nfs-kernel-server#011(Started wc01 -> wc02)
 Sep 22 11:24:06 wc01 pengine: [4083]: notice: LogActions: Move resource intip_nfs#011(Started wc01 -> wc02)
 Sep 22 11:24:06 wc01 pengine: [4083]: notice: LogActions: Move resource backupip_nfs#011(Started wc01 -> wc02)

 can someone please explain the reason for that?

Probably a bug.
The good news is that 1.1.3 doesn't have that behavior.
Let's see how 1.0 goes once all the relevant patches have been backported.

 hb_report attached.

Note to self: pe-input-90.bz2 from wc01 is the relevant test file.
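
Such a PE input can be replayed offline to see the planned actions and
allocation scores, roughly like this with ptest from pacemaker 1.0 (check
ptest --help on the installed version for the exact options):

  bunzip2 -k pe-input-90.bz2      # the file mentioned above, copied from wc01
  ptest -x pe-input-90 -s -VV     # -s prints allocation scores, -V raises verbosity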

 thanks,
 raoul
 --
 
 DI (FH) Raoul Bhatia M.Sc.          email.          r.bha...@ipax.at
 Technischer Leiter

 IPAX - Aloy Bhatia Hava OG          web.          http://www.ipax.at
 Barawitzkagasse 10/2/2/11           email.            off...@ipax.at
 1190 Wien                           tel.               +43 1 3670030
 FN 277995t HG Wien                  fax.            +43 1 3670030 15
 

___
Pacemaker mailing list: Pacemaker@oss.clusterlabs.org
http://oss.clusterlabs.org/mailman/listinfo/pacemaker

Project Home: http://www.clusterlabs.org
Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
Bugs: http://developerbugs.linux-foundation.org/enter_bug.cgi?product=Pacemaker


Re: [Pacemaker] Release Matrix

2010-09-24 Thread Andrew Beekhof
On Wed, Sep 22, 2010 at 12:20 PM, Raoul Bhatia [IPAX] r.bha...@ipax.at wrote:
 hi,

 regarding the Release Matrix [1] and the ABI-change in cluster-glue/
 clplumbing [2], i wonder if pacemaker 1.0.9.1 really works with
 glue 1.0.3?

A _B_ I
It works with whatever version it was built against :-)
rpm/yum will make sure that version is installed.
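
One way to see what the installed packages were actually built against, using
rpm (package names taken from the list earlier in this digest; the grep pattern
is only a guess at how the auto-generated library dependency is named):

  rpm -q --requires pacemaker pacemaker-libs | grep -i plumb
  rpm -q cluster-glue cluster-glue-libs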


 cheers,
 raoul

 [1] http://www.clusterlabs.org/wiki/ReleaseMatrix
 [2] http://www.gossamer-threads.com/lists/linuxha/pacemaker/65443
 --
 
 DI (FH) Raoul Bhatia M.Sc.          email.          r.bha...@ipax.at
 Technischer Leiter

 IPAX - Aloy Bhatia Hava OG          web.          http://www.ipax.at
 Barawitzkagasse 10/2/2/11           email.            off...@ipax.at
 1190 Wien                           tel.               +43 1 3670030
 FN 277995t HG Wien                  fax.            +43 1 3670030 15
 

___
Pacemaker mailing list: Pacemaker@oss.clusterlabs.org
http://oss.clusterlabs.org/mailman/listinfo/pacemaker

Project Home: http://www.clusterlabs.org
Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
Bugs: http://developerbugs.linux-foundation.org/enter_bug.cgi?product=Pacemaker


[Pacemaker] target-role default value

2010-09-24 Thread Pavlos Parissis
Hi,

What is the default value for target-role in resource?
I tried to query it with crm_resource but without success.
 crm_resource pbx_02 --get-property target-role
crm_resource pbx_02 --get-parameter target-role --meta


Cheers,
Pavlos
___
Pacemaker mailing list: Pacemaker@oss.clusterlabs.org
http://oss.clusterlabs.org/mailman/listinfo/pacemaker

Project Home: http://www.clusterlabs.org
Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
Bugs: http://developerbugs.linux-foundation.org/enter_bug.cgi?product=Pacemaker


Re: [Pacemaker] target-role default value

2010-09-24 Thread Pavlos Parissis
On 24 September 2010 11:40, Michael Schhwartzkopff mi...@clusterbau.com wrote:

 On Friday 24 September 2010 11:34:11 Pavlos Parissis wrote:
  Hi,
 
  What is the default value for target-role in resource?
  I tried to query it with crm_resource but without success.
   crm_resource pbx_02 --get-property target-role
  crm_resource pbx_02 --get-parameter target-role --meta
 
 
  Cheers,
  Pavlos

 started


thanks.
How do I get default values for parameters which are not set?

Thanks again,
Pavlos
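
An unset meta-attribute is simply absent from the CIB, so crm_resource has
nothing to return; the built-in defaults (target-role: Started) are documented
in Pacemaker Explained. Making the value explicit and querying it could look
roughly like this with the crm shell (resource name from the thread):

  crm resource meta pbx_02 set target-role Started
  crm resource meta pbx_02 show target-role
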
___
Pacemaker mailing list: Pacemaker@oss.clusterlabs.org
http://oss.clusterlabs.org/mailman/listinfo/pacemaker

Project Home: http://www.clusterlabs.org
Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
Bugs: http://developerbugs.linux-foundation.org/enter_bug.cgi?product=Pacemaker


Re: [Pacemaker] default timeout for op start/stop

2010-09-24 Thread Michael Schhwartzkopff
On Friday 24 September 2010 13:50:49 Pavlos Parissis wrote:
 Hi,
 
 When I verify my conf I get complaints about the timeout on the start and stop
 operations
 crm(live)configure# verify
 WARNING: drbd_01: default timeout 20s for start is smaller than the advised
 240
 WARNING: drbd_01: default timeout 20s for stop is smaller than the advised
 100
 WARNING: drbd_02: default timeout 20s for start is smaller than the advised
 240
 WARNING: drbd_02: default timeout 20s for stop is smaller than the advised
 100
 
 Since I don't specifically set a timeout for the mentioned resources, I
 thought this 20s was coming from the defaults.
 So, I queried the defaults and got the following
 [r...@node-03 ~]# crm_attribute --type op_defaults --name timeout
 scope=op_defaults  name=timeout value=(null)
 
 So, I am wondering where this 20s is coming from.
 
 I had the same issue for IP and Filesystem type resources and in order to
 get rid of the warning I specifically set it to be 60s.
 
 Regards,
 Pavlos
 
 
 [r...@node-03 ~]# crm configure show
 node $id=b8ad13a6-8a6e-4304-a4a1-8f69fa735100 node-02
 node $id=d5557037-cf8f-49b7-95f5-c264927a0c76 node-01
 node $id=e5195d6b-ed14-4bb3-92d3-9105543f9251 node-03
 primitive drbd_01 ocf:linbit:drbd \
 params drbd_resource=drbd_pbx_service_1 \
 op monitor interval=30s
 primitive drbd_02 ocf:linbit:drbd \
 params drbd_resource=drbd_pbx_service_2 \
 op monitor interval=30s
 primitive fs_01 ocf:heartbeat:Filesystem \
 params device=/dev/drbd1 directory=/pbx_service_01
 fstype=ext3 \
 meta migration-threshold=3 failure-timeout=60 \
 op monitor interval=20s timeout=40s OCF_CHECK_LEVEL=20 \
 op start interval=0 timeout=60s \
 op stop interval=0 timeout=60s
 primitive fs_02 ocf:heartbeat:Filesystem \
 params device=/dev/drbd2 directory=/pbx_service_02
 fstype=ext3 \
 meta migration-threshold=3 failure-timeout=60 \
 op monitor interval=20s timeout=40s OCF_CHECK_LEVEL=20 \
 op start interval=0 timeout=60s \
 op stop interval=0 timeout=60s
 primitive ip_01 ocf:heartbeat:IPaddr2 \
 params ip=10.10.10.10 cidr_netmask=25 broadcast=10.10.10.127 \
 meta failure-timeout=120 migration-threshold=3 \
 op monitor interval=5s
 primitive ip_02 ocf:heartbeat:IPaddr2 \
 params ip=10.10.10.11 cidr_netmask=25 broadcast=10.10.10.127 \
 op monitor interval=5s
 primitive pbx_01 ocf:heartbeat:Dummy \
 params state=/pbx_service_01/Dummy.state \
 meta failure-timeout=60 migration-threshold=3 \
 op monitor interval=20s timeout=40s
 primitive pbx_02 ocf:heartbeat:Dummy \
 params state=/pbx_service_02/Dummy.state \
 meta failure-timeout=60 migration-threshold=3
 group pbx_service_01 ip_01 fs_01 pbx_01 \
 meta target-role=Started
 group pbx_service_02 ip_02 fs_02 pbx_02 \
 meta target-role=Started
 ms ms-drbd_01 drbd_01 \
 meta master-max=1 master-node-max=1 clone-max=2
 clone-node-max=1 notify=true
 ms ms-drbd_02 drbd_02 \
 meta master-max=1 master-node-max=1 clone-max=2
 clone-node-max=1 notify=true target-role=Started
 location PrimaryNode-drbd_01 ms-drbd_01 100: node-01
 location PrimaryNode-drbd_02 ms-drbd_02 100: node-02
 location PrimaryNode-pbx_service_01 pbx_service_01 200: node-01
 location PrimaryNode-pbx_service_02 pbx_service_02 200: node-02
 location SecondaryNode-drbd_01 ms-drbd_01 0: node-03
 location SecondaryNode-drbd_02 ms-drbd_02 0: node-03
 location SecondaryNode-pbx_service_01 pbx_service_01 10: node-03
 location SecondaryNode-pbx_service_02 pbx_service_02 10: node-03
 colocation fs_01-on-drbd_01 inf: fs_01 ms-drbd_01:Master
 colocation fs_02-on-drbd_02 inf: fs_02 ms-drbd_02:Master
 colocation pbx_01-with-fs_01 inf: pbx_01 fs_01
 colocation pbx_01-with-ip_01 inf: pbx_01 ip_01
 colocation pbx_02-with-fs_02 inf: pbx_02 fs_02
 colocation pbx_02-with-ip_02 inf: pbx_02 ip_02
 order fs_01-after-drbd_01 inf: ms-drbd_01:promote fs_01:start
 order fs_02-after-drbd_02 inf: ms-drbd_02:promote fs_02:start
 order pbx_01-after-fs_01 inf: fs_01 pbx_01
 order pbx_01-after-ip_01 inf: ip_01 pbx_01
 order pbx_02-after-fs_02 inf: fs_02 pbx_02
 order pbx_02-after-ip_02 inf: ip_02 pbx_02
 property $id=cib-bootstrap-options \
 dc-version=1.0.9-89bd754939df5150de7cd76835f98fe90851b677 \
 cluster-infrastructure=Heartbeat \
 stonith-enabled=false \
 symmetric-cluster=false \
 last-lrm-refresh=1285323745
 rsc_defaults $id=rsc-options \
 resource-stickiness=1000

The default timeout is coded into the resource agent. You can safely ignore the 
WARNINGs. These are also removed in more recent versions of pacemaker.
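
To silence the warnings, the advised values can be read from the agent metadata
and then set explicitly on each operation; roughly like this with the crm shell
(resource and agent names from the configuration above):

  crm ra info ocf:linbit:drbd     # lists the operations and their advised timeouts
  crm configure edit drbd_01      # then add, for example:
  #   op start interval=0 timeout=240s \
  #   op stop interval=0 timeout=100s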


-- 
Dr. Michael Schwartzkopff
Guardinistr. 63
81375 München

Tel: (0163) 172 50 98

___
Pacemaker mailing list: Pacemaker@oss.clusterlabs.org
http://oss.clusterlabs.org/mailman/listinfo/pacemaker

Project Home: 

Re: [Pacemaker] default timeout for op start/stop

2010-09-24 Thread Pavlos Parissis
On 24 September 2010 13:54, Michael Schhwartzkopff mi...@clusterbau.com wrote:

 On Friday 24 September 2010 13:50:49 Pavlos Parissis wrote:
  Hi,
 
  When I verify my conf I get complains about the timeout on start and stop
  operation
  crm(live)configure# verify
  WARNING: drbd_01: default timeout 20s for start is smaller than the
 advised
  240
  WARNING: drbd_01: default timeout 20s for stop is smaller than the
 advised
  100
  WARNING: drbd_02: default timeout 20s for start is smaller than the
 advised
  240
  WARNING: drbd_02: default timeout 20s for stop is smaller than the
 advised
  100
 
  Since I don't specifically set timeout for the mentioned resources I
  thought this 20s is coming from the defaults.
  So, I queried the defaults and got the following
  [r...@node-03 ~]# crm_attribute --type op_defaults --name timeout
  scope=op_defaults  name=timeout value=(null)
 
 

 Default timeout is coded into the resource agent. You safely can ignore the
 WARNINGs. These are also removed from more recent versions of pacemaker.

 thanks again
Pavlos
___
Pacemaker mailing list: Pacemaker@oss.clusterlabs.org
http://oss.clusterlabs.org/mailman/listinfo/pacemaker

Project Home: http://www.clusterlabs.org
Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
Bugs: http://developerbugs.linux-foundation.org/enter_bug.cgi?product=Pacemaker


Re: [Pacemaker] migration-threshold causing unnecessary restart of underlying resources

2010-09-24 Thread Cnut Jansen
On 12.08.2010 04:12, Cnut Jansen wrote:

 Basically I have a cluster of 2 nodes with cloned DLM-, O2CB-, DRBD-,
 mount-resources, and a MySQL-resource (grouped with an IPaddr-resource)
 running on top of the other ones.
 The MySQL(-group)-resource depends on the mount-resource, which depends
 on both, the DRBD- and the O2CB-resources equally, and the O2CB-resource
 depends on the DLM-resource.
 cloneDlm -> cloneO2cb ----\
                            }-> cloneMountMysql -> mysql / grpMysql( mysql -> ipMysql )
 msDrbdMysql --------------/
 Furthermore for the MySQL(-group)-resource I set meta-attributes
 migration-threshold=1 and failure-timeout=90 (later also tried
 settings 3 and 130 for these).
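
A rough reconstruction of constraints matching that description, in crm syntax
(resource ids from the post; the poster's actual configuration is not shown in
the thread):

  order o_dlm_before_o2cb inf: cloneDlm cloneO2cb
  order o_o2cb_before_mount inf: cloneO2cb cloneMountMysql
  order o_drbd_before_mount inf: msDrbdMysql:promote cloneMountMysql:start
  order o_mount_before_mysql inf: cloneMountMysql grpMysql
  colocation c_mount_on_drbd_master inf: cloneMountMysql msDrbdMysql:Master
  colocation c_mysql_with_mount inf: grpMysql cloneMountMysql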

 Now through a lot of testing I found out that:
 a) the stops/restarts of the underlying resources happen only when
 failcounter hits the limit set by migration-threshold; i.e. when set to
 3, on the first 2 failures only mysql/grpMysql is restarted on the same node,
 and only on the 3rd one are the underlying resources left in a mess (while
 mysql/grpMysql migrates) (reproducible for DRBD; unsure about
 DLM/O2CB-side, but there's sometimes hard trouble too after having
 picked on mysql; just couldn't definitively link it yet)
 b) upon causing mysql/grpMysql's migration, score for
 msDrbdMysql:promote changes from 10020 to -inf and stays there for the
 time of mysql/grpMysql's failure-timeout (proved with also setting to
 130), before it rises back up to 1
 c) msDrbdMysql remains slave until the next cluster-recheck after its
 promote-score went back up to 1
 d) I also have the impression that fail-counters don't get reset after
 their failure-timeout, because when migration-threshold=3 is set, upon
 every(!) subsequent picking-on those issues occur, even when I've waited
 for nearly 5 minutes (with failure-timeout=90) without touching the
 cluster
 
 I experienced this on both test-clusters, a SLES 11 HAE SP1 with
 Pacemaker 1.1.2, and a Debian Squeeze with Pacemaker 1.0.9. When
 migration-threshold for mysql/grpMysql is removed, everything is fine
 (except no migration of course). I can't remember such happening with
 SLES 11 HAE SP0's Pacemaker 1.0.6.

 p.s.: Just for fun / testing / proving I also constrained
 grpLdirector to cloneMountShared... and could perfectly reproduce that
 problem with its then underlying resources too.

For reference:
SLES11-HAE-SP1: Issues seem to be solved with latest officially released
packages (upgraded yesterday directly from Novell's repositories),
including Pacemaker version 1.1.2-0.6.1 (Arch: x86_64), shown
in crm_mon as 1.1.2-ecb1e2ea172ba2551f0bd763e557fccde68c849b. At
least so far I couldn't reproduce any unnecessary restart of underlying
resources (nor any other touching them at all), and fail-counters now -
after failure-timeout is over - get reset upon next cluster-recheck
(event- or interval-driven).
Debian Squeeze: Not tested again yet


___
Pacemaker mailing list: Pacemaker@oss.clusterlabs.org
http://oss.clusterlabs.org/mailman/listinfo/pacemaker

Project Home: http://www.clusterlabs.org
Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
Bugs: http://developerbugs.linux-foundation.org/enter_bug.cgi?product=Pacemaker


Re: [Pacemaker] Designated reaction of Pacemaker to monitor-op returning rc=7 (OCF_NOT_RUNNING)

2010-09-24 Thread Cnut Jansen
On 26.08.2010 10:38, Dejan Muhamedagic wrote:
 Hi,
 
 On Wed, Aug 25, 2010 at 08:56:08PM +0200, Cnut Jansen wrote:
 On 25.08.2010 16:00, Dejan Muhamedagic wrote:
 Hi,

 On Tue, Aug 24, 2010 at 05:19:23PM +0200, Cnut Jansen wrote:
 Hi,

 just (for now) a short question, to make sure I didn't miss anything:
 What's the designated reaction of Pacemaker when a resource agent,
 called to monitor a resource which is supposed to be running (and thus
 expected to return 0, OCF_SUCCESS), instead returns 7 (OCF_NOT_RUNNING)?
 Shall Pacemaker's very next call be to stop the resource, or shall
 it be yet another (or even several) monitor calls?

 It should be stop, followed by start, either on the same node or
 on another depending on the migration-threshold setting and
 failcount.
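
For illustration, a minimal sketch of the monitor convention being discussed,
assuming the OCF shell functions shipped with cluster-glue are sourced
(my_daemon is a placeholder; this is not code from the thread):

  #!/bin/sh
  . ${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs

  my_monitor() {
      # resource is running: report success, pacemaker leaves it alone
      if pgrep -f my_daemon >/dev/null 2>&1; then
          return $OCF_SUCCESS       # 0
      fi
      # resource is cleanly stopped: report 7; for a resource that is supposed
      # to be running, pacemaker should react with a stop followed by a start
      return $OCF_NOT_RUNNING       # 7
  }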

 Ok, that's what I expected.
 So there are no circumstances unknown to me where it is by
 design that Pacemaker - after having received rc=7 from the RA (and,
 since it adds a FAILED behind the resource in crm_mon, it obviously also
 understood it correctly) - calls the RA yet several more times for
 monitoring (while letting the rest of the cluster hang) before finally
 calling the desired stop, instead of immediately calling the RA to
 stop the resource and continuing with the pending transitions and migrations.
 
 Yes, that sounds quite unusual.

Just for reference:
Though I'm not absolutely sure about it, from today's point of view that
strange not-stopping-the-resource-after-rc=7 might have been a
symptom/combination of a quite sluggish cluster (Pacemaker still waiting
for returns of the RAs and/or of Pacemaker itself) and zombie monitor-ops
(since I only saw in my own RAs' output that they'd get called for the
monitor action, but not the id or similar of the monitor-op calling them).
Since yesterday, when we patched to the latest officially released
SLES11-HAE-SP1 packages, the zombie monitor-ops (as well as many other
problems) are gone (and only a few minor new ones so far (-;); and though
I haven't explicitly looked for it, I lately haven't seen such
ignoring of rc=7 and re-calling of monitor actions several times anymore.
(But lately I also - due to enhancements to my own RAs (Tomcat6/Apache)
- could remove the 15-second start-delays for the monitor op, which sped
them up a lot, so they are now only rarely the ones attracting
the zombie monitor-ops.)

Current version now is (SLES11-HAE-SP1): 1.1.2-0.6.1 (Arch: x86_64)
Displays in crm_mon as: 1.1.2-ecb1e2ea172ba2551f0bd763e557fccde68c849b


 (btw., jfyi: migration-thresholds are currently completely banned out of
 
 Why? Anything wrong with them?

See my other thread, the bugzilla filed and linked in there, and Andrew
Beekhof's note in bugzilla confirming that the fail-count issue is cleared
upstream.
http://developerbugs.linux-foundation.org/show_bug.cgi?id=2468

migration-threshold and failure-timeout seem to be fixed in this new,
current SLES-release too.


 my configurations, so this is another issue; I probably also might have
 yet another issue / possible bug regarding zombie-(monitor-)operations,
 with symptoms like those of an off-by-one error)

 Please file a bugzilla if you find a bug.

Though I had already collected dozens of hb_reports with
zombie monitor-operations occurring, and could quite exactly predict
such a zombie just from watching crm_mon while nodes switched to
standby, I haven't identified an exact cause for it yet (it turned
out at least not to show up as an ordinary off-by-one error; in the
beginning it often hit the resources controlled by my own RAs, which
were the ones starting last, but after having sped them up it rather
hit them the least), therefore I haven't filed anything about that yet.
Anyway, those zombie monitor-operations seem to be gone now too, so they
probably were just another long-resolved old-version bug, kept alive by the
very conservative update policies of enterprise distributions.


___
Pacemaker mailing list: Pacemaker@oss.clusterlabs.org
http://oss.clusterlabs.org/mailman/listinfo/pacemaker

Project Home: http://www.clusterlabs.org
Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
Bugs: http://developerbugs.linux-foundation.org/enter_bug.cgi?product=Pacemaker


[Pacemaker] cib

2010-09-24 Thread Shravan Mishra
Hi All,

We recently upgraded to

/usr/sbin/corosync -v
Corosync Cluster Engine, version '1.2.1' SVN revision '2723:2724'
Copyright (c) 2006-2009 Red Hat, Inc.

In my logs I see the following lines:

crmd[20612]: 2010/09/24_15:29:57 ERROR: crm_log_init_worker: Cannot
change active directory to /var/lib/heartbeat/cores/hacluster:
Permission denied (13)



cib is not coming up.


Others look ok.

ps -ef | grep heart
root 27797 27791  0 15:24 ?00:00:00 /usr/lib64/heartbeat/stonithd
nobody   27799 27791  0 15:24 ?00:00:00 /usr/lib64/heartbeat/lrmd
82   27801 27791  0 15:24 ?00:00:00 /usr/lib64/heartbeat/pengine
82   29064 27791  0 15:39 ?00:00:00 /usr/lib64/heartbeat/crmd


More info:


getent group haclient
haclient:x:101:
 getent passwd hacluster
hacluster:x:82:101:cluster user:/var/lib/heartbeat/cores/hacluster:/sbin/nologin


What could be the problem?

Thanks
Shravan
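
The error shows crmd, running as the hacluster user (uid 82), being unable to
chdir into its cores directory, so ownership or permissions on that path are the
most likely culprit; a first check and possible fix could look like this (paths
from the log; the commands are a guess, not from the thread):

  ls -ld /var/lib/heartbeat/cores /var/lib/heartbeat/cores/hacluster
  chown hacluster:haclient /var/lib/heartbeat/cores/hacluster
  chmod 750 /var/lib/heartbeat/cores/hacluster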

___
Pacemaker mailing list: Pacemaker@oss.clusterlabs.org
http://oss.clusterlabs.org/mailman/listinfo/pacemaker

Project Home: http://www.clusterlabs.org
Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
Bugs: http://developerbugs.linux-foundation.org/enter_bug.cgi?product=Pacemaker