This patch looks good but there are some poll apis used which are 
available via coroapi.

Regards
-steve

On 09/23/2010 06:55 PM, Angus Salkeld wrote:
> Signed-off-by: Angus Salkeld<[email protected]>
> ---
>   configure.ac                |   36 ++
>   corosync.spec.in            |   14 +
>   exec/Makefile.am            |    2 +-
>   exec/service.c              |   12 +
>   include/corosync/corodefs.h |    4 +-
>   services/Makefile.am        |    6 +
>   services/mon.c              |  635 ++++++++++++++++++++++++++++++++++++
>   services/wd.c               |  755 
> +++++++++++++++++++++++++++++++++++++++++++
>   8 files changed, 1462 insertions(+), 2 deletions(-)
>   create mode 100644 services/mon.c
>   create mode 100644 services/wd.c
>
> diff --git a/configure.ac b/configure.ac
> index b57fdd2..ad4b6c1 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -252,6 +252,16 @@ AC_ARG_ENABLE([rdma],
>       [ enable_rdma="no" ])
>   AM_CONDITIONAL(BUILD_RDMA, test x$enable_rdma = xyes)
>
> +AC_ARG_ENABLE([monitoring],
> +     [  --enable-monitoring              : resource monitoring ],,
> +     [ default="no" ])
> +AM_CONDITIONAL(BUILD_MONITORING, test x$enable_monitoring = xyes)
> +
> +AC_ARG_ENABLE([watchdog],
> +     [  --enable-watchdog                   : Watchdog support ],,
> +     [ edefault="no" ])
> +AM_CONDITIONAL(BUILD_WATCHDOG, test x$enable_watchdog = xyes)
> +
>   AC_ARG_ENABLE([augeas],
>       [  --enable-augeas                 : Install the augeas lens for 
> corosync.conf ],,
>       [ enable_augeas="no" ])
> @@ -277,6 +287,7 @@ AC_ARG_WITH([socket-dir],
>   # THIS SECTION MUST DIE!
>   CP=cp
>   OS_LDL="-ldl"
> +have_linux="no"
>   case "$host_os" in
>       *linux*)
>               AC_DEFINE_UNQUOTED([COROSYNC_LINUX], [1],
> @@ -286,6 +297,7 @@ case "$host_os" in
>               OS_LDFLAGS=""
>               OS_DYFLAGS="-rdynamic"
>               DARWIN_OPTS=""
> +             have_linux="yes"
>       ;;
>       darwin*)
>               AC_DEFINE_UNQUOTED([COROSYNC_DARWIN], [1],
> @@ -387,6 +399,30 @@ if test "x${enable_rdma}" = xyes; then
>       PACKAGE_FEATURES="$PACKAGE_FEATURES rdma"
>   fi
>
> +if test "x${enable_monitoring}" = xyes; then
> +
> +     AC_CHECK_LIB([statgrab], [sg_get_mem_stats], have_libstatgrab="yes", 
> have_libstatgrab="no")
> +
> +     if test "x${have_libstatgrab}" = xyes; then
> +             AC_DEFINE_UNQUOTED([HAVE_LIBSTATGRAB], 1, [have libstatgrab])
> +             statgrab_LIBS="-lstatgrab"
> +     else
> +             if test "x${have_linux}" = xno; then
> +                     AC_MSG_ERROR(monitoring requires libstatgrab on 
> non-linux systems)
> +             fi
> +     fi
> +     AC_SUBST([statgrab_LIBS])
> +     AC_DEFINE_UNQUOTED([HAVE_MONITORING], 1, [have resource monitoring])
> +     PACKAGE_FEATURES="$PACKAGE_FEATURES monitoring"
> +fi
> +
> +if test "x${enable_watchdog}" = xyes; then
> +     AC_CHECK_HEADER(linux/watchdog.h,,AC_MSG_ERROR(watchdog requires 
> linux/watchdog.h))
> +     AC_CHECK_HEADER(linux/reboot.h,,AC_MSG_ERROR(watchdog requires 
> linux/reboot.h))
> +     AC_DEFINE_UNQUOTED([HAVE_WATCHDOG], 1, [have watchdog])
> +     PACKAGE_FEATURES="$PACKAGE_FEATURES watchdog"
> +fi
> +
>   if test "x${enable_augeas}" = xyes; then
>       PACKAGE_FEATURES="$PACKAGE_FEATURES augeas"
>   fi
> diff --git a/corosync.spec.in b/corosync.spec.in
> index dafdb3c..ed531c3 100644
> --- a/corosync.spec.in
> +++ b/corosync.spec.in
> @@ -5,6 +5,8 @@
>   # Invoke "rpmbuild --without<feature>" or "rpmbuild --with<feature>"
>   # to disable or enable specific features
>   %bcond_with testagents
> +%bcond_with watchdog
> +%bcond_with monitoring
>
>   Name: corosync
>   Summary: The Corosync Cluster Engine and Application Programming Interfaces
> @@ -52,6 +54,12 @@ export rdmacm_LIBS=-lrdmacm \
>   %if %{with testagents}
>       --enable-testagents \
>   %endif
> +%if %{with watchdog}
> +     --enable-watchdog \
> +%endif
> +%if %{with monitoring}
> +     --enable-monitoring \
> +%endif
>       --enable-rdma \
>       --with-initddir=%{_initrddir}
>
> @@ -115,6 +123,12 @@ fi
>   %{_libexecdir}/lcrso/quorum_testquorum.lcrso
>   %{_libexecdir}/lcrso/vsf_quorum.lcrso
>   %{_libexecdir}/lcrso/vsf_ykd.lcrso
> +%if %{with watchdog}
> +%{_libexecdir}/lcrso/service_wd.lcrso
> +%endif
> +%if %{with monitoring}
> +%{_libexecdir}/lcrso/service_mon.lcrso
> +%endif
>   %dir %{_localstatedir}/lib/corosync
>   %dir %{_localstatedir}/log/cluster
>   %{_mandir}/man8/corosync_overview.8*
> diff --git a/exec/Makefile.am b/exec/Makefile.am
> index a3a49bf..938237c 100644
> --- a/exec/Makefile.am
> +++ b/exec/Makefile.am
> @@ -59,7 +59,7 @@ libcoroipcs_a_SOURCES       = $(COROIPCS_SRC)
>   corosync_SOURCES    = main.c util.c sync.c apidef.c service.c \
>                         timer.c totemconfig.c mainconfig.c quorum.c 
> schedwrk.c \
>                         ../lcr/lcr_ifact.c evil.c syncv2.c
> -corosync_LDADD               = -ltotem_pg -llogsys -lcoroipcs
> +corosync_LDADD               = -ltotem_pg -llogsys -lcoroipcs 
> $(statgrab_LIBS)
>   corosync_DEPENDENCIES       = libtotem_pg.so.$(SONAME) 
> liblogsys.so.$(SONAME) libcoroipcs.so.$(SONAME)
>   corosync_LDFLAGS    = $(OS_DYFLAGS) -L./
>
> diff --git a/exec/service.c b/exec/service.c
> index be55459..dc30406 100644
> --- a/exec/service.c
> +++ b/exec/service.c
> @@ -85,6 +85,18 @@ static struct default_service default_services[] = {
>               .name                    = "corosync_pload",
>               .ver                     = 0,
>       },
> +#ifdef HAVE_MONITORING
> +     {
> +             .name                    = "corosync_mon",
> +             .ver                     = 0,
> +     },
> +#endif
> +#ifdef HAVE_WATCHDOG
> +     {
> +             .name                    = "corosync_wd",
> +             .ver                     = 0,
> +     },
> +#endif
>       {
>               .name                    = "corosync_quorum",
>               .ver                     = 0,
> diff --git a/include/corosync/corodefs.h b/include/corosync/corodefs.h
> index 57923e2..a1e6539 100644
> --- a/include/corosync/corodefs.h
> +++ b/include/corosync/corodefs.h
> @@ -59,7 +59,9 @@ enum corosync_service_types {
>       NTF_SERVICE = 16,
>       AMF_V2_SERVICE = 17,
>       TST_SV1_SERVICE = 18,
> -     TST_SV2_SERVICE = 19
> +     TST_SV2_SERVICE = 19,
> +     MON_SERVICE = 20,
> +     WD_SERVICE = 21
>   };
>
>   #ifdef HAVE_SMALL_MEMORY_FOOTPRINT
> diff --git a/services/Makefile.am b/services/Makefile.am
> index cb64016..f39adc3 100644
> --- a/services/Makefile.am
> +++ b/services/Makefile.am
> @@ -38,6 +38,12 @@ INCLUDES           = -I$(top_builddir)/include 
> -I$(top_srcdir)/include \
>                         -I$(top_srcdir)/include/corosync
>
>   SERVICE_LCRSO               = evs cfg cpg confdb pload
> +if BUILD_WATCHDOG
> +SERVICE_LCRSO                += wd
> +endif
> +if BUILD_MONITORING
> +SERVICE_LCRSO                += mon
> +endif
>
>   QUORUM_LCRSO                = votequorum testquorum
>
> diff --git a/services/mon.c b/services/mon.c
> new file mode 100644
> index 0000000..3e475a1
> --- /dev/null
> +++ b/services/mon.c
> @@ -0,0 +1,635 @@
> +/*
> + * Copyright (c) 2010 Red Hat, Inc.
> + *
> + * All rights reserved.
> + *
> + * Author: Angus Salkeld<[email protected]>
> + *
> + * This software licensed under BSD license, the text of which follows:
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions are 
> met:
> + *
> + * - Redistributions of source code must retain the above copyright notice,
> + *   this list of conditions and the following disclaimer.
> + * - Redistributions in binary form must reproduce the above copyright 
> notice,
> + *   this list of conditions and the following disclaimer in the 
> documentation
> + *   and/or other materials provided with the distribution.
> + * - Neither the name of the MontaVista Software, Inc. nor the names of its
> + *   contributors may be used to endorse or promote products derived from 
> this
> + *   software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 
> IS"
> + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
> + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
> + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
> + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
> + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
> + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
> + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
> + * THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include<config.h>
> +
> +#include<unistd.h>
> +#if defined(HAVE_LIBSTATGRAB)
> +#include<statgrab.h>
> +#endif
> +
> +#include<corosync/corotypes.h>
> +#include<corosync/corodefs.h>
> +#include<corosync/lcr/lcr_comp.h>
> +#include<corosync/engine/coroapi.h>
> +#include<corosync/list.h>
> +#include<corosync/totem/coropoll.h>
> +#include<corosync/engine/logsys.h>
> +#include "../exec/fsm.h"
> +
> +
> +LOGSYS_DECLARE_SUBSYS ("MON");
> +
> +#undef ENTER
> +#define ENTER() log_printf (LOGSYS_LEVEL_INFO, "%s", __func__)
> +
> +/*
> + * Service Interfaces required by service_message_handler struct
> + */
> +static int mon_exec_init_fn (
> +     struct corosync_api_v1 *corosync_api);
> +
> +hdb_handle_t mon_poll = 0;
> +static struct corosync_api_v1 *api;
> +static hdb_handle_t resources_obj;
> +static pthread_t mon_poll_thread;
> +#define MON_DEFAULT_PERIOD 3
> +
> +struct corosync_service_engine mon_service_engine = {
> +     .name                   = "corosync resource monitoring service",
> +     .id                     = MON_SERVICE,
> +     .priority               = 1,
> +     .private_data_size      = 0,
> +     .flow_control           = CS_LIB_FLOW_CONTROL_NOT_REQUIRED,
> +     .lib_init_fn            = NULL,
> +     .lib_exit_fn            = NULL,
> +     .lib_engine             = NULL,
> +     .lib_engine_count       = 0,
> +     .exec_engine            = NULL,
> +     .exec_engine_count      = 0,
> +     .confchg_fn             = NULL,
> +     .exec_init_fn           = mon_exec_init_fn,
> +     .exec_dump_fn           = NULL,
> +     .sync_mode              = CS_SYNC_V2
> +};
> +
> +static DECLARE_LIST_INIT (confchg_notify);
> +
> +
> +struct resource_instance {
> +     hdb_handle_t handle;
> +     const char *name;
> +     poll_timer_handle timer_handle;
> +     void (*update_stats_fn) (void *data);
> +     struct cs_fsm fsm;
> +     int32_t period;
> +     objdb_value_types_t max_type;
> +     union {
> +             int32_t int32;
> +             double dbl;
> +     } max;
> +};
> +
> +static void mem_update_stats_fn (void *data);
> +static void load_update_stats_fn (void *data);
> +
> +static struct resource_instance memory_used_inst = {
> +     .name = "memory_used",
> +     .update_stats_fn = mem_update_stats_fn,
> +     .max_type = OBJDB_VALUETYPE_INT32,
> +     .max.int32 = INT32_MAX,
> +     .period = MON_DEFAULT_PERIOD,
> +};
> +
> +static struct resource_instance load_15min_inst = {
> +     .name = "load_15min",
> +     .update_stats_fn = load_update_stats_fn,
> +     .max_type = OBJDB_VALUETYPE_DOUBLE,
> +     .max.dbl = INT32_MAX,
> +     .period = MON_DEFAULT_PERIOD,
> +};
> +
> +
> +/*
> + * F S M
> + */
> +static void mon_config_changed (struct cs_fsm* fsm, int32_t event, void * 
> data);
> +static void mon_resource_failed (struct cs_fsm* fsm, int32_t event, void * 
> data);
> +
> +const char * mon_ok_str = "ok";
> +const char * mon_failed_str = "failed";
> +const char * mon_failure_str = "failure";
> +const char * mon_disabled_str = "disabled";
> +const char * mon_config_changed_str = "config_changed";
> +
> +enum mon_resource_state {
> +     MON_S_DISABLED,
> +     MON_S_OK,
> +     MON_S_FAILED
> +};
> +enum mon_resource_event {
> +     MON_E_CONFIG_CHANGED,
> +     MON_E_FAILURE
> +};
> +
> +struct cs_fsm_entry mon_fsm_table[] = {
> +     { MON_S_DISABLED,       MON_E_CONFIG_CHANGED,   mon_config_changed,     
> {MON_S_DISABLED, MON_S_OK, -1} },
> +     { MON_S_DISABLED,       MON_E_FAILURE,          NULL,                   
> {-1} },
> +     { MON_S_OK,             MON_E_CONFIG_CHANGED,   mon_config_changed,     
> {MON_S_OK, MON_S_DISABLED, -1} },
> +     { MON_S_OK,             MON_E_FAILURE,          mon_resource_failed,    
> {MON_S_FAILED, -1} },
> +     { MON_S_FAILED,         MON_E_CONFIG_CHANGED,   mon_config_changed,     
> {MON_S_OK, MON_S_DISABLED, -1} },
> +     { MON_S_FAILED,         MON_E_FAILURE,          NULL,                   
> {-1} },
> +};
> +
> +/*
> + * Dynamic loading descriptor
> + */
> +
> +static struct corosync_service_engine *mon_get_service_engine_ver0 (void);
> +
> +static struct corosync_service_engine_iface_ver0 mon_service_engine_iface = {
> +     .corosync_get_service_engine_ver0       = mon_get_service_engine_ver0
> +};
> +
> +static struct lcr_iface corosync_mon_ver0[1] = {
> +     {
> +             .name                   = "corosync_mon",
> +             .version                = 0,
> +             .versions_replace       = 0,
> +             .versions_replace_count = 0,
> +             .dependencies           = 0,
> +             .dependency_count       = 0,
> +             .constructor            = NULL,
> +             .destructor             = NULL,
> +             .interfaces             = NULL,
> +     }
> +};
> +
> +static struct lcr_comp mon_comp_ver0 = {
> +     .iface_count    = 1,
> +     .ifaces         = corosync_mon_ver0
> +};
> +
> +static struct corosync_service_engine *mon_get_service_engine_ver0 (void)
> +{
> +     return (&mon_service_engine);
> +}
> +
> +#ifdef COROSYNC_SOLARIS
> +void corosync_lcr_component_register (void);
> +
> +void corosync_lcr_component_register (void) {
> +#else
> +__attribute__ ((constructor)) static void corosync_lcr_component_register 
> (void) {
> +#endif
> +     lcr_interfaces_set (&corosync_mon_ver0[0],&mon_service_engine_iface);
> +
> +     lcr_component_register (&mon_comp_ver0);
> +}
> +
> +static const char * mon_res_state_to_str(struct cs_fsm* fsm,
> +     int32_t state)
> +{
> +     switch (state) {
> +     case MON_S_DISABLED:
> +             return mon_disabled_str;
> +             break;
> +     case MON_S_OK:
> +             return mon_ok_str;
> +             break;
> +     case MON_S_FAILED:
> +             return mon_failed_str;
> +             break;
> +     }
> +     return NULL;
> +}
> +
> +static const char * mon_res_event_to_str(struct cs_fsm* fsm,
> +     int32_t event)
> +{
> +     switch (event) {
> +     case MON_E_CONFIG_CHANGED:
> +             return mon_config_changed_str;
> +             break;
> +     case MON_E_FAILURE:
> +             return mon_failure_str;
> +             break;
> +     }
> +     return NULL;
> +}
> +
> +static void mon_fsm_state_set (struct cs_fsm* fsm,
> +     enum mon_resource_state next_state, struct resource_instance* inst)
> +{
> +     enum mon_resource_state prev_state = fsm->curr_state;
> +     const char *state_str;
> +
> +     ENTER();
> +
> +     cs_fsm_state_set(fsm, next_state, inst);
> +
> +     if (prev_state == fsm->curr_state) {
> +             return;
> +     }
> +     state_str = mon_res_state_to_str(fsm, fsm->curr_state);
> +
> +     api->object_key_replace (inst->handle,
> +             "state", strlen ("state"),
> +             state_str, strlen (state_str));
> +}
> +
> +
> +static void mon_config_changed (struct cs_fsm* fsm, int32_t event, void * 
> data)
> +{
> +     struct resource_instance * inst = (struct resource_instance *)data;
> +     char *str;
> +     size_t str_len;
> +     objdb_value_types_t type;
> +     int32_t tmp_value;
> +     int32_t res;
> +
> +     ENTER();
> +
> +     res = api->object_key_get_typed (inst->handle,
> +                     "poll_period",
> +                     (void**)&str,&str_len,
> +                     &type);
> +     if (res == 0) {
> +             tmp_value = strtol (str, NULL, 0);
> +             if (tmp_value>  0&&  tmp_value<  120) {
> +                     if (inst->period != tmp_value) {
> +                             inst->period = tmp_value;
> +                     }
> +             }
> +     }
> +
> +     res = api->object_key_get_typed (inst->handle, "max",
> +                     (void**)&str,&str_len,&type);
> +     if (res != 0) {
> +             if (inst->max_type == OBJDB_VALUETYPE_INT32) {
> +                     inst->max.int32 = INT32_MAX;
> +             } else
> +             if (inst->max_type == OBJDB_VALUETYPE_DOUBLE) {
> +                     inst->max.dbl = INT32_MAX;
> +             }
> +             mon_fsm_state_set (fsm, MON_S_DISABLED, inst);
> +     } else {
> +             if (inst->max_type == OBJDB_VALUETYPE_INT32) {
> +                     inst->max.int32 = strtol (str, NULL, 0);
> +             } else
> +             if (inst->max_type == OBJDB_VALUETYPE_DOUBLE) {
> +                     inst->max.dbl = strtod (str, NULL);
> +             }
> +             mon_fsm_state_set (fsm, MON_S_OK, inst);
> +     }
> +
> +     if (mon_poll == 0) {
> +             return;
> +     }
> +     poll_timer_delete (mon_poll, inst->timer_handle);
> +     /*
> +      * run the updater, incase the period has shortened
> +      */
> +     inst->update_stats_fn (inst);
> +     poll_timer_add (mon_poll,
> +             inst->period * 1000, NULL,
> +             inst->update_stats_fn,
> +             &inst->timer_handle);
> +}
> +
> +void mon_resource_failed (struct cs_fsm* fsm, int32_t event, void * data)
> +{
> +     struct resource_instance * inst = (struct resource_instance *)data;
> +     ENTER();
> +     mon_fsm_state_set (fsm, MON_S_FAILED, inst);
> +}
> +
> +static int32_t percent_mem_used_get(void)
> +{
> +#if defined(HAVE_LIBSTATGRAB)
> +     sg_mem_stats *mem_stats;
> +     sg_swap_stats *swap_stats;
> +     long long total, freemem;
> +
> +     mem_stats = sg_get_mem_stats();
> +     swap_stats = sg_get_swap_stats();
> +
> +     if (mem_stats == NULL || swap_stats != NULL) {
> +             log_printf (LOGSYS_LEVEL_ERROR, "Unable to get memory stats: 
> %s\n",
> +                     sg_str_error(sg_get_error()));
> +             return -1;
> +     }
> +     total = mem_stats->total + swap_stats->total;
> +     freemem = mem_stats->free + swap_stats->free;
> +     return ((total - freemem) * 100) / total;
> +#else
> +#if defined(COROSYNC_LINUX)
> +     char *line_ptr;
> +     char line[512];
> +     unsigned long long value;
> +     FILE *f;
> +     long long total = 0;
> +     long long freemem = 0;
> +
> +     if ((f = fopen("/proc/meminfo", "r")) == NULL) {
> +             return -1;
> +     }
> +
> +     while ((line_ptr = fgets(line, sizeof(line), f)) != NULL) {
> +             if (sscanf(line_ptr, "%*s %llu kB",&value) != 1) {
> +                     continue;
> +             }
> +             value *= 1024;
> +
> +             if (strncmp(line_ptr, "MemTotal:", 9) == 0) {
> +                     total += value;
> +             } else if (strncmp(line_ptr, "MemFree:", 8) == 0) {
> +                     freemem += value;
> +             } else if (strncmp(line_ptr, "SwapTotal:", 10) == 0) {
> +                     total += value;
> +             } else if (strncmp(line_ptr, "SwapFree:", 9) == 0) {
> +                     freemem += value;
> +             }
> +     }
> +
> +     fclose(f);
> +     return ((total - freemem) * 100) / total;
> +#else
> +#error need libstatgrab or linux.
> +#endif /* COROSYNC_LINUX */
> +#endif /* HAVE_LIBSTATGRAB */
> +}
> +
> +
> +static void mem_update_stats_fn (void *data)
> +{
> +     struct resource_instance * inst = (struct resource_instance *)data;
> +     int32_t new_value;
> +     uint64_t timestamp;
> +
> +     new_value = percent_mem_used_get();
> +     if (new_value>  0) {
> +             api->object_key_replace (inst->handle,
> +                     "current", strlen("current"),
> +                     &new_value, sizeof(new_value));
> +
> +             timestamp = time (NULL);
> +
> +             api->object_key_replace (inst->handle,
> +                     "last_updated", strlen("last_updated"),
> +                     &timestamp, sizeof(time_t));
> +
> +             if (new_value>  inst->max.int32) {
> +                     cs_fsm_process (&inst->fsm, MON_E_FAILURE, inst);
> +             }
> +     }
> +     poll_timer_add (mon_poll,
> +             inst->period * 1000, inst,
> +             inst->update_stats_fn,
> +             &inst->timer_handle);
> +}
> +
> +static double min15_loadavg_get(void)
> +{
> +#if defined(HAVE_LIBSTATGRAB)
> +     sg_load_stats *load_stats;
> +     load_stats = sg_get_load_stats ();
> +     if (load_stats == NULL) {
> +             log_printf (LOGSYS_LEVEL_ERROR, "Unable to get load stats: 
> %s\n",
> +                     sg_str_error (sg_get_error()));
> +             return -1;
> +     }
> +     return load_stats->min15;
> +#else
> +#if defined(COROSYNC_LINUX)
> +     double loadav[3];
> +     if (getloadavg(loadav,3)<  0) {
> +             return -1;
> +     }
> +     return loadav[2];
> +#else
> +#error need libstatgrab or linux.
> +#endif /* COROSYNC_LINUX */
> +#endif /* HAVE_LIBSTATGRAB */
> +}
> +
> +static void load_update_stats_fn (void *data)
> +{
> +     struct resource_instance * inst = (struct resource_instance *)data;
> +     uint64_t timestamp;
> +     int32_t res = 0;
> +     double min15 = min15_loadavg_get();
> +
> +     if (min15<  0) {
> +     }
> +     res = api->object_key_replace (inst->handle,
> +             "current", strlen("current"),
> +             &min15, sizeof (min15));
> +     if (res != 0)
> +             log_printf (LOGSYS_LEVEL_ERROR, "replace current failed: %d", 
> res);
> +
> +     timestamp = cs_timestamp_get();
> +
> +     res = api->object_key_replace (inst->handle,
> +             "last_updated", strlen("last_updated"),
> +             &timestamp, sizeof(uint64_t));
> +     if (res != 0)
> +             log_printf (LOGSYS_LEVEL_ERROR, "replace last_updated failed: 
> %d", res);
> +
> +     if (min15>  inst->max.dbl) {
> +             cs_fsm_process (&inst->fsm, MON_E_FAILURE,&inst);
> +     }
> +
> +     poll_timer_add (mon_poll,
> +             inst->period * 1000, inst,
> +             inst->update_stats_fn,
> +             &inst->timer_handle);
> +}
> +
> +static void *mon_thread_handler (void * unused)
> +{
> +#ifdef HAVE_LIBSTATGRAB
> +     sg_init();
> +#endif /* HAVE_LIBSTATGRAB */
> +     mon_poll = poll_create ();
> +
> +     poll_timer_add (mon_poll,
> +             memory_used_inst.period * 1000,
> +             &memory_used_inst,
> +             memory_used_inst.update_stats_fn,
> +             &memory_used_inst.timer_handle);
> +
> +     poll_timer_add (mon_poll,
> +             load_15min_inst.period * 1000,
> +             &load_15min_inst,
> +             load_15min_inst.update_stats_fn,
> +             &load_15min_inst.timer_handle);
> +     poll_run (mon_poll);
> +
> +     return NULL;
> +}
> +
> +static int object_find_or_create (
> +     hdb_handle_t parent_object_handle,
> +     hdb_handle_t *object_handle,
> +     const void *object_name,
> +     size_t object_name_len)
> +{
> +     hdb_handle_t obj_finder;
> +     hdb_handle_t obj;
> +     int ret = -1;
> +
> +     api->object_find_create (
> +             parent_object_handle,
> +             object_name,
> +             object_name_len,
> +             &obj_finder);
> +
> +     if (api->object_find_next (obj_finder,&obj) == 0) {
> +             /* found it */
> +             *object_handle = obj;
> +             ret = 0;
> +     }
> +     else {
> +             ret = api->object_create (parent_object_handle,
> +                     object_handle,
> +                     object_name, object_name_len);
> +     }
> +
> +     api->object_find_destroy (obj_finder);
> +     return ret;
> +}
> +
> +static void mon_key_change_notify (object_change_type_t change_type,
> +     hdb_handle_t parent_object_handle,
> +     hdb_handle_t object_handle,
> +     const void *object_name_pt, size_t object_name_len,
> +     const void *key_name_pt, size_t key_len,
> +     const void *key_value_pt, size_t key_value_len,
> +     void *priv_data_pt)
> +{
> +     struct resource_instance* inst = (struct 
> resource_instance*)priv_data_pt;
> +
> +     if ((strcmp ((char*)key_name_pt, "max") == 0) ||
> +             (strcmp ((char*)key_name_pt, "poll_period") == 0)) {
> +             ENTER();
> +             cs_fsm_process (&inst->fsm, MON_E_CONFIG_CHANGED, inst);
> +     }
> +}
> +
> +static void mon_instance_init (hdb_handle_t parent, struct 
> resource_instance* inst)
> +{
> +     int32_t res;
> +     char mon_period_str[32];
> +     size_t mon_period_len;
> +     objdb_value_types_t mon_period_type;
> +     int32_t tmp_value;
> +     int32_t zero_32 = 0;
> +     time_t zero_64 = 0;
> +     double zero_double = 0;
> +
> +     ENTER();
> +
> +     object_find_or_create (parent,
> +             &inst->handle,
> +             inst->name, strlen (inst->name));
> +
> +     if (inst->max_type == OBJDB_VALUETYPE_INT32) {
> +             api->object_key_create_typed (inst->handle,
> +                     "current",&zero_32,
> +                     sizeof (zero_32), inst->max_type);
> +     } else {
> +             api->object_key_create_typed (inst->handle,
> +                     "current",&zero_double,
> +                     sizeof (zero_double), inst->max_type);
> +     }
> +
> +     api->object_key_create_typed (inst->handle,
> +             "last_updated",&zero_64,
> +             sizeof (time_t), OBJDB_VALUETYPE_INT64);
> +
> +     api->object_key_create_typed (inst->handle,
> +             "state", mon_disabled_str, strlen (mon_disabled_str),
> +             OBJDB_VALUETYPE_STRING);
> +
> +     inst->fsm.name = inst->name;
> +     inst->fsm.curr_entry = 0;
> +     inst->fsm.curr_state = MON_S_DISABLED;
> +     inst->fsm.table = mon_fsm_table;
> +     inst->fsm.entries = sizeof(mon_fsm_table) / sizeof(struct cs_fsm_entry);
> +     inst->fsm.state_to_str = mon_res_state_to_str;
> +     inst->fsm.event_to_str = mon_res_event_to_str;
> +
> +     res = api->object_key_get_typed (inst->handle,
> +                     "poll_period",
> +                     (void**)&mon_period_str,&mon_period_len,
> +                     &mon_period_type);
> +     if (res != 0) {
> +             mon_period_len = snprintf (mon_period_str, 32, "%d",
> +                     inst->period);
> +             api->object_key_create_typed (inst->handle,
> +                     "poll_period",&mon_period_str,
> +                     mon_period_len,
> +                     OBJDB_VALUETYPE_STRING);
> +     }
> +     else {
> +             tmp_value = strtol (mon_period_str, NULL, 0);
> +             if (tmp_value>  0&&  tmp_value<  120)
> +                     inst->period = tmp_value;
> +     }
> +     cs_fsm_process (&inst->fsm, MON_E_CONFIG_CHANGED, inst);
> +
> +     poll_timer_add (mon_poll,
> +             inst->period * 1000, inst,
> +             inst->update_stats_fn,
> +             &inst->timer_handle);
> +
> +     api->object_track_start (inst->handle, OBJECT_TRACK_DEPTH_ONE,
> +             mon_key_change_notify,
> +             NULL, NULL, NULL, NULL);
> +
> +}
> +
> +static int mon_exec_init_fn (
> +     struct corosync_api_v1 *corosync_api)
> +{
> +     hdb_handle_t obj;
> +     hdb_handle_t parent;
> +
> +#ifdef COROSYNC_SOLARIS
> +     logsys_subsys_init();
> +#endif
> +     api = corosync_api;
> +     ENTER();
> +
> +     object_find_or_create (OBJECT_PARENT_HANDLE,
> +             &resources_obj,
> +             "resources", strlen ("resources"));
> +
> +     object_find_or_create (resources_obj,
> +             &obj,
> +             "system", strlen ("system"));
> +
> +     parent = obj;
> +
> +     mon_instance_init (parent,&memory_used_inst);
> +     mon_instance_init (parent,&load_15min_inst);
> +
> +
> +     pthread_create (&mon_poll_thread, NULL, mon_thread_handler, NULL);
> +
> +     return 0;
> +}
> +
> +
> diff --git a/services/wd.c b/services/wd.c
> new file mode 100644
> index 0000000..9c9ad97
> --- /dev/null
> +++ b/services/wd.c
> @@ -0,0 +1,755 @@
> +/*
> + * Copyright (c) 2010 Red Hat, Inc.
> + *
> + * All rights reserved.
> + *
> + * Author: Angus Salkeld<[email protected]>
> + *
> + * This software licensed under BSD license, the text of which follows:
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions are 
> met:
> + *
> + * - Redistributions of source code must retain the above copyright notice,
> + *   this list of conditions and the following disclaimer.
> + * - Redistributions in binary form must reproduce the above copyright 
> notice,
> + *   this list of conditions and the following disclaimer in the 
> documentation
> + *   and/or other materials provided with the distribution.
> + * - Neither the name of the MontaVista Software, Inc. nor the names of its
> + *   contributors may be used to endorse or promote products derived from 
> this
> + *   software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 
> IS"
> + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
> + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
> + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
> + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
> + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
> + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
> + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
> + * THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include<config.h>
> +
> +#include<unistd.h>
> +#include<fcntl.h>
> +#include<sys/ioctl.h>
> +#include<linux/types.h>
> +#include<linux/watchdog.h>
> +#include<linux/reboot.h>
> +
> +#include<corosync/corotypes.h>
> +#include<corosync/corodefs.h>
> +#include<corosync/lcr/lcr_comp.h>
> +#include<corosync/engine/coroapi.h>
> +#include<corosync/list.h>
> +#include<corosync/engine/logsys.h>
> +#include "../exec/fsm.h"
> +
> +
> +typedef enum {
> +     WD_RESOURCE_GOOD,
> +     WD_RESOURCE_FAILED,
> +     WD_RESOURCE_STATE_UNKNOWN,
> +     WD_RESOURCE_NOT_MONITORED
> +} wd_resource_state_t;
> +
> +struct resource {
> +     hdb_handle_t handle;
> +     char *recovery;
> +     char name[128];
> +     time_t last_updated;
> +     struct cs_fsm fsm;
> +
> +     corosync_timer_handle_t check_timer;
> +     uint32_t check_timeout;
> +};
> +
> +LOGSYS_DECLARE_SUBSYS("WD");
> +
> +/*
> + * Service Interfaces required by service_message_handler struct
> + */
> +static int wd_exec_init_fn (
> +     struct corosync_api_v1 *corosync_api);
> +static int wd_exec_exit_fn (void);
> +static void wd_resource_check_fn (void* resource_ref);
> +
> +static struct corosync_api_v1 *api;
> +#define WD_DEFAULT_TIMEOUT 6
> +static uint32_t watchdog_timeout = WD_DEFAULT_TIMEOUT;
> +static uint32_t tickle_timeout = (WD_DEFAULT_TIMEOUT / 2);
> +static int dog = -1;
> +static corosync_timer_handle_t wd_timer;
> +static hdb_handle_t resources_obj;
> +static int watchdog_ok = 1;
> +
> +struct corosync_service_engine wd_service_engine = {
> +     .name                   = "corosync self-fencing service",
> +     .id                     = WD_SERVICE,
> +     .priority               = 1,
> +     .private_data_size      = 0,
> +     .flow_control           = CS_LIB_FLOW_CONTROL_REQUIRED,
> +     .lib_init_fn            = NULL,
> +     .lib_exit_fn            = NULL,
> +     .lib_engine             = NULL,
> +     .lib_engine_count       = 0,
> +     .exec_engine            = NULL,
> +     .exec_engine_count      = 0,
> +     .confchg_fn             = NULL,
> +     .exec_init_fn           = wd_exec_init_fn,
> +     .exec_exit_fn           = wd_exec_exit_fn,
> +     .exec_dump_fn           = NULL,
> +     .sync_mode              = CS_SYNC_V2
> +};
> +
> +static DECLARE_LIST_INIT (confchg_notify);
> +
> +/*
> + * F S M
> + */
> +static void wd_config_changed (struct cs_fsm* fsm, int32_t event, void * 
> data);
> +static void wd_resource_failed (struct cs_fsm* fsm, int32_t event, void * 
> data);
> +
> +enum wd_resource_state {
> +     WD_S_GOOD,
> +     WD_S_FAILED,
> +     WD_S_DISABLED
> +};
> +
> +enum wd_resource_event {
> +     WD_E_FAILURE,
> +     WD_E_CONFIG_CHANGED
> +};
> +
> +const char * wd_ok_str                       = "ok";
> +const char * wd_failed_str           = "failed";
> +const char * wd_failure_str          = "failure";
> +const char * wd_disabled_str         = "disabled";
> +const char * wd_config_changed_str   = "config_changed";
> +
> +struct cs_fsm_entry wd_fsm_table[] = {
> +     { WD_S_DISABLED,        WD_E_CONFIG_CHANGED,    wd_config_changed,      
> {WD_S_DISABLED, WD_S_GOOD, -1} },
> +     { WD_S_DISABLED,        WD_E_FAILURE,           NULL,                   
> {-1} },
> +     { WD_S_GOOD,            WD_E_CONFIG_CHANGED,    wd_config_changed,      
> {WD_S_GOOD, WD_S_DISABLED, -1} },
> +     { WD_S_GOOD,            WD_E_FAILURE,           wd_resource_failed,     
> {WD_S_FAILED, -1} },
> +     { WD_S_FAILED,          WD_E_CONFIG_CHANGED,    wd_config_changed,      
> {WD_S_GOOD, WD_S_DISABLED, -1} },
> +     { WD_S_FAILED,          WD_E_FAILURE,           NULL,                   
> {-1} },
> +};
> +
> +/*
> + * Dynamic loading descriptor
> + */
> +
> +static struct corosync_service_engine *wd_get_service_engine_ver0 (void);
> +
> +static struct corosync_service_engine_iface_ver0 wd_service_engine_iface = {
> +     .corosync_get_service_engine_ver0       = wd_get_service_engine_ver0
> +};
> +
> +static struct lcr_iface corosync_wd_ver0[1] = {
> +     {
> +             .name                   = "corosync_wd",
> +             .version                = 0,
> +             .versions_replace       = 0,
> +             .versions_replace_count = 0,
> +             .dependencies           = 0,
> +             .dependency_count       = 0,
> +             .constructor            = NULL,
> +             .destructor             = NULL,
> +             .interfaces             = NULL,
> +     }
> +};
> +
> +static struct lcr_comp wd_comp_ver0 = {
> +     .iface_count    = 1,
> +     .ifaces         = corosync_wd_ver0
> +};
> +
> +static struct corosync_service_engine *wd_get_service_engine_ver0 (void)
> +{
> +     return (&wd_service_engine);
> +}
> +
> +#ifdef COROSYNC_SOLARIS
> +void corosync_lcr_component_register (void);
> +
> +void corosync_lcr_component_register (void) {
> +#else
> +__attribute__ ((constructor)) static void corosync_lcr_component_register 
> (void) {
> +#endif
> +     lcr_interfaces_set (&corosync_wd_ver0[0],&wd_service_engine_iface);
> +
> +     lcr_component_register (&wd_comp_ver0);
> +}
> +
> +static int object_find_or_create (
> +     hdb_handle_t parent_object_handle,
> +     hdb_handle_t *object_handle,
> +     const void *object_name,
> +     size_t object_name_len)
> +{
> +     hdb_handle_t obj_finder;
> +     hdb_handle_t obj;
> +     int ret = -1;
> +
> +     api->object_find_create (
> +             parent_object_handle,
> +             object_name,
> +             object_name_len,
> +             &obj_finder);
> +
> +     if (api->object_find_next (obj_finder,&obj) == 0) {
> +             /* found it */
> +             *object_handle = obj;
> +             ret = 0;
> +     }
> +     else {
> +             ret = api->object_create (parent_object_handle,
> +                     object_handle,
> +                     object_name, object_name_len);
> +     }
> +
> +     api->object_find_destroy (obj_finder);
> +     return ret;
> +}
> +
> +static const char * wd_res_state_to_str(struct cs_fsm* fsm,
> +     int32_t state)
> +{
> +     switch (state) {
> +     case WD_S_DISABLED:
> +             return wd_disabled_str;
> +             break;
> +     case WD_S_GOOD:
> +             return wd_ok_str;
> +             break;
> +     case WD_S_FAILED:
> +             return wd_failed_str;
> +             break;
> +     }
> +     return NULL;
> +}
> +
> +static const char * wd_res_event_to_str(struct cs_fsm* fsm,
> +     int32_t event)
> +{
> +     switch (event) {
> +     case WD_E_CONFIG_CHANGED:
> +             return wd_config_changed_str;
> +             break;
> +     case WD_E_FAILURE:
> +             return wd_failure_str;
> +             break;
> +     }
> +     return NULL;
> +}
> +
> +/*
> + * returns (0 == OK, 1 == failed)
> + */
> +static int32_t wd_resource_has_failed (struct resource *ref)
> +{
> +     hdb_handle_t resource = ref->handle;
> +     int res;
> +     char* state;
> +     size_t state_len;
> +     objdb_value_types_t type;
> +     time_t *last_updated;
> +     time_t my_time;
> +     size_t last_updated_len;
> +
> +     res = api->object_key_get_typed (resource,
> +             "last_updated", (void*)&last_updated,&last_updated_len,&type);
> +     if (res != 0) {
> +             /* key does not exist.
> +             */
> +             return 1;
> +     }
> +     res = api->object_key_get_typed (resource,
> +             "state", (void**)&state,&state_len,&type);
> +     if (res != 0 || strncmp (state, "disabled", strlen ("disabled")) == 0) {
> +             /* key does not exist.
> +             */
> +             return 1;
> +     }
> +
> +     my_time = time (NULL);
> +
> +     if ((*last_updated + ref->check_timeout)<  my_time) {
> +             log_printf (LOGSYS_LEVEL_INFO, "delayed %ld + %d<  %ld",
> +                     *last_updated, ref->check_timeout, my_time);
> +             return 1;
> +     }
> +
> +     if ((*last_updated + ref->check_timeout)<  my_time ||
> +             strcmp (state, "bad") == 0) {
> +             return 1;
> +     }
> +     return 0;
> +}
> +
> +static void wd_config_changed (struct cs_fsm* fsm, int32_t event, void * 
> data)
> +{
> +     int res;
> +     size_t len;
> +     char *state;
> +     objdb_value_types_t type;
> +     char mon_period_str[32];
> +     int32_t tmp_value;
> +     struct resource *ref = (struct resource*)data;
> +
> +     res = api->object_key_get_typed (ref->handle,
> +                     "poll_period",
> +                     (void**)&mon_period_str,&len,
> +                     &type);
> +     if (res == 0) {
> +             tmp_value = strtol (mon_period_str, NULL, 0);
> +             if (tmp_value>  0&&  tmp_value<  120)
> +                     ref->check_timeout = (tmp_value * 5)/4;
> +     }
> +
> +     res = api->object_key_get_typed (ref->handle,
> +             "recovery", (void*)&ref->recovery,&len,&type);
> +     if (res != 0) {
> +             /* key does not exist.
> +              */
> +             log_printf (LOGSYS_LEVEL_WARNING,
> +                     "resource %s missing a recovery key.", ref->name);
> +             cs_fsm_state_set(&ref->fsm, WD_S_DISABLED, ref);
> +             return;
> +     }
> +     res = api->object_key_get_typed (ref->handle,
> +             "state", (void*)&state,&len,&type);
> +     if (res != 0) {
> +             /* key does not exist.
> +             */
> +             log_printf (LOGSYS_LEVEL_WARNING,
> +                     "resource %s missing a state key.", ref->name);
> +             cs_fsm_state_set(&ref->fsm, WD_S_DISABLED, ref);
> +             return;
> +     }
> +
> +     cs_fsm_state_set(&ref->fsm, WD_S_GOOD, ref);
> +
> +     if (ref->check_timer) {
> +             api->timer_delete(ref->check_timer);
> +     }
> +     api->timer_add_duration((unsigned long 
> long)ref->check_timeout*1000000000,
> +             ref,
> +             wd_resource_check_fn,&ref->check_timer);
> +
> +}
> +
> +static void wd_resource_failed (struct cs_fsm* fsm, int32_t event, void * 
> data)
> +{
> +     struct resource* ref = (struct resource*)data;
> +
> +     if (ref->check_timer) {
> +             api->timer_delete(ref->check_timer);
> +     }
> +
> +     log_printf (LOGSYS_LEVEL_CRIT, "%s resource \"%s\" failed!",
> +             ref->recovery, (char*)ref->name);
> +     if (strcmp (ref->recovery, "watchdog") == 0 ||
> +         strcmp (ref->recovery, "quit") == 0) {
> +             watchdog_ok = 0;
> +     }
> +     else if (strcmp (ref->recovery, "reboot") == 0) {
> +             //reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, 
> LINUX_REBOOT_CMD_RESTART, NULL);
> +     }
> +     else if (strcmp (ref->recovery, "shutdown") == 0) {
> +             //reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, 
> LINUX_REBOOT_CMD_POWER_OFF, NULL);
> +     }
> +     cs_fsm_state_set(fsm, WD_S_FAILED, data);
> +}
> +
> +static void wd_key_changed(object_change_type_t change_type,
> +     hdb_handle_t parent_object_handle,
> +     hdb_handle_t object_handle,
> +     const void *object_name_pt, size_t object_name_len,
> +     const void *key_name_pt, size_t key_len,
> +     const void *key_value_pt, size_t key_value_len,
> +     void *priv_data_pt)
> +{
> +     struct resource* ref = (struct resource*)priv_data_pt;
> +
> +     if (strcmp(key_name_pt, "last_updated") == 0 ||
> +             strcmp(key_name_pt, "current") == 0) {
> +             return;
> +     }
> +//   log_printf (LOGSYS_LEVEL_WARNING,
> +//           "watchdog resource key changed: %s.%s=%s ref=%p.",
> +//           (char*)object_name_pt, (char*)key_name_pt, (char*)key_value_pt, 
> ref);
> +
> +     if (ref == NULL) {
> +             return;
> +     }
> +     cs_fsm_process(&ref->fsm, WD_E_CONFIG_CHANGED, ref);
> +}
> +
> +static void wd_object_destroyed(
> +     hdb_handle_t parent_object_handle,
> +     const void *name_pt, size_t name_len,
> +     void *priv_data_pt)
> +{
> +     struct resource* ref = (struct resource*)priv_data_pt;
> +
> +     log_printf (LOGSYS_LEVEL_WARNING,
> +                     "watchdog resource \"%s\" deleted from objdb!",
> +                     (char*)name_pt);
> +
> +     if (ref) {
> +             api->timer_delete(ref->check_timer);
> +             ref->check_timer = NULL;
> +     }
> +}
> +
> +static void wd_resource_check_fn (void* resource_ref)
> +{
> +     struct resource* ref = (struct resource*)resource_ref;
> +
> +     log_printf (LOGSYS_LEVEL_INFO,
> +                     "checking watchdog resource \"%s\".",
> +                     ref->name);
> +     if (wd_resource_has_failed (ref) ) {
> +             cs_fsm_process(&ref->fsm, WD_E_FAILURE, ref);
> +             log_printf (LOGSYS_LEVEL_CRIT,
> +                     "watchdog resource \"%s\" failed!",
> +                     (char*)ref->name);
> +             return;
> +     }
> +     api->timer_add_duration((unsigned long 
> long)ref->check_timeout*1000000000,
> +             ref, wd_resource_check_fn,&ref->check_timer);
> +}
> +
> +
> +static void wd_resource_create (hdb_handle_t resource_obj)
> +{
> +     int res;
> +     size_t len;
> +     char *state;
> +     objdb_value_types_t type;
> +     char mon_period_str[32];
> +     int32_t tmp_value;
> +     struct resource *ref = malloc (sizeof (struct resource));
> +
> +     ref->handle = resource_obj;
> +     ref->check_timeout = WD_DEFAULT_TIMEOUT;
> +     ref->check_timer = NULL;
> +     api->object_name_get (resource_obj,
> +             ref->name,
> +             &len);
> +     ref->name[len] = '\0';
> +     ref->fsm.name = ref->name;
> +     ref->fsm.table = wd_fsm_table;
> +     ref->fsm.entries = sizeof(wd_fsm_table) / sizeof(struct cs_fsm_entry);
> +     ref->fsm.curr_entry = 0;
> +     ref->fsm.curr_state = WD_S_DISABLED;
> +     ref->fsm.state_to_str = wd_res_state_to_str;
> +     ref->fsm.event_to_str = wd_res_event_to_str;
> +     api->object_priv_set (resource_obj, NULL);
> +
> +     res = api->object_key_get_typed (resource_obj,
> +                     "poll_period",
> +                     (void**)&mon_period_str,&len,
> +                     &type);
> +     if (res != 0) {
> +             log_printf (LOGSYS_LEVEL_ERROR, "%s : %d",__func__, res);
> +             len = snprintf (mon_period_str, 32, "%d", ref->check_timeout);
> +             api->object_key_create_typed (resource_obj,
> +                     "poll_period",&mon_period_str,
> +                     len,
> +                     OBJDB_VALUETYPE_STRING);
> +     }
> +     else {
> +             tmp_value = strtol (mon_period_str, NULL, 0);
> +             if (tmp_value>  0&&  tmp_value<  120)
> +                     ref->check_timeout = (tmp_value * 5)/4;
> +     }
> +
> +     api->object_track_start (resource_obj, OBJECT_TRACK_DEPTH_ONE,
> +                     wd_key_changed, NULL, wd_object_destroyed,
> +                     NULL, ref);
> +
> +     res = api->object_key_get_typed (resource_obj,
> +             "recovery", (void*)&ref->recovery,&len,&type);
> +     if (res != 0) {
> +             /* key does not exist.
> +              */
> +             log_printf (LOGSYS_LEVEL_WARNING,
> +                     "resource %s missing a recovery key.", ref->name);
> +             return;
> +     }
> +     res = api->object_key_get_typed (resource_obj,
> +             "state", (void*)&state,&len,&type);
> +     if (res != 0) {
> +             /* key does not exist.
> +             */
> +             log_printf (LOGSYS_LEVEL_WARNING,
> +                     "resource %s missing a state key.", ref->name);
> +             return;
> +     }
> +
> +     res = api->object_key_get_typed (resource_obj,
> +             "last_updated", (void*)&ref->last_updated,&len,&type);
> +     if (res != 0) {
> +             /* key does not exist.
> +              */
> +             ref->last_updated = 0;
> +     }
> +
> +     api->timer_add_duration((unsigned long 
> long)ref->check_timeout*1000000000,
> +             ref,
> +             wd_resource_check_fn,&ref->check_timer);
> +
> +     cs_fsm_state_set(&ref->fsm, WD_S_GOOD, ref);
> +}
> +
> +
> +static void wd_tickle_fn (void* arg)
> +{
> +     ENTER();
> +
> +     if (watchdog_ok) {
> +             if (dog>  0)
> +                     ioctl(dog, WDIOC_KEEPALIVE,&watchdog_ok);
> +     }
> +     else {
> +             log_printf (LOGSYS_LEVEL_ALERT, "NOT tickling the watchdog!");
> +     }
> +
> +     api->timer_add_duration((unsigned long long)tickle_timeout*1000000000, 
> NULL,
> +                             wd_tickle_fn,&wd_timer);
> +}
> +
> +static void wd_resource_object_created(hdb_handle_t parent_object_handle,
> +     hdb_handle_t object_handle,
> +     const void *name_pt, size_t name_len,
> +     void *priv_data_pt)
> +{
> +     wd_resource_create (object_handle);
> +}
> +
> +static void wd_scan_resources (void)
> +{
> +     hdb_handle_t obj_finder;
> +     hdb_handle_t obj_finder2;
> +     hdb_handle_t resource_type;
> +     hdb_handle_t resource;
> +     int res;
> +
> +     ENTER();
> +
> +     api->object_find_create (
> +             OBJECT_PARENT_HANDLE,
> +             "resources", strlen ("resources"),
> +             &obj_finder);
> +
> +     res = api->object_find_next (obj_finder,&resources_obj);
> +     api->object_find_destroy (obj_finder);
> +     if (res != 0) {
> +             log_printf (LOGSYS_LEVEL_INFO, "no resources.");
> +             return;
> +     }
> +
> +     /* this will be the system or process level
> +      */
> +     api->object_find_create (
> +             resources_obj,
> +             NULL, 0,
> +             &obj_finder);
> +     while (api->object_find_next (obj_finder,
> +                     &resource_type) == 0) {
> +
> +             api->object_find_create (
> +                     resource_type,
> +                     NULL, 0,
> +                     &obj_finder2);
> +
> +             while (api->object_find_next (obj_finder2,
> +                             &resource) == 0) {
> +
> +                     wd_resource_create (resource);
> +             }
> +             api->object_find_destroy (obj_finder2);
> +
> +             api->object_track_start (resource_type, OBJECT_TRACK_DEPTH_ONE,
> +                     NULL, wd_resource_object_created, NULL,
> +                     NULL, NULL);
> +     }
> +     api->object_find_destroy (obj_finder);
> +}
> +
> +
> +static void watchdog_timeout_apply (uint32_t new)
> +{
> +     struct watchdog_info ident;
> +
> +     if (new<  2) {
> +             watchdog_timeout = 2;
> +     }
> +     else if (new>  120) {
> +             watchdog_timeout = 120;
> +     }
> +     else {
> +             watchdog_timeout = new;
> +     }
> +
> +     if (dog>  0) {
> +             ioctl(dog, WDIOC_GETSUPPORT,&ident);
> +             if (ident.options&  WDIOF_SETTIMEOUT) {
> +                     /* yay! the dog is trained.
> +                      */
> +                     ioctl(dog, WDIOC_SETTIMEOUT,&watchdog_timeout);
> +             }
> +             ioctl(dog, WDIOC_GETTIMEOUT,&watchdog_timeout);
> +     }
> +     tickle_timeout = watchdog_timeout / 2;
> +
> +     log_printf (LOGSYS_LEVEL_DEBUG, "The Watchdog timeout is %d seconds\n", 
> watchdog_timeout);
> +     log_printf (LOGSYS_LEVEL_DEBUG, "The tickle timeout is %d seconds\n", 
> tickle_timeout);
> +}
> +
> +static int setup_watchdog(void)
> +{
> +     struct watchdog_info ident;
> +
> +     ENTER();
> +     if (access ("/dev/watchdog", W_OK) != 0) {
> +             log_printf (LOGSYS_LEVEL_WARNING, "No Watchdog, try modprobe<a 
> watchdog>");
> +             dog = -1;
> +             return -1;
> +     }
> +
> +     /* here goes, lets hope they have "Magic Close"
> +      */
> +     dog = open("/dev/watchdog", O_WRONLY);
> +
> +     if (dog == -1) {
> +             log_printf (LOGSYS_LEVEL_WARNING, "Watchdog exists but couldn't 
> be opened.");
> +             dog = -1;
> +             return -1;
> +     }
> +
> +     /* Right we have the dog.
> +      * Lets see what breed it is.
> +      */
> +
> +     ioctl(dog, WDIOC_GETSUPPORT,&ident);
> +     log_printf (LOGSYS_LEVEL_INFO, "Watchdog is now been tickled by 
> corosync.");
> +     log_printf (LOGSYS_LEVEL_DEBUG, "%s", ident.identity);
> +
> +     watchdog_timeout_apply (watchdog_timeout);
> +
> +     ioctl(dog, WDIOC_SETOPTIONS, WDIOS_ENABLECARD);
> +
> +     return 0;
> +}
> +
> +static void wd_top_level_key_changed(object_change_type_t change_type,
> +     hdb_handle_t parent_object_handle,
> +     hdb_handle_t object_handle,
> +     const void *object_name_pt, size_t object_name_len,
> +     const void *key_name_pt, size_t key_len,
> +     const void *key_value_pt, size_t key_value_len,
> +     void *priv_data_pt)
> +{
> +     uint32_t tmp_value;
> +
> +     ENTER();
> +     if (change_type != OBJECT_KEY_DELETED&&
> +             strncmp ((char*)key_name_pt, "watchdog_timeout", key_value_len) 
> == 0) {
> +             tmp_value = strtol (key_value_pt, NULL, 0);
> +             watchdog_timeout_apply (tmp_value);
> +     }
> +     else {
> +             watchdog_timeout_apply (WD_DEFAULT_TIMEOUT);
> +     }
> +     log_printf (LOGSYS_LEVEL_INFO, "new(%d) tickle_timeout: %d", 
> change_type, tickle_timeout);
> +}
> +
> +
> +static void watchdog_timeout_get_initial (void)
> +{
> +     int32_t res;
> +     char watchdog_timeout_str[32];
> +     size_t watchdog_timeout_len;
> +     objdb_value_types_t watchdog_timeout_type;
> +     uint32_t tmp_value;
> +
> +     ENTER();
> +
> +     res = api->object_key_get_typed (resources_obj,
> +                     "watchdog_timeout",
> +                     (void**)&watchdog_timeout_str,&watchdog_timeout_len,
> +                     &watchdog_timeout_type);
> +     if (res != 0) {
> +             watchdog_timeout_apply (WD_DEFAULT_TIMEOUT);
> +
> +             watchdog_timeout_len = snprintf (watchdog_timeout_str, 32, 
> "%d", watchdog_timeout);
> +             api->object_key_create_typed (resources_obj,
> +                     "watchdog_timeout",&watchdog_timeout_str,
> +                     watchdog_timeout_len,
> +                     OBJDB_VALUETYPE_STRING);
> +     }
> +     else {
> +             tmp_value = strtol (watchdog_timeout_str, NULL, 0);
> +             watchdog_timeout_apply (tmp_value);
> +     }
> +
> +     api->object_track_start (resources_obj, OBJECT_TRACK_DEPTH_ONE,
> +             wd_top_level_key_changed, NULL, NULL,
> +             NULL, NULL);
> +
> +}
> +
> +static int wd_exec_init_fn (
> +     struct corosync_api_v1 *corosync_api)
> +{
> +     hdb_handle_t obj;
> +
> +     ENTER();
> +#ifdef COROSYNC_SOLARIS
> +     logsys_subsys_init();
> +#endif
> +     api = corosync_api;
> +
> +     object_find_or_create (OBJECT_PARENT_HANDLE,
> +             &resources_obj,
> +             "resources", strlen ("resources"));
> +     object_find_or_create (resources_obj,
> +             &obj,
> +             "system", strlen ("system"));
> +     object_find_or_create (resources_obj,
> +             &obj,
> +             "process", strlen ("process"));
> +
> +     watchdog_timeout_get_initial();
> +
> +     setup_watchdog();
> +
> +     wd_scan_resources();
> +
> +     api->timer_add_duration((unsigned long long)tickle_timeout*1000000000, 
> NULL,
> +                             wd_tickle_fn,&wd_timer);
> +
> +     return 0;
> +}
> +
> +static int wd_exec_exit_fn (void)
> +{
> +     char magic = 'V';
> +     ENTER();
> +
> +     if (dog>  0) {
> +             log_printf (LOGSYS_LEVEL_INFO, "magically closing the 
> watchdog.");
> +             write (dog,&magic, 1);
> +     }
> +     return 0;
> +}
> +
> +

_______________________________________________
Openais mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/openais

Reply via email to