Hi,
The current tip of heartbeat causes an assertion failure in
pacemaker-1.0 and generates a core dump:
{{{
Oct 25 17:15:08 srv02 cib: [31333]: ERROR: crm_abort:
crm_glib_handler: Forked child 31338 to record non-fatal assert at
utils.c:449 : g_main_loop_is_running: assertion `loop != NULL' failed
Oct 25 17:15:08 srv02 cib: [31333]: ERROR: crm_abort:
crm_glib_handler: Forked child 31339 to record non-fatal assert at
utils.c:449 : g_main_loop_is_running: assertion `loop != NULL' failed
Oct 25 17:15:11 srv02 crmd: [31337]: ERROR: crm_abort:
crm_glib_handler: Forked child 31341 to record non-fatal assert at
utils.c:449 : g_main_loop_is_running: assertion `loop != NULL' failed
Oct 25 17:15:11 srv02 crmd: [31337]: ERROR: crm_abort:
crm_glib_handler: Forked child 31342 to record non-fatal assert at
utils.c:449 : g_main_loop_is_running: assertion `loop != NULL' failed
}}}
This seems to have been introduced by the following changeset:
http://hg.linux-ha.org/dev/rev/231b0b8555be
The stack trace and my suggested patch are attached.
The changeset in question switched this code to use get_next_random(),
which eventually calls g_main_loop_is_running(). That call may fail
because the g_main_loop is not initialized yet in cib/crmd.
My suggested patch simply restores the old behavior and changes only
the delay, to 50 ms per node.
Thanks,
--
Keisuke MORI
(gdb) where
#0 0x00669410 in __kernel_vsyscall ()
#1 0x00692df0 in raise () from /lib/libc.so.6
#2 0x00694701 in abort () from /lib/libc.so.6
#3 0x00c0d82f in crm_abort (file=0xc26955 "utils.c",
function=0xc26dda "crm_glib_handler", line=449,
assert_condition=0x8933d58 "g_main_loop_is_running: assertion `loop !=
NULL' failed", do_core=1, do_fork=1) at utils.c:1382
#4 0x00c09f05 in crm_glib_handler (log_domain=0x167686 "GLib",
flags=G_LOG_LEVEL_CRITICAL,
message=0x8933d58 "g_main_loop_is_running: assertion `loop != NULL'
failed", user_data=0x0) at utils.c:449
#5 0x00143b67 in g_logv () from /lib/libglib-2.0.so.0
#6 0x00143d39 in g_log () from /lib/libglib-2.0.so.0
#7 0x00143e1b in g_return_if_fail_warning () from /lib/libglib-2.0.so.0
#8 0x0013981b in g_main_loop_is_running () from /lib/libglib-2.0.so.0
#9 0x00880811 in get_more_random () at cl_random.c:95
#10 0x00880945 in cl_init_random () at cl_random.c:128
#11 0x00880644 in gen_a_random () at cl_random.c:68
#12 0x00880896 in get_next_random () at cl_random.c:106
#13 0x00fdbabb in get_clientstatus (lcl=0x8931bd8, host=0x0,
clientid=0x805b779 "cib", timeout=-1) at client_lib.c:974
#14 0x080557ee in cib_init () at main.c:461
#15 0x08054c4b in main (argc=1, argv=0xbfcd6124) at main.c:218
(gdb)
# HG changeset patch
# User Keisuke MORI <[email protected]>
# Date 1288003477 -32400
# Node ID 96b67422b12814f64dc7dd61c670801c7ba213b6
# Parent 82fc843fbcf9733e50bbc169c95e51b6c7f97c54
Medium: reduce max delay in get_client_status (revised 231b0b8555be)
revert the old code to avoid calling g_main_loop_is_running()
which may fail when used in Pacemaker cib/crmd.
diff -r 82fc843fbcf9 -r 96b67422b128 lib/hbclient/client_lib.c
--- a/lib/hbclient/client_lib.c Mon Oct 04 22:12:37 2010 +0200
+++ b/lib/hbclient/client_lib.c Mon Oct 25 19:44:37 2010 +0900
@@ -966,16 +966,6 @@ get_nodesite(ll_cluster_t* lcl, const ch
* Return the status of the given client.
*/
-#ifndef HAVE_CL_RAND_FROM_INTERVAL
-/* you should grab latest glue headers! */
-static inline int cl_rand_from_interval(const int a, const int b)
-{
- /* RAND_MAX may be INT_MAX, or (b-a) may be huge. */
- long long r = get_next_random();
- return a + (r * (b-a) + RAND_MAX/2)/RAND_MAX;
-}
-#endif
-
static const char *
get_clientstatus(ll_cluster_t* lcl, const char *host
, const char *clientid, int timeout)
@@ -1027,8 +1017,9 @@ get_clientstatus(ll_cluster_t* lcl, cons
* in a 100-node cluster, the max delay is 5 seconds
*/
num_nodes = get_num_nodes(lcl);
- max_delay = num_nodes * 50000;
- delay = cl_rand_from_interval(0, max_delay);
+ max_delay = num_nodes * 50000; /* in microsecond*/
+ srand(cl_randseed());
+ delay = (1.0* rand()/RAND_MAX)*max_delay;
if (ANYDEBUG){
cl_log(LOG_DEBUG, "Delaying cstatus request for %d ms", delay/1000);
}
_______________________________________________________
Linux-HA-Dev: [email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev
Home Page: http://linux-ha.org/