> Best application for such test is testcpg.c. If there is really bug,
> can you please create BZ (ideally with way to reproduce, because I'm really
> not able to reproduce such behavior).
I am still waiting for a BZ account, so I am posting it here. The attached
program 'cpgtest' reproduces the problem. Compile with:
# gcc -Wall cpgtest.c $(pkg-config --cflags --libs libcpg libcoroipcc) -o
cpgtest
It executes a simple loop:
start:
cpg_initialize
cpg_join
cpg_dispatch
send one message in confchg_callback
cpg_finalize after receiving that message
goto start
When I run it, it executes several successful iterations, but sometimes
the join fails:
# cpgtest
...
starting cpgtest
calling cpg_initialize
calling cpg_join
cpg_join failed: 14
And worse, sometimes it hangs in the main loop:
# cpgtest
...
starting cpgtest
calling cpg_initialize
calling cpg_join
starting main loop (hangs here)
When that happens, I abort with CTRL-C. After that, a stale CPG member
is left behind. After several runs I get:
# corosync-cpgtool
TESTGROUP\x00
4610 3 (192.168.2.8)
27678 3 (192.168.2.8)
21828 3 (192.168.2.8)
16841 3 (192.168.2.8)
10901 3 (192.168.2.8)
10773 3 (192.168.2.8)
10496 3 (192.168.2.8)
9866 3 (192.168.2.8)
8552 3 (192.168.2.8)
7439 3 (192.168.2.8)
6782 3 (192.168.2.8)
Not a single one of those PIDs exists! I am currently running Debian squeeze,
kernel 2.6.32 and corosync 1.2.0.
Is somebody able to reproduce that issue?
- Dietmar
> Regards,
> Honza
>
> Dietmar Maurer wrote:
> > Just found the following commit:
> >
> >
> http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;
> h=bcc5fdef8473d99399c624a7bc15423a2af645c1
> >
> > The problematic test case looks very similar to my tests - maybe that
> problem still exists?
> >
> >> It's strange, but the problem only occurs when fencing is involved,
> >> and cman kills a node. I will try to write a minimal CPG application
> >> which
> >> triggers that bug.
> >>
> >> btw, can a memory corruption inside my application cause such
> behavior?
> >>
> >> - Dietmar
> >>
> >>> Dietmar,
> >>> process *should* be removed after IPC is finished.
> >>>
> >>> Maybe it is bug. Do you have any reproduces?
> >>>
> >>> Thanks,
> >>> Honza
> >>>
> >>> Dietmar Maurer wrote:
> >>>>> Inside my CPG application, The confchg callback is called with
> >>> 'dead'
> >>>>> members:
> >>>>>
> >>>>> [debug] cpg member node 3 pid 1132
> >>>>> [debug] cpg member node 3 pid 14640
> >>>>>
> >>>>> for example process 1132 does not exists any longer on node 3.
> Any
> >>> idea
> >>>>> what
> >>>>> can cause such 'ghost' entries?
> >>>> If I run corosync-cpgtool on the node I get:
> >>>>
> >>>>> # corosync-cpgtool
> >>>>> Group Name PID Node ID
> >>>>> mygroup
> >>>>> 1132 3 (192.168.2.8)
> >>>>> 14887 3 (192.168.2.8)
> >>>> But process 1132 does not exists? How can that happen? I thought a
> >>> process
> >>>> is automatically removed from the CPG member list if it exits (or
> >>> crash)?
> >>>> - Dietmar
> >>>>
> >>>> _______________________________________________
> >>>> Openais mailing list
> >>>> [email protected]
> >>>> https://lists.linux-foundation.org/mailman/listinfo/openais
> >
> >
>
/*
Copyright (C) 2009 Proxmox Server Solutions GmbH
Copyright: This program is under GNU GPL, the GNU General Public License.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 dated June, 1991.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
02111-1307, USA.
Author: Dietmar Maurer <[email protected]>
*/
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <sys/select.h>
#include <unistd.h>

#include <corosync/corotypes.h>
#include <corosync/cpg.h>
/*
 * Flag shared between the deliver callback and main(): the deliver callback
 * sets it to 1 when a message arrives, main() resets it to 0 at the start of
 * each iteration and stops dispatching once it is non-zero.
 */
static int cpg_mode_leave;
/*
 * CPG deliver callback.
 *
 * Logs the sender (node id / pid) of the delivered message and raises
 * cpg_mode_leave. The message payload itself is not inspected.
 *
 * handle, groupName, msg and msg_len are part of the callback signature
 * but are not used here.
 */
static void my_cpg_deliver_callback (
	cpg_handle_t handle,
	const struct cpg_name *groupName,
	uint32_t nodeid,
	uint32_t pid,
	void *msg,
	size_t msg_len)
{
	(void)handle;
	(void)groupName;
	(void)msg;
	(void)msg_len;

	/*
	 * nodeid and pid are unsigned 32-bit values; print them as unsigned
	 * so that ids with the top bit set do not show up as negative numbers.
	 */
	printf("got message form %u/%u\n",
	       (unsigned int)nodeid, (unsigned int)pid);

	cpg_mode_leave = 1;
}
/*
 * CPG configuration-change callback.
 *
 * Prints the number of joined/left/current members, lists every current
 * member as "nodeid/pid", and then multicasts one test message to the group
 * with cpg_mcast_joined(). The send is retried (with a 1 ms pause) for as
 * long as the library answers CS_ERR_TRY_AGAIN; any other failure is only
 * logged.
 *
 * groupName, left_list and joined_list are not used beyond the entry counts.
 */
static void my_cpg_confchg_callback (
	cpg_handle_t handle,
	const struct cpg_name *groupName,
	const struct cpg_address *member_list, size_t member_list_entries,
	const struct cpg_address *left_list, size_t left_list_entries,
	const struct cpg_address *joined_list, size_t joined_list_entries)
{
	/* Writable copy: iov_base is a non-const pointer. */
	char inbuf[] = "This is jus a test message\n";
	struct iovec iov;
	cs_error_t result;
	size_t i;

	(void)groupName;
	(void)left_list;
	(void)joined_list;

	/* The counts are size_t, so they need %zu rather than %ld. */
	printf("cpg_confchg_callback %zu joined, %zu left, %zu members\n",
	       joined_list_entries, left_list_entries, member_list_entries);

	for (i = 0; i < member_list_entries; i++) {
		printf("cpg member %u/%u\n",
		       (unsigned int)member_list[i].nodeid,
		       (unsigned int)member_list[i].pid);
	}

	/* send update message (including the terminating NUL byte) */
	iov.iov_base = inbuf;
	iov.iov_len = sizeof inbuf;

	while ((result = cpg_mcast_joined(handle, CPG_TYPE_AGREED, &iov, 1))
	       == CS_ERR_TRY_AGAIN) {
		usleep(1000);
		printf("cpg_send_message retry\n");
	}

	if (result != CS_OK) {
		printf("cpg_send_message failed: %d\n", result);
	}
}
/*
 * Callback table handed to cpg_initialize(): one handler for membership
 * (configuration) changes and one for delivered messages.
 */
static cpg_callbacks_t callbacks = {
	.cpg_confchg_fn = my_cpg_confchg_callback,
	.cpg_deliver_fn = my_cpg_deliver_callback,
};
/*
 * Dispatch CPG events on `handle` until the deliver callback raises
 * cpg_mode_leave, or until select()/cpg_dispatch() reports an error.
 *
 * Each select() call waits at most one second; a timeout simply starts the
 * next wait, so this function does not return while nothing is delivered.
 */
static void run_dispatch_loop(cpg_handle_t handle)
{
	int cpg_fd = -1;
	cs_error_t result;

	result = cpg_fd_get(handle, &cpg_fd);
	if (result != CS_OK) {
		printf("cpg_fd_get failed: %d\n", result);
		return;
	}

	printf("starting main loop\n");

	do {
		fd_set read_fds;
		struct timeval timeout = { 1, 0 };
		int nready;

		/* select() modifies the set and the timeout: rebuild both. */
		FD_ZERO(&read_fds);
		FD_SET(cpg_fd, &read_fds);

		nready = select(cpg_fd + 1, &read_fds, NULL, NULL, &timeout);
		if (nready == -1) {
			if (errno == EINTR) {
				/* interrupted by a signal, just wait again */
				continue;
			}
			printf("select error: %s\n", strerror(errno));
			break;
		}

		if (nready > 0 && FD_ISSET(cpg_fd, &read_fds)) {
			cs_error_t res = cpg_dispatch(handle, CPG_DISPATCH_ALL);

			if (res != CS_OK) {
				printf("cpg_dispatch failed: %d\n", res);
				break;
			}
		}
	} while (!cpg_mode_leave);
}

/*
 * Test driver. Repeats forever:
 *   cpg_initialize -> cpg_join -> dispatch until one message was delivered
 *   -> cpg_finalize.
 *
 * The process terminates (exit status -1) only when cpg_join fails with
 * something other than CS_ERR_TRY_AGAIN or when cpg_finalize fails; a failed
 * cpg_initialize is logged and the cycle is retried. Otherwise it runs until
 * it is killed. argc/argv are not used.
 */
int main(int argc, char *argv[])
{
	const char *gn = "TESTGROUP";
	struct cpg_name group_name;

	(void)argc;
	(void)argv;

	/* Do not hand uninitialized stack bytes to the library. */
	memset(&group_name, 0, sizeof group_name);
	strcpy(group_name.value, gn);
	/* The length deliberately counts the terminating NUL, as before. */
	group_name.length = strlen(gn) + 1;

	for (;;) {
		cpg_handle_t handle = 0;
		cs_error_t result;

		printf("starting cpgtest\n");
		cpg_mode_leave = 0;

		printf("calling cpg_initialize\n");
		result = cpg_initialize(&handle, &callbacks);
		if (result != CS_OK) {
			printf("cpg_initialize failed: %d\n", result);
		} else {
			printf("calling cpg_join\n");
			while ((result = cpg_join(handle, &group_name))
			       == CS_ERR_TRY_AGAIN) {
				printf("cpg_join returned CS_ERR_TRY_AGAIN\n");
				sleep(1);
			}

			if (result != CS_OK) {
				printf("cpg_join failed: %d\n", result);
				/* best effort: release the handle before exiting */
				(void)cpg_finalize(handle);
				exit(-1);
			}

			run_dispatch_loop(handle);
		}

		printf("end loop - trying to restart\n");
		usleep(1000);

		if (handle) {
			result = cpg_finalize(handle);
			if (result != CS_OK) {
				printf("cpg_finalize failed: %d\n", result);
				exit(-1);
			}
		}
	}
}
_______________________________________________
Openais mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/openais