> Best application for such test is testcpg.c. If there is really bug,
> can you please create BZ (ideally with way to reproduce, because I'm really
> not able to reproduce such behavior).

I am still waiting for a BZ account, so I am posting it here. The attached
program 'cpgtest' reproduces the problem. Compile with:

# gcc -Wall cpgtest.c $(pkg-config --cflags --libs libcpg libcoroipcc) -o cpgtest

It executes a simple loop:

start:
  cpg_initialize
  cpg_join
  cpg_dispatch
  send one message in confchg_callback
  cpg_finalize after receiving that message
  goto start
 
When I run that, it executes several successful iterations, but sometimes
the join fails:

# cpgtest
...
starting cpgtest
calling cpg_initialize
calling cpg_join
cpg_join failed: 14

And worse, sometimes it hangs in the main loop:

# cpgtest
...
starting cpgtest
calling cpg_initialize
calling cpg_join
starting main loop (hangs here)

When that happens, I abort with CTRL-C. After that, a stale CPG member
is left behind. After several runs I get:

# corosync-cpgtool
TESTGROUP\x00
                      4610               3 (192.168.2.8)
                     27678               3 (192.168.2.8)
                     21828               3 (192.168.2.8)
                     16841               3 (192.168.2.8)
                     10901               3 (192.168.2.8)
                     10773               3 (192.168.2.8)
                     10496               3 (192.168.2.8)
                      9866               3 (192.168.2.8)
                      8552               3 (192.168.2.8)
                      7439               3 (192.168.2.8)
                      6782               3 (192.168.2.8)

Not a single one of those PIDs exists! I am currently running Debian squeeze,
kernel 2.6.32 and corosync 1.2.0.

Is somebody able to reproduce that issue?

- Dietmar

> Regards,
>   Honza
> 
> Dietmar Maurer wrote:
> > Just found the following commit:
> >
> >
> http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;
> h=bcc5fdef8473d99399c624a7bc15423a2af645c1
> >
> > The problematic test case looks very similar to my tests - maybe that
> problem still exists?
> >
> >> It's strange, but the problem only occurs when fencing is involved,
> >> and cman kills a node. I will try to write a minimal CPG application
> >> which
> >> triggers that bug.
> >>
> >> btw, can a memory corruption inside my application cause such
> behavior?
> >>
> >> - Dietmar
> >>
> >>> Dietmar,
> >>> process *should* be removed after IPC is finished.
> >>>
> >>> Maybe it is bug. Do you have any reproduces?
> >>>
> >>> Thanks,
> >>>   Honza
> >>>
> >>> Dietmar Maurer wrote:
> >>>>> Inside my CPG application, The confchg callback is called with
> >>> 'dead'
> >>>>> members:
> >>>>>
> >>>>> [debug] cpg member node 3 pid 1132
> >>>>> [debug] cpg member node 3 pid 14640
> >>>>>
> >>>>> for example process 1132 does not exists any longer on node 3.
> Any
> >>> idea
> >>>>> what
> >>>>> can cause such 'ghost' entries?
> >>>> If I run corosync-cpgtool on the node I get:
> >>>>
> >>>>> # corosync-cpgtool
> >>>>> Group Name             PID         Node ID
> >>>>> mygroup
> >>>>>                       1132               3 (192.168.2.8)
> >>>>>                      14887               3 (192.168.2.8)
> >>>> But process 1132 does not exists? How can that happen? I thought a
> >>> process
> >>>> is automatically removed from the CPG member list if it exits (or
> >>> crash)?
> >>>> - Dietmar
> >>>>
> >>>> _______________________________________________
> >>>> Openais mailing list
> >>>> [email protected]
> >>>> https://lists.linux-foundation.org/mailman/listinfo/openais
> >
> >
> 

/*
  Copyright (C) 2009 Proxmox Server Solutions GmbH

  Copyright: This program is under GNU GPL, the GNU General Public License.

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; version 2 dated June, 1991.
  
  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  02111-1307, USA.

  Author: Dietmar Maurer <[email protected]>

*/

#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/select.h>

#include <corosync/corotypes.h>
#include <corosync/cpg.h>

static int cpg_mode_leave;

/*
 * CPG deliver callback (registered as cpg_deliver_fn).
 *
 * Logs the sender of the received message and sets cpg_mode_leave so that
 * the dispatch loop in main() stops after this message.
 *
 * handle, groupName, msg and msg_len are not used.
 */
static void my_cpg_deliver_callback (
        cpg_handle_t handle,
        const struct cpg_name *groupName,
        uint32_t nodeid,
        uint32_t pid,
        void *msg,
        size_t msg_len)
{
        (void)handle;
        (void)groupName;
        (void)msg;
        (void)msg_len;

        /* nodeid and pid are unsigned 32-bit values: print them as such. */
        printf("got message from %u/%u\n", (unsigned int)nodeid,
               (unsigned int)pid);

        cpg_mode_leave = 1;
}

/*
 * CPG configuration-change callback (registered as cpg_confchg_fn).
 *
 * Prints the number of joined/left/current members and every current member
 * as "nodeid/pid", then multicasts one fixed test message to the group,
 * retrying every millisecond for as long as the library answers
 * CPG_ERR_TRY_AGAIN.
 *
 * groupName, left_list and joined_list are not used.
 *
 * NOTE(review): cpg_error_t / CPG_ERR_TRY_AGAIN are kept as in the original;
 * main() uses the cs_error_t / CS_ERR_* spellings - presumably these are
 * aliases, consider unifying after confirming against the installed headers.
 */
static void my_cpg_confchg_callback (
        cpg_handle_t handle,
        const struct cpg_name *groupName,
        const struct cpg_address *member_list, size_t member_list_entries,
        const struct cpg_address *left_list, size_t left_list_entries,
        const struct cpg_address *joined_list, size_t joined_list_entries)
{
        (void)groupName;
        (void)left_list;
        (void)joined_list;

        /* The counts are size_t, so %zu is the matching conversion. */
        printf("cpg_confchg_callback %zu joined, %zu left, %zu members\n",
               joined_list_entries, left_list_entries, member_list_entries);

        for (size_t i = 0; i < member_list_entries; i++) {
                printf("cpg member %u/%u\n",
                       (unsigned int)member_list[i].nodeid,
                       (unsigned int)member_list[i].pid);
        }

        /*
         * Send the update message. A writable array is used because
         * iov_base is a non-const pointer; sizeof includes the trailing
         * NUL, matching the original strlen()+1 payload length.
         */
        char payload[] = "This is jus a test message\n";
        struct iovec iov;
        iov.iov_base = payload;
        iov.iov_len = sizeof payload;

        cpg_error_t result;
        while ((result = cpg_mcast_joined(handle, CPG_TYPE_AGREED, &iov, 1))
               == CPG_ERR_TRY_AGAIN) {
                usleep(1000);
                printf("cpg_send_message retry\n");
        }

        if (result != CS_OK) {
                printf("cpg_send_message failed: %d\n", (int)result);
        }
}

/*
 * Callback table passed to cpg_initialize() in main(): one handler for
 * membership changes and one for delivered messages.
 */
static cpg_callbacks_t callbacks = {
        .cpg_confchg_fn = my_cpg_confchg_callback,
        .cpg_deliver_fn = my_cpg_deliver_callback,
};


/*
 * Join group_name on an already initialized handle, then dispatch CPG events
 * until the deliver callback sets cpg_mode_leave or select()/cpg_dispatch()
 * fails. Terminates the process if cpg_join() fails with anything other than
 * CS_ERR_TRY_AGAIN. Returns without dispatching if no descriptor can be
 * obtained for the handle.
 */
static void join_and_dispatch(cpg_handle_t handle, struct cpg_name *group_name)
{
        cs_error_t result;
        int cpg_fd;

        printf("calling cpg_join\n");
        while ((result = cpg_join(handle, group_name)) == CS_ERR_TRY_AGAIN) {
                printf("cpg_join returned CS_ERR_TRY_AGAIN\n");
                sleep(1);
        }
        if (result != CS_OK) {
                printf("cpg_join failed: %d\n", (int)result);
                exit(-1);
        }

        /* Without a valid descriptor select() below would use garbage. */
        result = cpg_fd_get(handle, &cpg_fd);
        if (result != CS_OK) {
                printf("cpg_fd_get failed: %d\n", (int)result);
                return;
        }

        printf("starting main loop\n");

        for (;;) {
                /* select() modifies both the set and the timeout: rebuild. */
                fd_set read_fds;
                FD_ZERO(&read_fds);
                FD_SET(cpg_fd, &read_fds);
                struct timeval timeout = { 1, 0 };

                /*
                 * Keep select()'s int result separate from cs_error_t so the
                 * -1 comparison does not depend on the enum's signedness.
                 */
                int nready = select(cpg_fd + 1, &read_fds, NULL, NULL,
                                    &timeout);
                if (nready == -1) {
                        /* Report errno; the return value itself is always -1. */
                        printf("select error: %d\n", errno);
                        break;
                }

                if (nready > 0 && FD_ISSET(cpg_fd, &read_fds)) {
                        cs_error_t res = cpg_dispatch(handle,
                                                      CPG_DISPATCH_ALL);
                        if (res != CS_OK) {
                                printf("cpg_dispatch failed: %d\n", (int)res);
                                break;
                        }
                }

                if (cpg_mode_leave)
                        break;
        }
}

/*
 * Endless stress loop: cpg_initialize, cpg_join, dispatch until one message
 * has been delivered, cpg_finalize, and start over. Never returns; the
 * process exits with status -1 if cpg_join or cpg_finalize fails.
 *
 * argc and argv are not used.
 */
int main(int argc, char *argv[])
{
        (void)argc;
        (void)argv;

        /*
         * Zero the whole name so no uninitialized bytes are handed to the
         * library. The length deliberately still counts the terminating NUL,
         * as in the original, so the program keeps joining the same group.
         * NOTE(review): presumably this is why corosync-cpgtool lists the
         * group as "TESTGROUP\x00" - confirm before changing it.
         */
        static const char gn[] = "TESTGROUP";
        struct cpg_name group_name;
        memset(&group_name, 0, sizeof group_name);
        memcpy(group_name.value, gn, sizeof gn);
        group_name.length = sizeof gn;

        for (;;) {
                cs_error_t result;
                cpg_handle_t handle = 0;

                printf("starting cpgtest\n");

                cpg_mode_leave = 0;

                printf("calling cpg_initialize\n");
                result = cpg_initialize(&handle, &callbacks);
                if (result != CS_OK) {
                        printf("cpg_initialize failed: %d\n", (int)result);
                } else {
                        join_and_dispatch(handle, &group_name);
                }

                printf("end loop - trying to restart\n");

                usleep (1000);

                /* handle is still 0 when cpg_initialize did not set it. */
                if (handle) {
                        result = cpg_finalize(handle);
                        if (result != CS_OK) {
                                printf("cpg_finalize failed: %d\n",
                                       (int)result);
                                exit(-1);
                        }
                }
        }
}
_______________________________________________
Openais mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/openais

Reply via email to