Hello community,

here is the log from the commit of package corosync for openSUSE:Factory 
checked in at 2017-11-20 17:02:19
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/corosync (Old)
 and      /work/SRC/openSUSE:Factory/.corosync.new (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Package is "corosync"

Mon Nov 20 17:02:19 2017 rev:47 rq:542519 version:2.4.3

Changes:
--------
--- /work/SRC/openSUSE:Factory/corosync/corosync.changes        2017-11-10 
14:43:03.083203378 +0100
+++ /work/SRC/openSUSE:Factory/.corosync.new/corosync.changes   2017-11-20 
17:02:27.940150543 +0100
@@ -1,0 +2,6 @@
+Fri Nov 17 02:54:23 UTC 2017 - [email protected]
+
+- corosync race condition when node leaves immediately after 
joining(bsc#1067958)
+    Added: 0007-sync-Call-sync_init-of-all-services-at-once.patch
+
+-------------------------------------------------------------------

New:
----
  0007-sync-Call-sync_init-of-all-services-at-once.patch

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ corosync.spec ++++++
--- /var/tmp/diff_new_pack.HFysBh/_old  2017-11-20 17:02:29.192105212 +0100
+++ /var/tmp/diff_new_pack.HFysBh/_new  2017-11-20 17:02:29.196105067 +0100
@@ -63,6 +63,7 @@
 Patch11:        0004-mark-corosync-as-a-static-service.patch
 Patch12:        0005-do-not-convert-empty-uid-gid-to-0.patch
 Patch13:        0006-Fix-compile-warnings-with-GCC-7.2.1.patch
+Patch14:        0007-sync-Call-sync_init-of-all-services-at-once.patch
 
 BuildRoot:      %{_tmppath}/%{name}-%{version}-build
 # openais is indeed gone and should be uninstalled. Yes, we do not
@@ -141,6 +142,7 @@
 %patch11 -p1
 %patch12 -p1
 %patch13 -p1
+%patch14 -p1
 
 %build
 %if %{with runautogen}

++++++ 0007-sync-Call-sync_init-of-all-services-at-once.patch ++++++
This patch solves situation which can happen very rearly:
- Node B is running
- Node A is started and tries to create singleton membership. It also
  initialize service S which tries to send message during initialization
- Just before node A finished move to operational state, it gets
  Node B multicast message so moves to gather state
- Node A and B creates membership and moves to operational state and
  sync is started
- Node A and B receives message sent by node A during initialization of
  service S
- Node A exits before sync of service is finished

In this situation, node B may never execute sync_init for
service S. So node B service S is not aware of existence of node A but
it received message from it.

Similar situation can theoretically also happen during merge.

Solution is to change flow of sync, so now it looks like:

- Build service_list
- Call sync_init for all local services
- Send service_list
- Receive service_list from all members and send barier
- For all services:
  - Receive barier
  - Call sync_activate if this is not first service
  - Call sync_process for next service or finish sync if previous
    this service is the last one
  - Send barier

---
 exec/sync.c | 174 +++++++++++++++---------------------------------------------
 1 file changed, 44 insertions(+), 130 deletions(-)

diff --git a/exec/sync.c b/exec/sync.c
index 283634a8..1efa3577 100644
--- a/exec/sync.c
+++ b/exec/sync.c
@@ -62,10 +62,8 @@ LOGSYS_DECLARE_SUBSYS ("SYNC");
 
 #define MESSAGE_REQ_SYNC_BARRIER 0
 #define MESSAGE_REQ_SYNC_SERVICE_BUILD 1
-#define MESSAGE_REQ_SYNC_MEMB_DETERMINE 2
 
 enum sync_process_state {
-       INIT,
        PROCESS,
        ACTIVATE
 };
@@ -96,11 +94,6 @@ struct processor_entry {
        int received;
 };
 
-struct req_exec_memb_determine_message {
-       struct qb_ipc_request_header header __attribute__((aligned(8)));
-       struct memb_ring_id ring_id __attribute__((aligned(8)));
-};
-
 struct req_exec_service_build_message {
        struct qb_ipc_request_header header __attribute__((aligned(8)));
        struct memb_ring_id ring_id __attribute__((aligned(8)));
@@ -117,14 +110,6 @@ static enum sync_state my_state = SYNC_BARRIER;
 
 static struct memb_ring_id my_ring_id;
 
-static struct memb_ring_id my_memb_determine_ring_id;
-
-static int my_memb_determine = 0;
-
-static unsigned int my_memb_determine_list[PROCESSOR_COUNT_MAX];
-
-static unsigned int my_memb_determine_list_entries = 0;
-
 static int my_processing_idx = 0;
 
 static hdb_handle_t my_schedwrk_handle;
@@ -157,6 +142,8 @@ static int schedwrk_processor (const void *context);
 
 static void sync_process_enter (void);
 
+static void sync_process_call_init (void);
+
 static struct totempg_group sync_group = {
     .group      = "sync",
     .group_len  = 4
@@ -234,7 +221,6 @@ static void sync_barrier_handler (unsigned int nodeid, 
const void *msg)
 
                my_processing_idx += 1;
                if (my_service_list_entries == my_processing_idx) {
-                       my_memb_determine_list_entries = 0;
                        sync_synchronization_completed ();
                } else {
                        sync_process_enter ();
@@ -242,15 +228,6 @@ static void sync_barrier_handler (unsigned int nodeid, 
const void *msg)
        }
 }
 
-static void dummy_sync_init (
-       const unsigned int *trans_list,
-       size_t trans_list_entries,
-       const unsigned int *member_list,
-       size_t member_list_entries,
-       const struct memb_ring_id *ring_id)
-{
-}
-
 static void dummy_sync_abort (void)
 {
 }
@@ -272,31 +249,6 @@ static int service_entry_compare (const void *a, const 
void *b)
        return (service_entry_a->service_id > service_entry_b->service_id);
 }
 
-static void sync_memb_determine (unsigned int nodeid, const void *msg)
-{
-       const struct req_exec_memb_determine_message 
*req_exec_memb_determine_message = msg;
-       int found = 0;
-       int i;
-
-       if (memcmp (&req_exec_memb_determine_message->ring_id,
-               &my_memb_determine_ring_id, sizeof (struct memb_ring_id)) != 0) 
{
-
-               log_printf (LOGSYS_LEVEL_DEBUG, "memb determine for old ring - 
discarding");
-               return;
-       }
-
-       my_memb_determine = 1;
-       for (i = 0; i < my_memb_determine_list_entries; i++) {
-               if (my_memb_determine_list[i] == nodeid) {
-                       found = 1;
-               }
-       }
-       if (found == 0) {
-               my_memb_determine_list[my_memb_determine_list_entries] = nodeid;
-               my_memb_determine_list_entries += 1;
-       }
-}
-
 static void sync_service_build_handler (unsigned int nodeid, const void *msg)
 {
        const struct req_exec_service_build_message 
*req_exec_service_build_message = msg;
@@ -321,15 +273,14 @@ static void sync_service_build_handler (unsigned int 
nodeid, const void *msg)
                        }
                }
                if (found == 0) {
-                       my_service_list[my_service_list_entries].state =
-                               INIT;
+                       my_service_list[my_service_list_entries].state = 
PROCESS;
                        my_service_list[my_service_list_entries].service_id =
                                req_exec_service_build_message->service_list[i];
                        sprintf (my_service_list[my_service_list_entries].name,
                                "Unknown External Service (id = %d)\n",
                                
req_exec_service_build_message->service_list[i]);
                        my_service_list[my_service_list_entries].sync_init =
-                               dummy_sync_init;
+                               NULL;
                        my_service_list[my_service_list_entries].sync_abort =
                                dummy_sync_abort;
                        my_service_list[my_service_list_entries].sync_process =
@@ -356,6 +307,7 @@ static void sync_service_build_handler (unsigned int 
nodeid, const void *msg)
                }
        }
        if (barrier_reached) {
+               log_printf (LOGSYS_LEVEL_DEBUG, "enter sync process");
                sync_process_enter ();
        }
 }
@@ -375,31 +327,9 @@ static void sync_deliver_fn (
                case MESSAGE_REQ_SYNC_SERVICE_BUILD:
                        sync_service_build_handler (nodeid, msg);
                        break;
-               case MESSAGE_REQ_SYNC_MEMB_DETERMINE:
-                       sync_memb_determine (nodeid, msg);
-                       break;
        }
 }
 
-static void memb_determine_message_transmit (void)
-{
-       struct iovec iovec;
-       struct req_exec_memb_determine_message req_exec_memb_determine_message;
-
-       req_exec_memb_determine_message.header.size = sizeof (struct 
req_exec_memb_determine_message);
-       req_exec_memb_determine_message.header.id = 
MESSAGE_REQ_SYNC_MEMB_DETERMINE;
-
-       memcpy (&req_exec_memb_determine_message.ring_id,
-               &my_memb_determine_ring_id,
-               sizeof (struct memb_ring_id));
-
-       iovec.iov_base = (char *)&req_exec_memb_determine_message;
-       iovec.iov_len = sizeof (req_exec_memb_determine_message);
-
-       (void)totempg_groups_mcast_joined (sync_group_handle,
-               &iovec, 1, TOTEMPG_AGREED);
-}
-
 static void barrier_message_transmit (void)
 {
        struct iovec iovec;
@@ -441,6 +371,38 @@ static void sync_barrier_enter (void)
        barrier_message_transmit ();
 }
 
+static void sync_process_call_init (void)
+{
+       unsigned int old_trans_list[PROCESSOR_COUNT_MAX];
+       size_t old_trans_list_entries = 0;
+       int o, m;
+       int i;
+
+       memcpy (old_trans_list, my_trans_list, my_trans_list_entries *
+               sizeof (unsigned int));
+       old_trans_list_entries = my_trans_list_entries;
+
+       my_trans_list_entries = 0;
+       for (o = 0; o < old_trans_list_entries; o++) {
+               for (m = 0; m < my_member_list_entries; m++) {
+                       if (old_trans_list[o] == my_member_list[m]) {
+                               my_trans_list[my_trans_list_entries] = 
my_member_list[m];
+                               my_trans_list_entries++;
+                               break;
+                       }
+               }
+       }
+
+       for (i = 0; i < my_service_list_entries; i++) {
+               if (my_sync_callbacks_retrieve(my_service_list[i].service_id, 
NULL) != -1) {
+                       my_service_list[i].sync_init (my_trans_list,
+                               my_trans_list_entries, my_member_list,
+                               my_member_list_entries,
+                               &my_ring_id);
+               }
+       }
+}
+
 static void sync_process_enter (void)
 {
        int i;
@@ -452,13 +414,13 @@ static void sync_process_enter (void)
         */
        if (my_service_list_entries == 0) {
                my_state = SYNC_SERVICELIST_BUILD;
-               my_memb_determine_list_entries = 0;
                sync_synchronization_completed ();
                return;
        }
        for (i = 0; i < my_processor_list_entries; i++) {
                my_processor_list[i].received = 0;
        }
+
        schedwrk_create (&my_schedwrk_handle,
                schedwrk_processor,
                NULL);
@@ -498,7 +460,7 @@ static void sync_servicelist_build_enter (
                if (sync_callbacks.sync_init == NULL) {
                        continue;
                }
-               my_service_list[my_service_list_entries].state = INIT;
+               my_service_list[my_service_list_entries].state = PROCESS;
                my_service_list[my_service_list_entries].service_id = i;
                strcpy (my_service_list[my_service_list_entries].name,
                        sync_callbacks.name);
@@ -516,42 +478,16 @@ static void sync_servicelist_build_enter (
        service_build.service_list_entries = my_service_list_entries;
 
        service_build_message_transmit (&service_build);
+
+       log_printf (LOGSYS_LEVEL_DEBUG, "call init for locally known services");
+       sync_process_call_init ();
 }
 
 static int schedwrk_processor (const void *context)
 {
        int res = 0;
 
-       if (my_service_list[my_processing_idx].state == INIT) {
-               unsigned int old_trans_list[PROCESSOR_COUNT_MAX];
-               size_t old_trans_list_entries = 0;
-               int o, m;
-               my_service_list[my_processing_idx].state = PROCESS;
-
-               memcpy (old_trans_list, my_trans_list, my_trans_list_entries *
-                       sizeof (unsigned int));
-               old_trans_list_entries = my_trans_list_entries;
-
-               my_trans_list_entries = 0;
-               for (o = 0; o < old_trans_list_entries; o++) {
-                       for (m = 0; m < my_member_list_entries; m++) {
-                               if (old_trans_list[o] == my_member_list[m]) {
-                                       my_trans_list[my_trans_list_entries] = 
my_member_list[m];
-                                       my_trans_list_entries++;
-                                       break;
-                               }
-                       }
-               }
-
-               if 
(my_sync_callbacks_retrieve(my_service_list[my_processing_idx].service_id, 
NULL) != -1) {
-                       my_service_list[my_processing_idx].sync_init 
(my_trans_list,
-                               my_trans_list_entries, my_member_list,
-                               my_member_list_entries,
-                               &my_ring_id);
-               }
-       }
        if (my_service_list[my_processing_idx].state == PROCESS) {
-               my_service_list[my_processing_idx].state = PROCESS;
                if 
(my_sync_callbacks_retrieve(my_service_list[my_processing_idx].service_id, 
NULL) != -1) {
                        res = my_service_list[my_processing_idx].sync_process 
();
                } else {
@@ -574,14 +510,8 @@ void sync_start (
        ENTER();
        memcpy (&my_ring_id, ring_id, sizeof (struct memb_ring_id));
 
-       if (my_memb_determine) {
-               my_memb_determine = 0;
-               sync_servicelist_build_enter (my_memb_determine_list,
-                       my_memb_determine_list_entries, ring_id);
-       } else {
-               sync_servicelist_build_enter (member_list, member_list_entries,
-                       ring_id);
-       }
+       sync_servicelist_build_enter (member_list, member_list_entries,
+               ring_id);
 }
 
 void sync_save_transitional (
@@ -610,19 +540,3 @@ void sync_abort (void)
         */
        memset (&my_ring_id, 0, sizeof (struct memb_ring_id));
 }
-
-void sync_memb_list_determine (const struct memb_ring_id *ring_id)
-{
-       ENTER();
-       memcpy (&my_memb_determine_ring_id, ring_id,
-               sizeof (struct memb_ring_id));
-
-       memb_determine_message_transmit ();
-}
-
-void sync_memb_list_abort (void)
-{
-       ENTER();
-       my_memb_determine_list_entries = 0;
-       memset (&my_memb_determine_ring_id, 0, sizeof (struct memb_ring_id));
-}
-- 
2.13.6


Reply via email to