During cluster start, one node (node 1) broadcast up msg to other node. The remote node (node 2) get this msg and send the connection to node 1 (connect()). Similarly node 1 send the connection to node 2 after node 2 broadcast up msg to. Beside of node 2 connect() to node 1, node 2 also add the IP and ID info of node 1 to database. But before of that, node 2 may also accept the connection that come from node 1. The acception is also add node ID of node 1. So there is 2 times adding the node ID info of node 1 to database in node 2. This causes the socket connection is closed and node is restart again.
The patch change to retrieve node from database by node IP instead node ID in processing connection. This will reject the double of establishing connection between 2 nodes and also double of adding node IP to database. --- src/dtm/dtmnd/dtm.h | 11 ++++-- src/dtm/dtmnd/dtm_inter_trans.cc | 3 +- src/dtm/dtmnd/dtm_node.cc | 2 +- src/dtm/dtmnd/dtm_node_db.cc | 79 ++++++++++++++++++++++++--------------- src/dtm/dtmnd/dtm_node_sockets.cc | 20 ++++++---- 5 files changed, 72 insertions(+), 43 deletions(-) diff --git a/src/dtm/dtmnd/dtm.h b/src/dtm/dtmnd/dtm.h index 28c811e65..a06b8f503 100644 --- a/src/dtm/dtmnd/dtm.h +++ b/src/dtm/dtmnd/dtm.h @@ -45,6 +45,11 @@ typedef enum { DTM_MBX_MSG_TYPE = 5, } MBX_POST_TYPES; +typedef enum { + DTM_NODE_ID_KEY_TYPE = 0, + DTM_NODE_IP_KEY_TYPE = 2, +} KEY_TYPES; + typedef struct dtm_rcv_msg_elem { void *next; MBX_POST_TYPES type; @@ -99,10 +104,10 @@ typedef struct dtm_snd_msg_elem { extern void node_discovery_process(void *arg); extern uint32_t dtm_cb_init(DTM_INTERNODE_CB *dtms_cb); -extern DTM_NODE_DB *dtm_node_get_by_id(uint32_t nodeid); +extern DTM_NODE_DB *dtm_node_get(uint8_t *key, KEY_TYPES type); extern DTM_NODE_DB *dtm_node_getnext_by_id(uint32_t node_id); -extern uint32_t dtm_node_add(DTM_NODE_DB *node, int i); -extern uint32_t dtm_node_delete(DTM_NODE_DB *nnode, int i); +extern uint32_t dtm_node_add(DTM_NODE_DB *node, KEY_TYPES type); +extern uint32_t dtm_node_delete(DTM_NODE_DB *nnode, KEY_TYPES type); extern DTM_NODE_DB *dtm_node_new(const DTM_NODE_DB *new_node); extern void dtm_print_config(DTM_INTERNODE_CB *config); extern int dtm_read_config(DTM_INTERNODE_CB *config, diff --git a/src/dtm/dtmnd/dtm_inter_trans.cc b/src/dtm/dtmnd/dtm_inter_trans.cc index 9d8335466..9b4194614 100644 --- a/src/dtm/dtmnd/dtm_inter_trans.cc +++ b/src/dtm/dtmnd/dtm_inter_trans.cc @@ -235,9 +235,10 @@ static uint32_t dtm_internode_snd_msg_common(DTM_NODE_DB *node, uint8_t *buffer, uint32_t dtm_internode_snd_msg_to_node(uint8_t *buffer, uint16_t len, NODE_ID node_id) { DTM_NODE_DB *node = nullptr; + uint8_t *key = reinterpret_cast<uint8_t *>(&node_id); TRACE_ENTER(); - node = dtm_node_get_by_id(node_id); + node = dtm_node_get(key, DTM_NODE_ID_KEY_TYPE); if (nullptr != node) { if (NCSCC_RC_SUCCESS != dtm_internode_snd_msg_common(node, buffer, len)) { diff --git a/src/dtm/dtmnd/dtm_node.cc b/src/dtm/dtmnd/dtm_node.cc index de2f94738..72506f262 100644 --- a/src/dtm/dtmnd/dtm_node.cc +++ b/src/dtm/dtmnd/dtm_node.cc @@ -125,7 +125,7 @@ uint32_t dtm_process_node_info(DTM_INTERNODE_CB *dtms_cb, DTM_NODE_DB *node, memcpy(node->node_name, data, nodename_len); node->node_name[nodename_len] = '\0'; node->comm_status = true; - if (dtm_node_add(node, 0) != NCSCC_RC_SUCCESS) { + if (dtm_node_add(node, DTM_NODE_ID_KEY_TYPE) != NCSCC_RC_SUCCESS) { LOG_ER( "DTM: A node already exists in the cluster with similar " "configuration (possible duplicate IP address and/or node id), please " diff --git a/src/dtm/dtmnd/dtm_node_db.cc b/src/dtm/dtmnd/dtm_node_db.cc index 1c9da4dac..1038f0918 100644 --- a/src/dtm/dtmnd/dtm_node_db.cc +++ b/src/dtm/dtmnd/dtm_node_db.cc @@ -123,24 +123,49 @@ uint32_t dtm_cb_init(DTM_INTERNODE_CB *dtms_cb) { } /** - * Retrieve node from node db by nodeid + * Retrieve node from node db * - * @param nodeid + * @param key + * @param i * - * @return NCSCC_RC_SUCCESS - * @return NCSCC_RC_FAILURE + * @return node * */ -DTM_NODE_DB *dtm_node_get_by_id(uint32_t nodeid) { +DTM_NODE_DB *dtm_node_get(uint8_t *key, KEY_TYPES type) { TRACE_ENTER(); DTM_INTERNODE_CB *dtms_cb = dtms_gl_cb; + DTM_NODE_DB *node = nullptr; - DTM_NODE_DB *node = reinterpret_cast<DTM_NODE_DB *>(ncs_patricia_tree_get( - &dtms_cb->nodeid_tree, reinterpret_cast<uint8_t *>(&nodeid))); - if (node != nullptr) { - /* Adjust the pointer */ - node = reinterpret_cast<DTM_NODE_DB *>(reinterpret_cast<char *>(node) - - offsetof(DTM_NODE_DB, pat_nodeid)); + osafassert(key != nullptr); + + switch (type) { + case DTM_NODE_ID_KEY_TYPE: + TRACE("DTM: Getting node from the database by node_id : %u as key", + *reinterpret_cast<NODE_ID *>(key)); + node = reinterpret_cast<DTM_NODE_DB *>(ncs_patricia_tree_get( + &dtms_cb->nodeid_tree, key)); + if (node != nullptr) { + // Adjust the pointer + node = reinterpret_cast<DTM_NODE_DB *>(reinterpret_cast<char *>(node) - + offsetof(DTM_NODE_DB, pat_nodeid)); + } + break; + + case DTM_NODE_IP_KEY_TYPE: + TRACE("DTM: Getting node from the database by node_ip : %s as key", + reinterpret_cast<char *>(key)); + node = reinterpret_cast<DTM_NODE_DB *>(ncs_patricia_tree_get( + &dtms_cb->ip_addr_tree, key)); + if (node != nullptr) { + // Adjust the pointer + node = reinterpret_cast<DTM_NODE_DB *>(reinterpret_cast<char *>(node) - + offsetof(DTM_NODE_DB, pat_ip_address)); + } + break; + + default: + osafassert(false); + break; } TRACE_LEAVE(); @@ -189,16 +214,16 @@ DTM_NODE_DB *dtm_node_getnext_by_id(uint32_t node_id) { * @return NCSCC_RC_FAILURE * */ -uint32_t dtm_node_add(DTM_NODE_DB *node, int i) { +uint32_t dtm_node_add(DTM_NODE_DB *node, KEY_TYPES type) { uint32_t rc = NCSCC_RC_SUCCESS; DTM_INTERNODE_CB *dtms_cb = dtms_gl_cb; TRACE_ENTER(); - TRACE("DTM:value of i %d", i); + TRACE("DTM:value of i %d", type); osafassert(node != nullptr); - switch (i) { - case 0: + switch (type) { + case DTM_NODE_ID_KEY_TYPE: TRACE("DTM:Adding node_id to the database with node_id :%u as key", node->node_id); node->pat_nodeid.key_info = reinterpret_cast<uint8_t *>(&(node->node_id)); @@ -210,11 +235,8 @@ uint32_t dtm_node_add(DTM_NODE_DB *node, int i) { goto done; } break; - case 1: - osafassert(false); - break; - case 2: + case DTM_NODE_IP_KEY_TYPE: TRACE("DTM:Adding node_ip to the database with node_ip :%s as key", node->node_ip); node->pat_ip_address.key_info = @@ -230,8 +252,8 @@ uint32_t dtm_node_add(DTM_NODE_DB *node, int i) { default: TRACE("DTM:Invalid Patricia add"); - rc = NCSCC_RC_FAILURE; - goto done; + osafassert(false); + break; } done: @@ -248,15 +270,15 @@ done: * @return NCSCC_RC_FAILURE * */ -uint32_t dtm_node_delete(DTM_NODE_DB *node, int i) { +uint32_t dtm_node_delete(DTM_NODE_DB *node, KEY_TYPES type) { uint32_t rc = NCSCC_RC_SUCCESS; DTM_INTERNODE_CB *dtms_cb = dtms_gl_cb; - TRACE_ENTER2("DTM:value of i %d", i); + TRACE_ENTER2("DTM:value of i %d", type); osafassert(node != nullptr); - switch (i) { - case 0: + switch (type) { + case DTM_NODE_ID_KEY_TYPE: if (node->node_id != 0 && node->pat_nodeid.key_info) { TRACE("DTM:Deleting node_id from the database with node_id :%u as key", node->node_id); @@ -269,11 +291,8 @@ uint32_t dtm_node_delete(DTM_NODE_DB *node, int i) { } } break; - case 1: - osafassert(false); - break; - case 2: + case DTM_NODE_IP_KEY_TYPE: if (node->node_ip != nullptr && node->pat_ip_address.key_info) { TRACE("DTM:Deleting node_ip from the database with node_ip :%s as key", node->node_ip); @@ -290,7 +309,7 @@ uint32_t dtm_node_delete(DTM_NODE_DB *node, int i) { default: TRACE( "DTM:Deleting node from the database with unknown option :%d as key", - i); + type); osafassert(0); } diff --git a/src/dtm/dtmnd/dtm_node_sockets.cc b/src/dtm/dtmnd/dtm_node_sockets.cc index 0ddfc6f58..a0b835c29 100644 --- a/src/dtm/dtmnd/dtm_node_sockets.cc +++ b/src/dtm/dtmnd/dtm_node_sockets.cc @@ -171,11 +171,11 @@ void dtm_comm_socket_close(DTM_NODE_DB *node) { } } - if (dtm_node_delete(node, 0) != NCSCC_RC_SUCCESS) { + if (dtm_node_delete(node, DTM_NODE_ID_KEY_TYPE) != NCSCC_RC_SUCCESS) { LOG_ER("DTM :dtm_node_delete failed "); } - if (dtm_node_delete(node, 2) != NCSCC_RC_SUCCESS) { + if (dtm_node_delete(node, DTM_NODE_IP_KEY_TYPE) != NCSCC_RC_SUCCESS) { LOG_ER("DTM :dtm_node_delete failed "); } @@ -566,7 +566,11 @@ DTM_NODE_DB *dtm_process_connect(DTM_INTERNODE_CB *dtms_cb, uint8_t *data, } } - new_node = dtm_node_get_by_id(node.node_id); + // Retrieve node from nodeip instead of nodeid to prevent the case dtm already + // accepted remote node connection and dtm already add the nodeip to database. + // If so, dtm should drop this message as discovery in progress. + uint8_t *nodeip = reinterpret_cast<uint8_t *>(node.node_ip); + new_node = dtm_node_get(nodeip, DTM_NODE_IP_KEY_TYPE); if (new_node != nullptr) { if ((new_node->node_id == 0) || (new_node->node_id == node.node_id) || (strncmp(node.node_ip, new_node->node_ip, INET6_ADDRSTRLEN) == 0)) { @@ -593,10 +597,10 @@ DTM_NODE_DB *dtm_process_connect(DTM_INTERNODE_CB *dtms_cb, uint8_t *data, 0))) { TRACE("DTM: deleting stale enty cluster_id: %d, node_id :%u, node_ip:%s", node.cluster_id, node.node_id, node.node_ip); - if (dtm_node_delete(new_node, 0) != NCSCC_RC_SUCCESS) { + if (dtm_node_delete(new_node, DTM_NODE_ID_KEY_TYPE) != NCSCC_RC_SUCCESS) { LOG_ER("DTM :dtm_node_delete failed (recv())"); } - if (dtm_node_delete(new_node, 2) != NCSCC_RC_SUCCESS) { + if (dtm_node_delete(new_node, DTM_NODE_IP_KEY_TYPE) != NCSCC_RC_SUCCESS) { LOG_ER("DTM :dtm_node_delete failed (recv())"); } free(new_node); @@ -621,7 +625,7 @@ DTM_NODE_DB *dtm_process_connect(DTM_INTERNODE_CB *dtms_cb, uint8_t *data, if (sock_desc != -1) { TRACE("DTM: dtm_node_add .node_ip: %s node_id: %u, comm_socket %d", new_node->node_ip, new_node->node_id, new_node->comm_socket); - if (dtm_node_add(new_node, 0) != NCSCC_RC_SUCCESS) { + if (dtm_node_add(new_node, DTM_NODE_ID_KEY_TYPE) != NCSCC_RC_SUCCESS) { LOG_ER("DTM: dtm_node_add failed .node_ip: %s, node_id: %u", new_node->node_ip, new_node->node_id); dtm_comm_socket_close(new_node); @@ -629,7 +633,7 @@ DTM_NODE_DB *dtm_process_connect(DTM_INTERNODE_CB *dtms_cb, uint8_t *data, goto node_fail; } - if (dtm_node_add(new_node, 2) != NCSCC_RC_SUCCESS) { + if (dtm_node_add(new_node, DTM_NODE_IP_KEY_TYPE) != NCSCC_RC_SUCCESS) { LOG_ER("DTM: dtm_node_add failed .node_ip: %s, node_id: %u", new_node->node_ip, new_node->node_id); dtm_comm_socket_close(new_node); @@ -748,7 +752,7 @@ DTM_NODE_DB *dtm_process_accept(DTM_INTERNODE_CB *dtms_cb, int stream_sock) { continue; } - if (dtm_node_add(new_node, 2) != NCSCC_RC_SUCCESS) { + if (dtm_node_add(new_node, DTM_NODE_IP_KEY_TYPE) != NCSCC_RC_SUCCESS) { LOG_ER("DTM: dtm_node_add failed .node_ip: %s, node_id: %u", new_node->node_ip, new_node->node_id); dtm_comm_socket_close(new_node); -- 2.15.1 _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel