GitHub user zcorrea commented on a diff in the pull request:
https://github.com/apache/trafodion/pull/1457#discussion_r171424566
--- Diff: core/sqf/monitor/linux/zclient.cxx ---
@@ -488,6 +488,103 @@ int CZClient::ZooExistRetry(zhandle_t *zh, const char *path, int watch, struct S
return rc;
}
+const char* CZClient::WaitForAndReturnMaster( bool doWait )
+{
+ const char method_name[] = "CZClient::WaitForAndReturnMaster";
+ TRACE_ENTRY;
+
+ bool found = false;
+ int rc = -1;
+ int retries = 0;
+ Stat stat;
+
+ struct String_vector nodes = {0, NULL};
+ stringstream ss;
+ ss.str( "" );
+ ss << zkRootNode_.c_str()
+ << zkRootNodeInstance_.c_str()
+ << ZCLIENT_MASTER_ZNODE;
+ string masterMonitor( ss.str( ) );
+
+ // wait for 3 minutes for giving up.
+ while ( (!found) && (retries < 180))
+ {
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d trafCluster=%s\n"
+ , method_name, __LINE__, masterMonitor.c_str() );
+ }
+ // Verify the existence of the parent ZCLIENT_MASTER_ZNODE
+ rc = ZooExistRetry( ZHandle, masterMonitor.c_str( ), 0, &stat );
+
+ if ( rc == ZNONODE )
+ {
+ if (doWait == false)
+ {
+ break;
+ }
+ continue;
+ }
+ else if ( rc == ZOK )
+ {
+ // Now get the list of available znodes in the cluster.
+ //
+ // This will return child znodes for each monitor process that
has
+ // registered, including this process.
+ rc = zoo_get_children( ZHandle, masterMonitor.c_str( ), 0,
&nodes );
+ if ( nodes.count > 0 )
+ {
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d nodes.count=%d\n"
+ , method_name, __LINE__
+ , nodes.count );
+ }
+ found = true;
+ }
+ else
+ {
+ if (doWait == false)
+ {
+ break;
+ }
+ usleep(1000000); // sleep for a second as to not overwhelm
the system
+ retries++;
+ continue;
+ }
+ }
+
+ else // error
+ {
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d Error (MasterMonitor)
WaitForAndReturnMaster returned rc (%d), retries %d\n"
+ , method_name, __LINE__, rc, retries );
+ }
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], ZooExistRetry() for %s failed with error %s\n"
+ , method_name, masterMonitor.c_str( ), zerror(rc));
+ mon_log_write(MON_ZCLIENT_WAITFORANDRETURNMASTER, SQ_LOG_ERR,
buf);
+ break;
+ }
+ }
+
+ //should we assert nodes.count == 1?
+ if (found)
+ {
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d (MasterMonitor) Master Monitor found
(%s)\n"
+ , method_name, __LINE__, masterMonitor.c_str() );
+ }
+ return nodes.data[0];
--- End diff --
yep
---