[
https://issues.apache.org/jira/browse/MESOS-2451?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14351632#comment-14351632
]
craig bordelon commented on MESOS-2451:
---------------------------------------
OK, I have a test program ready.
It could probably be a little simpler if I took out log.h and the logging code, but
anyway...
I compiled on Red Hat RHEL 6.5 with Developer Toolset 2.0 for the C++11 support.
--- gnu Makefile like this - fix your MESOS_HOME here
# GNU Makefile for the MESOS-2451 reproducer (builds ./bug from bug.cpp).
# Set MESOS_HOME to the root of your Mesos checkout; HOW_BUILD is the name of
# the build directory underneath it.
# NOTE: every recipe line below must start with a TAB character.
HOW_BUILD = cpp11build
MESOS_HOME = ??

# Make needs $(NAME) for multi-character variables: a bare $MESOS_HOME would be
# read as $(M) followed by the literal text "ESOS_HOME".
M = $(MESOS_HOME)
Z = $(M)/$(HOW_BUILD)/3rdparty/zookeeper-3.4.5/src/c
MA = $(M)/3rdparty/libprocess
MB = $(M)/$(HOW_BUILD)/3rdparty/libprocess

.PHONY: all
all: bug

# The source file is listed before the libraries so that linkers resolving
# symbols left-to-right still pick up libzookeeper_mt and libmesos.
%: %.cpp
	g++ -g -std=c++11 -DTHREADED \
	  -I$(Z)/include -I$(Z)/generated -I$(M)/src \
	  -I$(MA)/3rdparty/stout/include -I$(MA)/include -I$(MB)/include \
	  -I$(MB)/3rdparty/picojson-4f93734 -I$(MB)/3rdparty/boost-1.53.0 \
	  -I$(MB)/3rdparty/glog-0.3.3/src \
	  -L$(Z)/.libs -L$(M)/$(HOW_BUILD)/src/.libs \
	  $< -o $@ -lzookeeper_mt -lmesos
---- end of Makefile
--- source code of log.h (similar to the code example in the O'Reilly ZooKeeper
book and to the ZooKeeper C client code -- feel free to take it out)
/*
 * log.h - logging declarations used by bug.cpp.
 *
 * Declares the global log level, the log_message()/format_log_message()
 * functions and the LOG_ERROR/LOG_WARN/LOG_INFO/LOG_DEBUG convenience macros.
 * NOTE(review): the definitions are presumably supplied by the ZooKeeper C
 * client library that the program links against -- confirm.
 */
#ifndef ZK_LOG_H_
#define ZK_LOG_H_
#include <stdio.h> /* FILE, used by getLogStream() */
#include <zookeeper.h>
#ifdef __cplusplus
extern "C" {
#endif
/* Current verbosity threshold consulted by the LOG_* macros. */
extern ZOOAPI ZooLogLevel logLevel;
#define LOGSTREAM getLogStream()
/*
 * Each macro takes an already formatted message (typically the result of
 * format_log_message(...)) and forwards it to log_message() only when the
 * current level allows it, so the argument is not evaluated otherwise.
 * The do { } while (0) wrapper makes every macro a single statement, which
 * keeps `if (c) LOG_X(m); else ...` from binding the `else` to the macro's
 * internal `if`.
 */
#define LOG_ERROR(x) \
  do { \
    if (logLevel >= ZOO_LOG_LEVEL_ERROR) \
      log_message(ZOO_LOG_LEVEL_ERROR, __LINE__, __func__, x); \
  } while (0)
#define LOG_WARN(x) \
  do { \
    if (logLevel >= ZOO_LOG_LEVEL_WARN) \
      log_message(ZOO_LOG_LEVEL_WARN, __LINE__, __func__, x); \
  } while (0)
#define LOG_INFO(x) \
  do { \
    if (logLevel >= ZOO_LOG_LEVEL_INFO) \
      log_message(ZOO_LOG_LEVEL_INFO, __LINE__, __func__, x); \
  } while (0)
#define LOG_DEBUG(x) \
  do { \
    if (logLevel == ZOO_LOG_LEVEL_DEBUG) \
      log_message(ZOO_LOG_LEVEL_DEBUG, __LINE__, __func__, x); \
  } while (0)
/* Emits `message` at level `curLevel`, tagged with the given line and function name. */
ZOOAPI void log_message(ZooLogLevel curLevel, int line, const char* funcName,
                        const char* message);
/* printf-style formatter whose result is passed to the LOG_* macros. */
ZOOAPI const char* format_log_message(const char* format, ...);
/* Stream that LOGSTREAM expands to. */
FILE* getLogStream();
#ifdef __cplusplus
}
#endif
#endif /*ZK_LOG_H_*/
--- source code of bug.cpp ---
#include <assert.h>
#include <errno.h>
#include <string.h>
#include <stdlib.h>
#include <stdarg.h>
#include <unistd.h>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <string>
#include <sstream>
#include <vector>
#include <iostream>
#include <unordered_set>
#include "log.h"
using std::string;
using std::stringstream;
using std::vector;
using std::unordered_set;
#include <zookeeper/zookeeper.hpp>
// Connection handshake between the watcher callback and zkclient():
// zkclient() holds initMutex while it waits on initCondv for the watcher's
// `connected` flag, and MyWatcher::process() takes initMutex to call
// initCondv.notify_all() when a connected session event arrives.
std::mutex initMutex;
std::condition_variable initCondv;
// Returns a printable name for a ZooKeeper watcher event type.
// The value is compared against each ZOO_*_EVENT constant in turn; the first
// match wins, and "UNKNOWN_EVENT_TYPE" is returned when nothing matches.
static const char * type2string(int type){
  struct EventLabel {
    int code;
    const char* label;
  };
  // Same comparison order as a straight if-chain over the constants.
  const EventLabel known[] = {
    {ZOO_CREATED_EVENT, "CREATED_EVENT"},
    {ZOO_DELETED_EVENT, "DELETED_EVENT"},
    {ZOO_CHANGED_EVENT, "CHANGED_EVENT"},
    {ZOO_CHILD_EVENT, "CHILD_EVENT"},
    {ZOO_SESSION_EVENT, "SESSION_EVENT"},
    {ZOO_NOTWATCHING_EVENT, "NOTWATCHING_EVENT"},
  };
  for (const EventLabel& entry : known) {
    if (entry.code == type) {
      return entry.label;
    }
  }
  return "UNKNOWN_EVENT_TYPE";
}
static const char * rc2string(int rc){
if (rc == ZOK) {
return "OK";
}
if (rc == ZSYSTEMERROR) {
return "System error";
}
if (rc == ZRUNTIMEINCONSISTENCY) {
return "Runtime inconsistency";
}
if (rc == ZDATAINCONSISTENCY) {
return "Data inconsistency";
}
if (rc == ZCONNECTIONLOSS) {
return "Connection to the server has been lost";
}
if (rc == ZMARSHALLINGERROR) {
return "Error while marshalling or unmarshalling data ";
}
if (rc == ZUNIMPLEMENTED) {
return "Operation not implemented";
}
if (rc == ZOPERATIONTIMEOUT) {
return "Operation timeout";
}
if (rc == ZBADARGUMENTS) {
return "Invalid argument";
}
if (rc == ZINVALIDSTATE) {
return "Invalid zhandle state";
}
if (rc == ZAPIERROR) {
return "API error";
}
if (rc == ZNONODE) {
return "Znode does not exist";
}
if (rc == ZNOAUTH) {
return "Not authenticated";
}
if (rc == ZBADVERSION) {
return "Version conflict";
}
if (rc == ZNOCHILDRENFOREPHEMERALS) {
return "Ephemeral nodes may not have children";
}
if (rc == ZNODEEXISTS) {
return "Znode already exists";
}
if (rc == ZNOTEMPTY) {
return "The znode has children";
}
if (rc == ZSESSIONEXPIRED) {
return "The session has been expired by the server";
}
if (rc == ZINVALIDCALLBACK) {
return "Invalid callback specified";
}
if (rc == ZINVALIDACL) {
return "Invalid ACL specified";
}
if (rc == ZAUTHFAILED) {
return "Client authentication failed";
}
if (rc == ZCLOSING) {
return "ZooKeeper session is closing";
}
if (rc == ZNOTHING) {
return "No response from server";
}
if (rc == ZSESSIONMOVED) {
return "Session moved to a different server";
}
/*
* Return codes related to reconfiguration.
* Available only from version 3.5.0.
if (rc == ZNEWCONFIGNOQUORUM) {
return "Missing new configuration quorum";
}
if (rc == ZRECONFIGINPROGRESS) {
return "Reconfiguration in progress";
}
*/
return "UNKNOWN_EVENT_TYPE";
}
struct MyWatcher : public Watcher
{
bool connected;
ZooKeeper* zook;
MyWatcher(): connected(false), zook(nullptr) {
}
void process(
int type,
int state,
int64_t sessionId,
const std::string& path) {
LOG_DEBUG(/*LOGCALLBACK(zh),*/ format_log_message("Received event, type %d
%d %s", type, state, path.c_str()));
if( type == ZOO_CHILD_EVENT) {
if (path == "") ;
else if (path == "") ;
}
else if( type == ZOO_DELETED_EVENT) {
if (path == "") ;
}
else if( type == ZOO_CHANGED_EVENT) {
Stat stat;
string result;
int r = zook->get(path, true, &result, &stat);
std::cout << "get return " << rc2string(r) << " result " << result << "\n";
}
else if (type == ZOO_SESSION_EVENT) {
if (state == ZOO_CONNECTED_STATE) {
connected = true;
{
std::lock_guard<std::mutex> lk(initMutex);
initCondv.notify_all(); }
LOG_DEBUG(/*LOGCALLBACK(zh),*/ format_log_message("Received a
connected event."));
} else if (state == ZOO_EXPIRED_SESSION_STATE) {
//expired = 1;
}
LOG_DEBUG(/*LOGCALLBACK(zh),*/ format_log_message("Event: %s %d",
type2string(type), state));
}
else {
LOG_DEBUG(/*LOGCALLBACK(zh),*/ format_log_message("Event: %s %d",
type2string(type), state));
}
}
};
// Creates a ZooKeeper client and waits until its watcher reports a connected
// session.
//
// connect   - ZooKeeper connect string (host:port)
// zktimeout - milliseconds; used both as the session timeout handed to the
//             client and as the maximum time to wait for the connection
// Returns the connected client (the caller deletes it), or nullptr if no
// connected event arrived in time; in that case the client and its watcher
// are released here instead of being leaked.
ZooKeeper* zkclient(const string &connect, const int zktimeout) {
  MyWatcher* w = new MyWatcher();
  // Taken before the client exists so a connected event cannot be signalled
  // before this thread starts waiting.
  std::unique_lock<std::mutex> lk(initMutex);
  ZooKeeper* zook = new ZooKeeper(connect, Milliseconds(zktimeout), w);
  const auto until =
      std::chrono::steady_clock::now() + std::chrono::milliseconds(zktimeout);
  // The predicate overload copes with spurious wake-ups and re-checks the
  // flag one final time at the deadline, so a connection that completes just
  // as the wait expires is not reported as a timeout.
  const bool isConnected =
      initCondv.wait_until(lk, until, [w] { return w->connected; });
  if (!isConnected) {
    // Drop the lock first: the watcher callback locks initMutex, and tearing
    // the client down while holding it could block against that callback.
    lk.unlock();
    // NOTE(review): assumes the client issues no further watcher callbacks
    // once its destructor has returned -- confirm before reusing this pattern.
    delete zook;
    delete w;
    return nullptr;
  }
  w->zook = zook;
  return zook;
}
int main (int argc, char * argv[]) {
char* connect = argv[1];
int zktimeout = atoi(argv[2]);
ZooKeeper* zkc = zkclient(connect, zktimeout);
{
string* resPath = new string;
int r = zkc->create("/foo", "n", ZOO_OPEN_ACL_UNSAFE, 0, resPath);
std::cout << "create return " << rc2string(r) << "\n";
}
{
Stat dummyStat;
int r = zkc->exists("/foo", true, &dummyStat);
std::cout << "exists return " << rc2string(r) << "\n";
}
{
int r = zkc->set("/foo", "bar", -1);
std::cout << "set return " << rc2string(r) << "\n";
}
std::this_thread::sleep_for (std::chrono::seconds(4));
delete zkc;
}
--- bash script that runs it
# Runs the ./bug reproducer against a standalone ZooKeeper server.
# Usage: <this-script> <ip-address-of-standalone-running-zk> [timeout-ms]
# Set MESOS_HOME below to the root of your Mesos checkout first.
HOW_BUILD=cpp11build
MESOS_HOME=??
if [ $# -lt 1 ]; then
  echo "usage: $0 <ip-address-of-standalone-running-zk> [timeout-ms]" >&2
  exit 1
fi
ZK_HOST=$1
ZK_TIMEOUT_MS=${2:-2000}
M=$MESOS_HOME
Z=$M/$HOW_BUILD/3rdparty/zookeeper-3.4.5/src/c
# Prepend the freshly built libraries, keeping whatever search path was already set.
export LD_LIBRARY_PATH="$Z/.libs:$M/$HOW_BUILD/src/.libs${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
./bug "$ZK_HOST:2181" "$ZK_TIMEOUT_MS"
> mesos c++ zookeeper code hangs from api operation from within watcher of
> CHANGE event
> -------------------------------------------------------------------------------------
>
> Key: MESOS-2451
> URL: https://issues.apache.org/jira/browse/MESOS-2451
> Project: Mesos
> Issue Type: Bug
> Components: c++ api
> Affects Versions: 0.22.0
> Environment: red hat linux 6.5
> Reporter: craig bordelon
> Assignee: Benjamin Hindman
>
> We've observed that that the mesos 0.22.0-rc1 c++ zookeeper code appears to
> hang (two threads stuck in indefinite pthread condition waits) on a test case
> that as best we can tell is mesos issue and not issue with underlying apache
> zookeeper C binding.
> (that is we tried same type case using apache zookeeper C binding directly
> and saw no issues.)
> This happens with a properly running zookeeper (standalone is sufficient).
> Heres how we hung it:
> We issue a mesos zk set via
> int ZooKeeper::set ( const std::string & path,
> const std::string & data,
> int version
> )
> then inside a Watcher we process on CHANGED event to issue a mesos zk get on
> the same path via
> int ZooKeeper::get ( const std::string & path,
> bool watch,
> std::string * result,
> Stat * stat
> )
> we end up with two threads in the process both in pthread_cond_waits
> #0 0x000000334e20b43c in pthread_cond_wait@@GLIBC_2.3.2 () from
> /lib64/libpthread.so.0
> #1 0x00007f6664ee1cf5 in Gate::arrive (this=0x7f6140, old=0)
> at ../../../3rdparty/libprocess/src/gate.hpp:82
> #2 0x00007f6664ecef6e in process::ProcessManager::wait (this=0x7f02e0,
> pid=...)
> at ../../../3rdparty/libprocess/src/process.cpp:2476
> #3 0x00007f6664ed2ce9 in process::wait (pid=..., duration=...)
> at ../../../3rdparty/libprocess/src/process.cpp:2958
> #4 0x00007f6664e90558 in process::Latch::await (this=0x7f6ba0, duration=...)
> at ../../../3rdparty/libprocess/src/latch.cpp:49
> #5 0x00007f66649452cc in process::Future<int>::await (this=0x7fffa0fd9040,
> duration=...)
> at ../../3rdparty/libprocess/include/process/future.hpp:1156
> #6 0x00007f666493a04d in process::Future<int>::get (this=0x7fffa0fd9040)
> at ../../3rdparty/libprocess/include/process/future.hpp:1167
> #7 0x00007f6664ab1aac in ZooKeeper::set (this=0x803ce0, path="/craig/mo",
> data=
> ...
> and
> #0 0x000000334e20b43c in pthread_cond_wait@@GLIBC_2.3.2 () from
> /lib64/libpthread.so.0
> #1 0x00007f6664ee1cf5 in Gate::arrive (this=0x7f66380013f0, old=0)
> at ../../../3rdparty/libprocess/src/gate.hpp:82
> #2 0x00007f6664ecef6e in process::ProcessManager::wait (this=0x7f02e0,
> pid=...)
> at ../../../3rdparty/libprocess/src/process.cpp:2476
> #3 0x00007f6664ed2ce9 in process::wait (pid=..., duration=...)
> at ../../../3rdparty/libprocess/src/process.cpp:2958
> #4 0x00007f6664e90558 in process::Latch::await (this=0x7f6638000d00,
> duration=...)
> at ../../../3rdparty/libprocess/src/latch.cpp:49
> #5 0x00007f66649452cc in process::Future<int>::await (this=0x7f66595fb6f0,
> duration=...)
> at ../../3rdparty/libprocess/include/process/future.hpp:1156
> #6 0x00007f666493a04d in process::Future<int>::get (this=0x7f66595fb6f0)
> at ../../3rdparty/libprocess/include/process/future.hpp:1167
> #7 0x00007f6664ab18d3 in ZooKeeper::get (this=0x803ce0, path="/craig/mo",
> watch=false,
> ....
> We of course have a separate "enhancement" suggestion that the mesos C++
> zookeeper api use timed waits and not block indefinitely for responses.
> But this case we think the mesos code itself is blocking on itself and not
> handling the responses.
> craig
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)