[ 
https://issues.apache.org/jira/browse/MESOS-2451?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14351632#comment-14351632
 ] 

craig bordelon commented on MESOS-2451:
---------------------------------------

OK, I have a test program ready.
It could probably be a little simpler if I took out log.h and the logging 
calls, but anyway...

I compiled on Red Hat Enterprise Linux 6.5 with Developer Toolset 2.0 for C++11 support.

--- gnu Makefile like this - fix your MESOS_HOME here

# Builds the MESOS-2451 reproduction program (bug.cpp -> bug) against a
# Mesos build tree and the ZooKeeper C client bundled with it.
#
# Set MESOS_HOME before use; HOW_BUILD names the build sub-directory
# under MESOS_HOME.
HOW_BUILD=cpp11build
MESOS_HOME=??
# Must be written $(MESOS_HOME): make reads a bare $MESOS_HOME as
# $(M) followed by the text "ESOS_HOME", which makes M refer to itself.
M=$(MESOS_HOME)
Z=${M}/${HOW_BUILD}/3rdparty/zookeeper-3.4.5/src/c
MA=${M}/3rdparty/libprocess
MB=${M}/${HOW_BUILD}/3rdparty/libprocess

.PHONY: all
all: bug


# Recipe lines start with a TAB and are joined with backslashes.
# The -L/-l options come after the source file so that linkers which
# resolve symbols strictly left to right still find the libraries.
%: %.cpp
	g++ -g -std=c++11 -DTHREADED -I$(Z)/include -I$(Z)/generated -I$(M)/src \
	-I$(MA)/3rdparty/stout/include -I$(MA)/include -I$(MB)/include \
	-I$(MB)/3rdparty/picojson-4f93734 -I$(MB)/3rdparty/boost-1.53.0 \
	-I$(MB)/3rdparty/glog-0.3.3/src \
	$< -o $@ \
	-L$(Z)/.libs -L$(M)/${HOW_BUILD}/src/.libs \
	-lzookeeper_mt -lmesos

---- end of Makefile

-- source code of log.h (similar to the code example in the O'Reilly ZooKeeper 
book and to the ZooKeeper C client code -- feel free to take it out)

/*
 * log.h - thin logging front end used by the reproduction program.
 *
 * Declares (but does not define) logLevel, log_message(),
 * format_log_message() and getLogStream(); the definitions are expected
 * to come from the linked ZooKeeper C client library (assumption - the
 * program links with -lzookeeper_mt).
 */
#ifndef ZK_LOG_H_
#define ZK_LOG_H_

#include <zookeeper.h>

#ifdef __cplusplus
extern "C" {
#endif

/* Current verbosity threshold consulted by the LOG_* macros. */
extern ZOOAPI ZooLogLevel logLevel;
#define LOGSTREAM getLogStream()

/*
 * LOG_<LEVEL>(x): emit message x when logLevel permits it.
 *
 * x must evaluate to a const char* (typically the result of
 * format_log_message(...)); it is only evaluated when the level check
 * passes.
 *
 * Each macro is wrapped in do { ... } while (0) so that it behaves as a
 * single statement: a bare `if` expansion would silently capture a
 * following `else` when used as the body of an unbraced if/else.
 */
#define LOG_ERROR(x) do { if (logLevel >= ZOO_LOG_LEVEL_ERROR) \
    log_message(ZOO_LOG_LEVEL_ERROR,__LINE__,__func__,/*format_log_message*/ x); \
    } while (0)
#define LOG_WARN(x) do { if (logLevel >= ZOO_LOG_LEVEL_WARN) \
    log_message(ZOO_LOG_LEVEL_WARN,__LINE__,__func__,/*format_log_message*/ x); \
    } while (0)
#define LOG_INFO(x) do { if (logLevel >= ZOO_LOG_LEVEL_INFO) \
    log_message(ZOO_LOG_LEVEL_INFO,__LINE__,__func__,/*format_log_message*/ x); \
    } while (0)
/* Note: DEBUG uses an equality test, unlike the >= test of the others. */
#define LOG_DEBUG(x) do { if (logLevel == ZOO_LOG_LEVEL_DEBUG) \
    log_message(ZOO_LOG_LEVEL_DEBUG,__LINE__,__func__,/*format_log_message*/ x); \
    } while (0)

/* Write one log record tagged with level, source line and function name. */
ZOOAPI void log_message(ZooLogLevel curLevel, int line,const char* funcName,
    const char* message);

/* printf-style formatter; returns the formatted message text. */
ZOOAPI const char* format_log_message(const char* format,...);

/* Stream that log output is written to. */
FILE* getLogStream();

#ifdef __cplusplus
}
#endif

#endif /*ZK_LOG_H_*/


--- source code of bug.cpp --- 

#include <assert.h>
#include <errno.h>
#include <string.h>
#include <stdlib.h>
#include <stdarg.h>
#include <unistd.h>

#include <thread>
#include <mutex>
#include <condition_variable>
#include <string>
#include <sstream>
#include <vector>
#include <iostream>
#include <unordered_set>

#include "log.h"

using std::string;
using std::stringstream;
using std::vector;
using std::unordered_set;

#include <zookeeper/zookeeper.hpp>

    // Shared between zkclient() and MyWatcher::process(): zkclient() waits
    // on initCondv (holding initMutex) until the watcher reports that the
    // session is connected, and the watcher notifies it under initMutex.
    std::mutex initMutex;
    std::condition_variable initCondv;

/**
 * Translate a ZooKeeper watcher event type into a printable name.
 *
 * @param type  event type value as delivered to the watcher.
 * @return a static string naming the event, or "UNKNOWN_EVENT_TYPE"
 *         when the value matches none of the known event constants.
 */
static const char * type2string(int type){
    // Lookup table, scanned in order so the first matching entry wins.
    const struct {
        int code;
        const char *label;
    } known[] = {
        { ZOO_CREATED_EVENT,     "CREATED_EVENT" },
        { ZOO_DELETED_EVENT,     "DELETED_EVENT" },
        { ZOO_CHANGED_EVENT,     "CHANGED_EVENT" },
        { ZOO_CHILD_EVENT,       "CHILD_EVENT" },
        { ZOO_SESSION_EVENT,     "SESSION_EVENT" },
        { ZOO_NOTWATCHING_EVENT, "NOTWATCHING_EVENT" },
    };

    for (const auto &entry : known) {
        if (entry.code == type) {
            return entry.label;
        }
    }

    return "UNKNOWN_EVENT_TYPE";
}

/**
 * Translate a ZooKeeper API return code into a human-readable message.
 *
 * @param rc  return code from a ZooKeeper call (ZOK or one of the Z* errors).
 * @return a static string describing the code, or "UNKNOWN_RETURN_CODE"
 *         when the value is not one of the codes listed here.
 */
static const char * rc2string(int rc){
    switch (rc) {
    case ZOK:
        return "OK";
    case ZSYSTEMERROR:
        return "System error";
    case ZRUNTIMEINCONSISTENCY:
        return "Runtime inconsistency";
    case ZDATAINCONSISTENCY:
        return "Data inconsistency";
    case ZCONNECTIONLOSS:
        return "Connection to the server has been lost";
    case ZMARSHALLINGERROR:
        return "Error while marshalling or unmarshalling data ";
    case ZUNIMPLEMENTED:
        return "Operation not implemented";
    case ZOPERATIONTIMEOUT:
        return "Operation timeout";
    case ZBADARGUMENTS:
        return "Invalid argument";
    case ZINVALIDSTATE:
        return "Invalid zhandle state";
    case ZAPIERROR:
        return "API error";
    case ZNONODE:
        return "Znode does not exist";
    case ZNOAUTH:
        return "Not authenticated";
    case ZBADVERSION:
        return "Version conflict";
    case ZNOCHILDRENFOREPHEMERALS:
        return "Ephemeral nodes may not have children";
    case ZNODEEXISTS:
        return "Znode already exists";
    case ZNOTEMPTY:
        return "The znode has children";
    case ZSESSIONEXPIRED:
        return "The session has been expired by the server";
    case ZINVALIDCALLBACK:
        return "Invalid callback specified";
    case ZINVALIDACL:
        return "Invalid ACL specified";
    case ZAUTHFAILED:
        return "Client authentication failed";
    case ZCLOSING:
        return "ZooKeeper session is closing";
    case ZNOTHING:
        return "No response from server";
    case ZSESSIONMOVED:
        return "Session moved to a different server";

    /*
     * Return codes related to reconfiguration.
     * Available only from version 3.5.0, so left disabled here.
    case ZNEWCONFIGNOQUORUM:
        return "Missing new configuration quorum";
    case ZRECONFIGINPROGRESS:
        return "Reconfiguration in progress";
     */

    default:
        // Previously reported "UNKNOWN_EVENT_TYPE" (copied from
        // type2string), which mislabelled an unknown return code as an event.
        return "UNKNOWN_RETURN_CODE";
    }
}

struct MyWatcher : public Watcher
{
  bool connected;
  ZooKeeper* zook;
  MyWatcher(): connected(false), zook(nullptr) {
  }
  void process(
      int type,
      int state,
      int64_t sessionId,
      const std::string& path) {

    LOG_DEBUG(/*LOGCALLBACK(zh),*/ format_log_message("Received event, type %d 
%d %s", type, state, path.c_str()));
    if( type == ZOO_CHILD_EVENT) {
        if (path == "") ;
        else if (path == "") ;
    }
    else if( type == ZOO_DELETED_EVENT) {
        if (path  == "") ;
    }
    else if( type == ZOO_CHANGED_EVENT) {
        Stat stat;
        string result;
        int r = zook->get(path, true, &result, &stat);
std::cout << "get return " << rc2string(r) << " result " << result << "\n";
    }
    else if (type == ZOO_SESSION_EVENT) {
        if (state == ZOO_CONNECTED_STATE) {
            connected = true;
            {
            std::lock_guard<std::mutex> lk(initMutex);
            initCondv.notify_all(); }

            LOG_DEBUG(/*LOGCALLBACK(zh),*/ format_log_message("Received a 
connected event."));
        } else if (state == ZOO_EXPIRED_SESSION_STATE) {
            //expired = 1;
        }
        LOG_DEBUG(/*LOGCALLBACK(zh),*/ format_log_message("Event: %s %d", 
type2string(type), state));
    }
    else {
        LOG_DEBUG(/*LOGCALLBACK(zh),*/ format_log_message("Event: %s %d", 
type2string(type), state));
    }
  }
};

/**
 * Create a ZooKeeper client and wait until its session is connected.
 *
 * @param connect    ZooKeeper connect string (host:port).
 * @param zktimeout  timeout in milliseconds; used both as the session
 *                   timeout passed to ZooKeeper and as the maximum time
 *                   to wait here for the connected event.
 * @return the connected client (caller deletes it), or nullptr if the
 *         session did not connect in time. On failure the client and its
 *         watcher are destroyed before returning.
 */
ZooKeeper* zkclient(const string &connect, const int zktimeout) {
    auto w = new MyWatcher();
    std::unique_lock<std::mutex> lk(initMutex);
    auto zook = new ZooKeeper(connect, Milliseconds(zktimeout), w);

    // Monotonic deadline: unaffected by wall-clock adjustments.
    const auto until =
        std::chrono::steady_clock::now() + std::chrono::milliseconds(zktimeout);

    // The predicate form re-checks the flag after every wake-up and
    // returns its final value, i.e. false only on a real timeout.
    const bool isConnected =
        initCondv.wait_until(lk, until, [w] { return w->connected; });

    // Release the mutex before any teardown, as a precaution so that a
    // watcher callback waiting on initMutex cannot stall client shutdown.
    lk.unlock();

    if (!isConnected) {
        delete zook;  // client first: it holds a pointer to the watcher
        delete w;
        return nullptr;
    }

    w->zook = zook;
    return zook;
}


/**
 * Reproduction driver for MESOS-2451.
 *
 * Usage: bug <zk-host:port> <timeout-ms>
 *
 * Connects, creates /foo, sets a watch on it via exists(), then changes
 * its data so that the watcher receives a CHANGED event and issues a
 * get() from inside the callback.
 *
 * @return EXIT_SUCCESS after the sequence ran, EXIT_FAILURE on bad
 *         arguments or when no connection could be established.
 */
int main (int argc, char * argv[]) {

    if (argc < 3) {
        std::cerr << "usage: " << (argc > 0 ? argv[0] : "bug")
                  << " <zk-host:port> <timeout-ms>\n";
        return EXIT_FAILURE;
    }

    const char* connect = argv[1];

    // strtol instead of atoi so malformed or out-of-range input is detected.
    errno = 0;
    char* end = nullptr;
    const long parsed = strtol(argv[2], &end, 10);
    const int zktimeout = static_cast<int>(parsed);
    if (end == argv[2] || *end != '\0' || errno == ERANGE ||
        parsed <= 0 || zktimeout != parsed) {
        std::cerr << "invalid timeout (milliseconds): " << argv[2] << "\n";
        return EXIT_FAILURE;
    }

    ZooKeeper* zkc = zkclient(connect, zktimeout);
    if (zkc == nullptr) {
        // zkclient() returns nullptr when the session did not connect in time.
        std::cerr << "could not connect to " << connect
                  << " within " << zktimeout << " ms\n";
        return EXIT_FAILURE;
    }

    {
    string resPath;  // receives the created path; no heap allocation needed
    int r = zkc->create("/foo", "n", ZOO_OPEN_ACL_UNSAFE, 0, &resPath);
    std::cout << "create return " << rc2string(r) << "\n";
    }

    {
    Stat dummyStat;
    // watch=true registers the watcher for the upcoming change.
    int r = zkc->exists("/foo", true, &dummyStat);
    std::cout << "exists return " << rc2string(r) << "\n";
    }

    {
    // Triggers the CHANGED event; version -1 is passed as in the original.
    int r = zkc->set("/foo", "bar", -1);
    std::cout << "set return " << rc2string(r) << "\n";
    }

    // Give the watcher time to run before tearing the client down.
    std::this_thread::sleep_for (std::chrono::seconds(4));
    delete zkc;
    return EXIT_SUCCESS;
}

--- bash script that runs it

# Launcher for the ./bug reproduction binary: points the dynamic loader at
# the ZooKeeper and Mesos shared libraries of the chosen build tree.
# HOW_BUILD names the build sub-directory under MESOS_HOME.
HOW_BUILD=cpp11build

# Placeholder - set MESOS_HOME to a real path before running.
MESOS_HOME=??
M=$MESOS_HOME
Z=$M/$HOW_BUILD/3rdparty/zookeeper-3.4.5/src/c
# MA and MB are defined but not used below.
MA=$M/3rdparty/libprocess
MB=$M/$HOW_BUILD/3rdparty/libprocess

export LD_LIBRARY_PATH=$Z/.libs:$M/$HOW_BUILD/src/.libs

# Arguments: ZooKeeper connect string, then the timeout in milliseconds.
# Replace the <...> placeholder with the server address first; left as is,
# the shell would treat the angle brackets as redirections.
./bug <ip-address-of-standalone-running-zk>:2181 2000




> mesos c++ zookeeper code hangs from api operation from within watcher of 
> CHANGE event
> -------------------------------------------------------------------------------------
>
>                 Key: MESOS-2451
>                 URL: https://issues.apache.org/jira/browse/MESOS-2451
>             Project: Mesos
>          Issue Type: Bug
>          Components: c++ api
>    Affects Versions: 0.22.0
>         Environment: red hat linux 6.5
>            Reporter: craig bordelon
>            Assignee: Benjamin Hindman
>
> We've observed that the Mesos 0.22.0-rc1 C++ ZooKeeper code appears to 
> hang (two threads stuck in indefinite pthread condition waits) on a test case 
> that, as best we can tell, is a Mesos issue and not an issue with the underlying 
> Apache ZooKeeper C binding.
> (That is, we tried the same type of case using the Apache ZooKeeper C binding 
> directly and saw no issues.)
> This happens with a properly running ZooKeeper (standalone is sufficient).
> Here's how we hung it:
> We issue a mesos zk set via
> int ZooKeeper::set      (       const std::string &     path,
> const std::string &     data,
> int     version 
> )       
> then inside a Watcher we process on CHANGED event to issue a mesos zk get on 
> the same path via
> int ZooKeeper::get      (       const std::string &     path,
> bool    watch,
> std::string *   result,
> Stat *  stat 
> )       
> we end up with two threads in the process both in pthread_cond_waits
> #0  0x000000334e20b43c in pthread_cond_wait@@GLIBC_2.3.2 () from 
> /lib64/libpthread.so.0
> #1  0x00007f6664ee1cf5 in Gate::arrive (this=0x7f6140, old=0)
>     at ../../../3rdparty/libprocess/src/gate.hpp:82
> #2  0x00007f6664ecef6e in process::ProcessManager::wait (this=0x7f02e0, 
> pid=...)
>     at ../../../3rdparty/libprocess/src/process.cpp:2476
> #3  0x00007f6664ed2ce9 in process::wait (pid=..., duration=...)
>     at ../../../3rdparty/libprocess/src/process.cpp:2958
> #4  0x00007f6664e90558 in process::Latch::await (this=0x7f6ba0, duration=...)
>     at ../../../3rdparty/libprocess/src/latch.cpp:49
> #5  0x00007f66649452cc in process::Future<int>::await (this=0x7fffa0fd9040, 
> duration=...)
>     at ../../3rdparty/libprocess/include/process/future.hpp:1156
> #6  0x00007f666493a04d in process::Future<int>::get (this=0x7fffa0fd9040)
>     at ../../3rdparty/libprocess/include/process/future.hpp:1167
> #7  0x00007f6664ab1aac in ZooKeeper::set (this=0x803ce0, path="/craig/mo", 
> data=
> ...
> and
> #0  0x000000334e20b43c in pthread_cond_wait@@GLIBC_2.3.2 () from 
> /lib64/libpthread.so.0
> #1  0x00007f6664ee1cf5 in Gate::arrive (this=0x7f66380013f0, old=0)
>     at ../../../3rdparty/libprocess/src/gate.hpp:82
> #2  0x00007f6664ecef6e in process::ProcessManager::wait (this=0x7f02e0, 
> pid=...)
>     at ../../../3rdparty/libprocess/src/process.cpp:2476
> #3  0x00007f6664ed2ce9 in process::wait (pid=..., duration=...)
>     at ../../../3rdparty/libprocess/src/process.cpp:2958
> #4  0x00007f6664e90558 in process::Latch::await (this=0x7f6638000d00, 
> duration=...)
>     at ../../../3rdparty/libprocess/src/latch.cpp:49
> #5  0x00007f66649452cc in process::Future<int>::await (this=0x7f66595fb6f0, 
> duration=...)
>     at ../../3rdparty/libprocess/include/process/future.hpp:1156
> #6  0x00007f666493a04d in process::Future<int>::get (this=0x7f66595fb6f0)
>     at ../../3rdparty/libprocess/include/process/future.hpp:1167
> #7  0x00007f6664ab18d3 in ZooKeeper::get (this=0x803ce0, path="/craig/mo", 
> watch=false,
> ....
> We of course have a separate "enhancement" suggestion that the Mesos C++ 
> ZooKeeper API use timed waits and not block indefinitely for responses.
> But in this case we think the Mesos code itself is blocking on itself and not 
> handling the responses.
> craig



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to