costin 02/05/03 15:12:18 Modified: jk/native2/common jk_channel_apr_socket.c jk_endpoint.c jk_msg_ajp.c jk_requtil.c jk_workerEnv.c jk_worker_ajp13.c jk_worker_lb.c Log: Few more lb changes. This is getting a bit more aggressive in trying to recover failed workers and simplifies the logic that is used. I tested it and it seems to work very well with 'lbfactor=0' - if the default tomcat goes down, the request goes to the backup, when the default tomcat goes up ( after the timeout ) it'll be tried again. As soon as shm is finished, the default will go up when it re-registers itself in the shm. Revision Changes Path 1.14 +4 -5 jakarta-tomcat-connectors/jk/native2/common/jk_channel_apr_socket.c Index: jk_channel_apr_socket.c =================================================================== RCS file: /home/cvs/jakarta-tomcat-connectors/jk/native2/common/jk_channel_apr_socket.c,v retrieving revision 1.13 retrieving revision 1.14 diff -u -r1.13 -r1.14 --- jk_channel_apr_socket.c 3 May 2002 18:44:03 -0000 1.13 +++ jk_channel_apr_socket.c 3 May 2002 22:12:17 -0000 1.14 @@ -426,8 +426,6 @@ #ifdef HAVE_UNIXSOCKETS unixsock=chD->unixsock; #endif - env->l->jkLog(env, env->l, JK_LOG_ERROR, - "jk2_channel_apr_send %d\n",chD->type); if (chD->type==TYPE_NET) { length = (apr_size_t) len; @@ -443,10 +441,11 @@ #ifdef HAVE_UNIXSOCKETS while(sent < len) { /* this_time = send(unixsock, (char *)b + sent , len - sent, 0); */ + errno=0; this_time = write(unixsock, (char *)b + sent , len - sent); env->l->jkLog(env, env->l, JK_LOG_INFO, - "channel.apr:send() send() %d %d %s\n", this_time, errno, + "channel.apr:send() write() %d %d %s\n", this_time, errno, strerror( errno)); /* if( errno != 0 ) { */ /* env->l->jkLog(env, env->l, JK_LOG_ERROR, */ @@ -551,7 +550,7 @@ blen=msg->checkHeader( env, msg, endpoint ); if( blen < 0 ) { env->l->jkLog(env, env->l, JK_LOG_ERROR, - "channelAprArp.receive(): Bad header\n" ); + "channelApr.receive(): Bad header\n" ); return JK_ERR; } @@ -559,7 +558,7 @@ if(rc < 
0) { env->l->jkLog(env, env->l, JK_LOG_ERROR, - "channelAprApr.receive(): Error receiving message body %d %d\n", + "channelApr.receive(): Error receiving message body %d %d\n", rc, errno); return JK_ERR; } 1.9 +14 -1 jakarta-tomcat-connectors/jk/native2/common/jk_endpoint.c Index: jk_endpoint.c =================================================================== RCS file: /home/cvs/jakarta-tomcat-connectors/jk/native2/common/jk_endpoint.c,v retrieving revision 1.8 retrieving revision 1.9 diff -u -r1.8 -r1.9 --- jk_endpoint.c 25 Apr 2002 18:50:22 -0000 1.8 +++ jk_endpoint.c 3 May 2002 22:12:17 -0000 1.9 @@ -73,6 +73,8 @@ #include "jk_objCache.h" #include "jk_registry.h" +static char *myAttInfo[]={ "channel", "active", NULL }; + /** Will return endpoint specific runtime properties * * uri The uri that is beeing processed, NULL if the endpoing is inactive @@ -83,6 +85,16 @@ * */ static void * JK_METHOD jk2_endpoint_getAttribute(jk_env_t *env, jk_bean_t *bean, char *name ) { + jk_endpoint_t *ep=(jk_endpoint_t *)bean->object; + + if( strcmp( name, "channel" )==0 ) { + return ep->worker->channel->mbean->name; + } else if (strcmp( name, "active" )==0 ) { + if( ep->currentRequest != NULL ) + return ep->currentRequest->req_uri; + } else { + return NULL; + } return NULL; } @@ -107,7 +119,8 @@ e->request = jk2_msg_ajp_create( env, e->pool, 0); e->reply = jk2_msg_ajp_create( env, e->pool, 0); e->post = jk2_msg_ajp_create( env, e->pool, 0); - + result->getAttributeInfo=myAttInfo; + result->getAttribute= jk2_endpoint_getAttribute; e->reuse = JK_FALSE; e->cPool=endpointPool->create(env, endpointPool, HUGE_POOL_SIZE ); 1.10 +2 -1 jakarta-tomcat-connectors/jk/native2/common/jk_msg_ajp.c Index: jk_msg_ajp.c =================================================================== RCS file: /home/cvs/jakarta-tomcat-connectors/jk/native2/common/jk_msg_ajp.c,v retrieving revision 1.9 retrieving revision 1.10 diff -u -r1.9 -r1.10 --- jk_msg_ajp.c 3 May 2002 17:41:52 -0000 1.9 +++ jk_msg_ajp.c 
3 May 2002 22:12:17 -0000 1.10 @@ -60,7 +60,7 @@ * Author: Costin Manolache * Author: Gal Shachor <[EMAIL PROTECTED]> * * Author: Henri Gomez <[EMAIL PROTECTED]> * - * Version: $Revision: 1.9 $ * + * Version: $Revision: 1.10 $ * ***************************************************************************/ #include "jk_pool.h" @@ -379,6 +379,7 @@ env->l->jkLog(env, env->l, JK_LOG_ERROR, "msgAjp.receive(): Bad signature %x%x\n", head[0], head[1]); + msg->dump( env, msg, "BAD MESSAGE: " ); return -1; } 1.14 +2 -2 jakarta-tomcat-connectors/jk/native2/common/jk_requtil.c Index: jk_requtil.c =================================================================== RCS file: /home/cvs/jakarta-tomcat-connectors/jk/native2/common/jk_requtil.c,v retrieving revision 1.13 retrieving revision 1.14 diff -u -r1.13 -r1.14 --- jk_requtil.c 3 May 2002 18:44:03 -0000 1.13 +++ jk_requtil.c 3 May 2002 22:12:17 -0000 1.14 @@ -288,8 +288,8 @@ break; default: - env->l->jkLog(env, env->l, JK_LOG_INFO, - "requtil.getHeaderId() long header %s\n", header_name); +/* env->l->jkLog(env, env->l, JK_LOG_INFO, */ +/* "requtil.getHeaderId() long header %s\n", header_name); */ return JK_ERR; } 1.34 +4 -3 jakarta-tomcat-connectors/jk/native2/common/jk_workerEnv.c Index: jk_workerEnv.c =================================================================== RCS file: /home/cvs/jakarta-tomcat-connectors/jk/native2/common/jk_workerEnv.c,v retrieving revision 1.33 retrieving revision 1.34 diff -u -r1.33 -r1.34 --- jk_workerEnv.c 3 May 2002 17:45:57 -0000 1.33 +++ jk_workerEnv.c 3 May 2002 22:12:17 -0000 1.34 @@ -59,7 +59,7 @@ * Description: Workers controller * * Author: Gal Shachor <[EMAIL PROTECTED]> * * Author: Henri Gomez <[EMAIL PROTECTED]> * - * Version: $Revision: 1.33 $ * + * Version: $Revision: 1.34 $ * ***************************************************************************/ #include "jk_env.h" @@ -387,7 +387,8 @@ handler=NULL; env->l->jkLog(env, env->l, JK_LOG_INFO, - "ajp14.processCallbacks() Waiting 
reply\n"); + "ajp14.processCallbacks() Waiting reply %s\n", + ep->worker->channel->mbean->name); msg->reset(env, msg); rc= ep->worker->channel->recv( env, ep->worker->channel, ep, @@ -399,7 +400,7 @@ return JK_ERR; } - ep->reply->dump(env, ep->reply, "Received "); + /* ep->reply->dump(env, ep->reply, "Received "); */ code = (int)msg->getByte(env, msg); rc=jk2_workerEnv_dispatch( env, wEnv, req, ep, code, msg ); 1.13 +14 -6 jakarta-tomcat-connectors/jk/native2/common/jk_worker_ajp13.c Index: jk_worker_ajp13.c =================================================================== RCS file: /home/cvs/jakarta-tomcat-connectors/jk/native2/common/jk_worker_ajp13.c,v retrieving revision 1.12 retrieving revision 1.13 diff -u -r1.12 -r1.13 --- jk_worker_ajp13.c 3 May 2002 17:47:31 -0000 1.12 +++ jk_worker_ajp13.c 3 May 2002 22:12:17 -0000 1.13 @@ -87,7 +87,7 @@ /* -------------------- Impl -------------------- */ static char *myAttInfo[]={ "lb_factor", "lb_value", "reqCnt", "errCnt", "route", "errorState", "recovering", - "epCount", NULL }; + "epCount", "errorTime", NULL }; static void * JK_METHOD jk2_worker_ajp14_getAttribute(jk_env_t *env, jk_bean_t *bean, char *name ) { jk_worker_t *worker=(jk_worker_t *)bean->object; @@ -99,6 +99,10 @@ return worker->channelName; } else if (strcmp( name, "route" )==0 ) { return worker->route; + } else if (strcmp( name, "errorTime" )==0 ) { + char *buf=env->tmpPool->calloc( env, env->tmpPool, 20 ); + sprintf( buf, "%d", worker->error_time ); + return buf; } else if (strcmp( name, "lb_value" )==0 ) { char *buf=env->tmpPool->calloc( env, env->tmpPool, 20 ); sprintf( buf, "%f", worker->lb_value ); @@ -381,7 +385,7 @@ } env->l->jkLog(env, env->l, JK_LOG_INFO, - "ajp14.service() processing callbacks\n"); + "ajp14.service() processing callbacks %s\n", e->worker->channel->mbean->name); err = e->worker->workerEnv->processCallbacks(env, e->worker->workerEnv, e, s); @@ -506,10 +510,10 @@ jk_worker_t *w; w= e->worker; - + if( e->cPool != NULL ) 
e->cPool->reset(env, e->cPool); - if (w->endpointCache != NULL ) { + if (! w->in_error_state && w->endpointCache != NULL ) { int err=0; err=w->endpointCache->put( env, w->endpointCache, e ); if( err==JK_OK ) { @@ -548,7 +552,8 @@ if (e!=NULL) { env->l->jkLog(env, env->l, JK_LOG_INFO, - "ajp14.getEndpoint(): Reusing endpoint\n"); + "ajp14.getEndpoint(): Reusing endpoint %s %s\n", + e->mbean->name, e->worker->mbean->name); *eP = e; return JK_OK; } @@ -579,6 +584,9 @@ err=jk2_worker_ajp14_service1( env, w, s, e ); + if( err!=JK_OK ) { + w->in_error_state=JK_TRUE; + } jk2_worker_ajp14_done( env, w, e); return err; } @@ -725,6 +733,7 @@ w->service = jk2_worker_ajp14_service; result->setAttribute= jk2_worker_ajp14_setAttribute; + result->getAttributeInfo=myAttInfo; result->getAttribute= jk2_worker_ajp14_getAttribute; result->object = w; w->mbean=result; @@ -732,7 +741,6 @@ w->workerEnv=env->getByName( env, "workerEnv" ); w->workerEnv->addWorker( env, w->workerEnv, w ); - result->getAttributeInfo=myAttInfo; return JK_OK; } 1.8 +53 -39 jakarta-tomcat-connectors/jk/native2/common/jk_worker_lb.c Index: jk_worker_lb.c =================================================================== RCS file: /home/cvs/jakarta-tomcat-connectors/jk/native2/common/jk_worker_lb.c,v retrieving revision 1.7 retrieving revision 1.8 diff -u -r1.7 -r1.8 --- jk_worker_lb.c 3 May 2002 18:23:26 -0000 1.7 +++ jk_worker_lb.c 3 May 2002 22:12:17 -0000 1.8 @@ -74,7 +74,8 @@ #define DEFAULT_LB_FACTOR (1.0) /* Time to wait before retry... 
*/ -#define WAIT_BEFORE_RECOVER (60*1) +/* XXX make it longer - debugging only */ +#define WAIT_BEFORE_RECOVER (5) #define ADDITINAL_WAIT_LOAD (20) @@ -132,21 +133,28 @@ /** Get one worker that is ready */ for(i = 0 ; i < lb->num_of_workers ; i++) { if(lb->lb_workers[i]->in_error_state) { - if(!lb->lb_workers[i]->in_recovering) { - if( now==0 ) - now = time(NULL); + /* Check if it's ready for recovery */ + /* if(!lb->lb_workers[i]->in_recovering) { */ + if( now==0 ) + now = time(NULL); - if((now - lb->lb_workers[i]->error_time) > WAIT_BEFORE_RECOVER) { - - lb->lb_workers[i]->in_recovering = JK_TRUE; - lb->lb_workers[i]->error_time = now; - lb->lb_workers[i]->retry_count++; - rc = lb->lb_workers[i]; + if((now - lb->lb_workers[i]->error_time) > WAIT_BEFORE_RECOVER) { + env->l->jkLog(env, env->l, JK_LOG_ERROR, + "lb.getWorker() timeout expired, reenable again %s\n", lb-> + lb_workers[i]->mbean->name); + + lb->lb_workers[i]->in_recovering = JK_TRUE; + lb->lb_workers[i]->in_error_state=JK_FALSE; + /* lb->lb_workers[i]->error_time = now; */ + /* lb->lb_workers[i]->retry_count++; */ + /* rc = lb->lb_workers[i]; */ - break; - } + /* Don't give bigger priority to recovered workers + break; + */ } - } else { + } + if( ! lb->lb_workers[i]->in_error_state ) { if(lb->lb_workers[i]->lb_value == 0 ) { /* That's the 'default' worker, it'll take all requests. * All other workers are not used unless this is in error state. @@ -160,22 +168,25 @@ if(lb->lb_workers[i]->lb_value < lb_min || ( rc==NULL ) ) { lb_min = lb->lb_workers[i]->lb_value; - rc = lb->lb_workers[i]; + rc = lb->lb_workers[i]; } - } + } } - + if ( rc==NULL ) { /* no workers found (rc is null), now try as hard as possible to get a worker anyway, pick one with largest error time.. 
*/ + env->l->jkLog(env, env->l, JK_LOG_ERROR, + "lb.getWorker() All workers in error state, use the one with oldest error\n"); + for(i = 0 ; i < lb->num_of_workers ; i++) { - if(lb->lb_workers[i]->in_error_state) { - if(!lb->lb_workers[i]->in_recovering) { +/* if(lb->lb_workers[i]->in_error_state) { */ +/* if(!lb->lb_workers[i]->in_recovering) { */ /* if the retry count is zero, that means the worker only failed once, this is to e that the failed worker will not continue to be retried over and over again. */ - if ( lb->lb_workers[i]->retry_count == 0 ) { +/* if ( lb->lb_workers[i]->retry_count == 0 ) { */ if ( rc != NULL ) { /* pick the oldest failed worker */ if ( lb->lb_workers[i]->error_time < rc->error_time ) { @@ -184,24 +195,25 @@ } else { rc = lb->lb_workers[i]; } - } - } - } else { - /* This is a good worker - it may have come to life */ - if(lb->lb_workers[i]->lb_value < lb_min || rc != NULL) { - lb_min = lb->lb_workers[i]->lb_value; - rc = lb->lb_workers[i]; - break; - } - } +/* } */ +/* } */ +/* } else { */ + /* This is a good worker - it may have come to life */ +/* if(lb->lb_workers[i]->lb_value < lb_min || rc != NULL) { */ +/* lb_min = lb->lb_workers[i]->lb_value; */ +/* rc = lb->lb_workers[i]; */ +/* break; */ +/* } */ +/* } */ } - + if ( rc && rc->in_error_state ) { - if(now==0) - now = time(0); +/* if(now==0) */ +/* now = time(0); */ rc->in_recovering = JK_TRUE; - rc->error_time = now; - rc->retry_count++; + rc->in_error_state = JK_FALSE; +/* rc->error_time = now; */ +/* rc->retry_count++; */ } } @@ -299,9 +311,9 @@ s->realWorker=NULL; /* reset all the retry counts to 0. XXX may be a problem if we have many workers ? */ - for(i = 0 ; i < lb->num_of_workers ; i++) { - lb->lb_workers[i]->retry_count = 0; - } +/* for(i = 0 ; i < lb->num_of_workers ; i++) { */ +/* lb->lb_workers[i]->retry_count = 0; */ +/* } */ if( wEnv->shm != NULL && wEnv->shm->head != NULL ) { /* We have shm, let's check for updates. 
This is just checking one @@ -344,8 +356,8 @@ env->l->jkLog(env, env->l, JK_LOG_INFO, "lb.service() try %s\n", rec->mbean->name ); - s->jvm_route = s->pool->pstrdup(env, s->pool, rec->mbean->name); - + s->jvm_route = rec->route; + rec->reqCnt++; rc = rec->service(env, rec, s); @@ -366,6 +378,8 @@ return JK_OK; } + env->l->jkLog(env, env->l, JK_LOG_ERROR, + "lb.service() worker failed\n"); /* * Service failed !!! *
-- To unsubscribe, e-mail: <mailto:[EMAIL PROTECTED]> For additional commands, e-mail: <mailto:[EMAIL PROTECTED]>