Author: mturk Date: Fri Apr 11 02:13:07 2008 New Revision: 647085 URL: http://svn.apache.org/viewvc?rev=647085&view=rev Log: Fix lb node in-error setting (see changelog)
Modified: tomcat/connectors/trunk/jk/native/common/jk_lb_worker.c tomcat/connectors/trunk/jk/native/common/jk_lb_worker.h tomcat/connectors/trunk/jk/native/common/jk_shm.h tomcat/connectors/trunk/jk/xdocs/miscellaneous/changelog.xml Modified: tomcat/connectors/trunk/jk/native/common/jk_lb_worker.c URL: http://svn.apache.org/viewvc/tomcat/connectors/trunk/jk/native/common/jk_lb_worker.c?rev=647085&r1=647084&r2=647085&view=diff ============================================================================== --- tomcat/connectors/trunk/jk/native/common/jk_lb_worker.c (original) +++ tomcat/connectors/trunk/jk/native/common/jk_lb_worker.c Fri Apr 11 02:13:07 2008 @@ -50,8 +50,8 @@ * Note: activation == JK_LB_ACTIVATION_ACTIVE is equivalent to * activation is none of JK_LB_ACTIVATION_STOPPED, JK_LB_ACTIVATION_DISABLED */ -#define JK_WORKER_USABLE(s, activation) ((s)->state <= JK_LB_STATE_FORCE && activation == JK_LB_ACTIVATION_ACTIVE) -#define JK_WORKER_USABLE_STICKY(s, activation) ((s)->state <= JK_LB_STATE_BUSY && activation != JK_LB_ACTIVATION_STOPPED) +#define JK_WORKER_USABLE(s, activation) ((s) <= JK_LB_STATE_FORCE && activation == JK_LB_ACTIVATION_ACTIVE) +#define JK_WORKER_USABLE_STICKY(s, activation) ((s) <= JK_LB_STATE_BUSY && activation != JK_LB_ACTIVATION_STOPPED) static const char *lb_locking_type[] = { JK_LB_LOCK_TEXT_OPTIMISTIC, @@ -657,7 +657,7 @@ static int find_by_session(jk_ws_service_t *s, lb_worker_t *p, - const char *name, + const char *name, jk_logger_t *l) { @@ -676,6 +676,7 @@ static int find_best_bydomain(jk_ws_service_t *s, lb_worker_t *p, const char *domain, + int *states, jk_logger_t *l) { unsigned int i; @@ -701,7 +702,7 @@ JK_LB_ACTIVATION_UNSET; if (activation == JK_LB_ACTIVATION_UNSET) activation = wr.activation; - if (JK_WORKER_USABLE(wr.s, activation)) { + if (JK_WORKER_USABLE(states[wr.i], activation)) { if (candidate < 0 || wr.distance < d || (wr.s->lb_value < curmin && wr.distance == d)) { @@ -718,6 +719,7 @@ static int 
find_best_byvalue(jk_ws_service_t *s, lb_worker_t *p, + int *states, jk_logger_t *l) { unsigned int i; @@ -746,7 +748,7 @@ /* Take into calculation only the workers that are * not in error state, stopped, disabled or busy. */ - if (JK_WORKER_USABLE(wr.s, activation)) { + if (JK_WORKER_USABLE(states[wr.i], activation)) { if (candidate < 0 || wr.distance < d || (wr.s->lb_value < curmin && wr.distance == d)) { @@ -763,6 +765,7 @@ static int find_bysession_route(jk_ws_service_t *s, lb_worker_t *p, const char *name, + int *states, jk_logger_t *l) { int uses_domain = 0; @@ -771,7 +774,7 @@ candidate = find_by_session(s, p, name, l); if (candidate < 0) { uses_domain = 1; - candidate = find_best_bydomain(s, p, name, l); + candidate = find_best_bydomain(s, p, name, states, l); } if (candidate >= 0) { lb_sub_worker_t wr = p->lb_workers[candidate]; @@ -783,7 +786,7 @@ JK_LB_ACTIVATION_UNSET; if (activation == JK_LB_ACTIVATION_UNSET) activation = wr.activation; - if (!JK_WORKER_USABLE_STICKY(wr.s, activation)) { + if (!JK_WORKER_USABLE_STICKY(states[wr.i], activation)) { /* We have a worker that is error state or stopped. * If it has a redirection set use that redirection worker. 
* This enables to safely remove the member from the @@ -797,7 +800,7 @@ s->route = NULL; } else if (*wr.domain && !uses_domain) { - candidate = find_best_bydomain(s, p, wr.domain, l); + candidate = find_best_bydomain(s, p, wr.domain, states, l); s->route = wr.domain; } if (candidate >= 0) { @@ -807,7 +810,7 @@ JK_LB_ACTIVATION_UNSET; if (activation == JK_LB_ACTIVATION_UNSET) activation = wr.activation; - if (!JK_WORKER_USABLE_STICKY(wr.s, activation)) + if (!JK_WORKER_USABLE_STICKY(states[wr.i], activation)) candidate = -1; } } @@ -817,6 +820,7 @@ static int find_failover_worker(jk_ws_service_t *s, lb_worker_t * p, + int *states, jk_logger_t *l) { int rc = -1; @@ -830,26 +834,28 @@ } } if (redirect) - rc = find_bysession_route(s, p, redirect, l); + rc = find_bysession_route(s, p, redirect, states, l); return rc; } static int find_best_worker(jk_ws_service_t *s, lb_worker_t * p, + int *states, jk_logger_t *l) { int rc = -1; - rc = find_best_byvalue(s, p, l); + rc = find_best_byvalue(s, p, states, l); /* By default use worker route as session route */ if (rc < 0) - rc = find_failover_worker(s, p, l); + rc = find_failover_worker(s, p, states, l); return rc; } static lb_sub_worker_t *get_most_suitable_worker(jk_ws_service_t *s, lb_worker_t * p, char *sessionid, + int *states, jk_logger_t *l) { int rc = -1; @@ -865,7 +871,7 @@ JK_LB_ACTIVATION_UNSET; if (activation == JK_LB_ACTIVATION_UNSET) activation = p->lb_workers[0].activation; - if (JK_WORKER_USABLE_STICKY(p->lb_workers[0].s, activation)) { + if (JK_WORKER_USABLE_STICKY(states[0], activation)) { if (activation != JK_LB_ACTIVATION_DISABLED) { JK_TRACE_EXIT(l); return p->lb_workers; @@ -907,7 +913,7 @@ session_route); /* We have a session route. Whow! 
*/ - rc = find_bysession_route(s, p, session_route, l); + rc = find_bysession_route(s, p, session_route, states, l); if (rc >= 0) { lb_sub_worker_t *wr = &(p->lb_workers[rc]); if (p->lblock == JK_LB_LOCK_PESSIMISTIC) @@ -940,7 +946,7 @@ return NULL; } } - rc = find_best_worker(s, p, l); + rc = find_best_worker(s, p, states, l); if (p->lblock == JK_LB_LOCK_PESSIMISTIC) jk_shm_unlock(); else { @@ -1014,6 +1020,8 @@ int recoverable = JK_TRUE; int rc = JK_UNSET; char *sessionid = NULL; + int *states = NULL; + int i; JK_TRACE_ENTER(l); @@ -1030,11 +1038,22 @@ /* Set returned error to OK */ *is_error = JK_HTTP_OK; + if (!(states = (int *)malloc(num_of_workers * sizeof(int)))) { + *is_error = JK_HTTP_SERVER_ERROR; + jk_log(l, JK_LOG_ERROR, + "Failed allocating private worker state memory"); + JK_TRACE_EXIT(l); + return JK_SERVER_ERROR; + } jk_shm_lock(); if (p->worker->sequence != p->worker->s->h.sequence) jk_lb_pull(p->worker, l); jk_shm_unlock(); + for (i = 0; i < num_of_workers; i++) { + /* Copy the shared state info */ + states[i] = p->worker->lb_workers[i].s->state; + } /* set the recovery post, for LB mode */ s->reco_buf = jk_b_new(s->pool); @@ -1068,7 +1087,7 @@ while (attempt <= num_of_workers && recoverable == JK_TRUE) { lb_sub_worker_t *rec = - get_most_suitable_worker(s, p->worker, sessionid, l); + get_most_suitable_worker(s, p->worker, sessionid, states, l); rc = JK_FALSE; *is_error = JK_HTTP_SERVER_BUSY; /* Do not reuse previous worker, because @@ -1093,8 +1112,10 @@ if (p->worker->lblock == JK_LB_LOCK_PESSIMISTIC) jk_shm_lock(); - if (rec->s->state == JK_LB_STATE_RECOVER) - rec->s->state = JK_LB_STATE_PROBE; + if (rec->s->state == JK_LB_STATE_RECOVER) { + rec->s->state = JK_LB_STATE_PROBE; + states[rec->i] = JK_LB_STATE_PROBE; + } if (p->worker->lblock == JK_LB_LOCK_PESSIMISTIC) jk_shm_unlock(); @@ -1124,8 +1145,10 @@ */ if (p->worker->lblock == JK_LB_LOCK_PESSIMISTIC) jk_shm_lock(); - if (rec->s->state != JK_LB_STATE_ERROR) - rec->s->state = 
JK_LB_STATE_BUSY; + if (rec->s->state != JK_LB_STATE_ERROR) { + rec->s->state = JK_LB_STATE_BUSY; + states[rec->i] = JK_LB_STATE_BUSY; + } if (p->worker->lblock == JK_LB_LOCK_PESSIMISTIC) jk_shm_unlock(); jk_log(l, JK_LOG_INFO, @@ -1146,6 +1169,7 @@ /* Increment the number of workers serving request */ p->worker->s->busy++; + rec->s->busy++; if (p->worker->s->busy > p->worker->s->max_busy) p->worker->s->max_busy = p->worker->s->busy; if ( (p->worker->lbmethod == JK_LB_METHOD_REQUESTS) || @@ -1194,14 +1218,18 @@ /* When returning the endpoint mark the worker as not busy. * We have at least one endpoint free */ - if (rec->s->state == JK_LB_STATE_BUSY) - rec->s->state = JK_LB_STATE_OK; + if (rec->s->state == JK_LB_STATE_BUSY) { + rec->s->state = JK_LB_STATE_OK; + states[rec->i] = JK_LB_STATE_OK; + } /* Decrement the busy worker count. * Check if the busy was reset to zero by graceful * restart of the server. */ if (p->worker->s->busy) p->worker->s->busy--; + if (rec->s->busy) + rec->s->busy--; if (service_stat == JK_TRUE) { rec->s->state = JK_LB_STATE_OK; rec->s->error_time = 0; @@ -1213,7 +1241,8 @@ * Client error !!! * Since this is bad request do not fail over. */ - rec->s->state = JK_LB_STATE_OK; + rec->s->state = JK_LB_STATE_OK; + states[rec->i] = JK_LB_STATE_ERROR; rec->s->error_time = 0; rc = JK_CLIENT_ERROR; recoverable = JK_FALSE; @@ -1224,7 +1253,8 @@ * Don't mark the node as bad. * Failing over to another node could help. */ - rec->s->state = JK_LB_STATE_OK; + rec->s->state = JK_LB_STATE_OK; + states[rec->i] = JK_LB_STATE_ERROR; rec->s->error_time = 0; rc = JK_FALSE; } @@ -1234,7 +1264,8 @@ * Don't mark the node as bad. * Failing over to another node could help. */ - rec->s->state = JK_LB_STATE_OK; + rec->s->state = JK_LB_STATE_OK; + states[rec->i] = JK_LB_STATE_ERROR; rec->s->error_time = 0; rc = JK_FALSE; } @@ -1245,46 +1276,66 @@ * Failing over to another node could help. 
*/ rec->s->errors++; - rec->s->state = JK_LB_STATE_ERROR; + if (rec->s->busy) { + rec->s->state = JK_LB_STATE_OK; + } + else { + rec->s->state = JK_LB_STATE_ERROR; + } + states[rec->i] = JK_LB_STATE_ERROR; rec->s->error_time = time(NULL); rc = JK_FALSE; } else if (service_stat == JK_REPLY_TIMEOUT) { if (aw->s->reply_timeouts > (unsigned)p->worker->max_reply_timeouts) { /* - * Service failed - to many reply timeouts - * Take this node out of service. - */ + * Service failed - to many reply timeouts + * Take this node out of service. + */ rec->s->errors++; - rec->s->state = JK_LB_STATE_ERROR; + if (rec->s->busy) { + rec->s->state = JK_LB_STATE_OK; + } + else { + rec->s->state = JK_LB_STATE_ERROR; + } + states[rec->i] = JK_LB_STATE_ERROR; rec->s->error_time = time(NULL); } else { - /* - * XXX: if we want to be able to failover - * to other nodes after a reply timeout, - * but we do not put the timeout node into error, - * how can we make sure, that we actually fail over - * to other nodes? - */ - rec->s->state = JK_LB_STATE_OK; + /* + * XXX: if we want to be able to failover + * to other nodes after a reply timeout, + * but we do not put the timeout node into error, + * how can we make sure, that we actually fail over + * to other nodes? + */ + rec->s->state = JK_LB_STATE_OK; + states[rec->i] = JK_LB_STATE_ERROR; rec->s->error_time = 0; } rc = JK_FALSE; } else { /* - * Service failed !!! - * Time for fault tolerance (if possible)... - */ + * Service failed !!! + * Time for fault tolerance (if possible)... 
+ */ rec->s->errors++; - rec->s->state = JK_LB_STATE_ERROR; + if (rec->s->busy) { + rec->s->state = JK_LB_STATE_OK; + } + else { + rec->s->state = JK_LB_STATE_ERROR; + } + states[rec->i] = JK_LB_STATE_ERROR; rec->s->error_time = time(NULL); rc = JK_FALSE; } - if (rec->s->state == JK_LB_STATE_ERROR) + if (states[rec->i] == JK_LB_STATE_ERROR) jk_log(l, JK_LOG_INFO, - "service failed, worker %s is in error state", + "service failed, %sworker %s is in error state", + rec->s->state == JK_LB_STATE_ERROR ? "entire " : "", rec->name); if (p->worker->lblock == JK_LB_LOCK_PESSIMISTIC) jk_shm_unlock(); @@ -1368,6 +1419,8 @@ lb_add_log_items(s, lb_last_log_names, prec, l); } + /* Free any private memory used */ + free(states); JK_TRACE_EXIT(l); return rc; } @@ -1434,6 +1487,8 @@ for (i = 0; i < num_of_workers; i++) { const char *s; unsigned int ms; + + p->lb_workers[i].i = i; strncpy(p->lb_workers[i].name, worker_names[i], JK_SHM_STR_SIZ); strncpy(p->lb_workers[i].s->h.name, worker_names[i], Modified: tomcat/connectors/trunk/jk/native/common/jk_lb_worker.h URL: http://svn.apache.org/viewvc/tomcat/connectors/trunk/jk/native/common/jk_lb_worker.h?rev=647085&r1=647084&r2=647085&view=diff ============================================================================== --- tomcat/connectors/trunk/jk/native/common/jk_lb_worker.h (original) +++ tomcat/connectors/trunk/jk/native/common/jk_lb_worker.h Fri Apr 11 02:13:07 2008 @@ -158,6 +158,8 @@ int activation; /* Current lb factor */ int lb_factor; + /* Current worker index */ + int i; /* Current lb reciprocal factor */ jk_uint64_t lb_mult; }; Modified: tomcat/connectors/trunk/jk/native/common/jk_shm.h URL: http://svn.apache.org/viewvc/tomcat/connectors/trunk/jk/native/common/jk_shm.h?rev=647085&r1=647084&r2=647085&view=diff ============================================================================== --- tomcat/connectors/trunk/jk/native/common/jk_shm.h (original) +++ tomcat/connectors/trunk/jk/native/common/jk_shm.h Fri Apr 11 
02:13:07 2008 @@ -124,6 +124,8 @@ char domain[JK_SHM_STR_SIZ+1]; /* worker redirect route */ char redirect[JK_SHM_STR_SIZ+1]; + /* Number of currently busy channels */ + volatile int busy; /* worker distance */ volatile int distance; /* current activation state (config) of the worker */ Modified: tomcat/connectors/trunk/jk/xdocs/miscellaneous/changelog.xml URL: http://svn.apache.org/viewvc/tomcat/connectors/trunk/jk/xdocs/miscellaneous/changelog.xml?rev=647085&r1=647084&r2=647085&view=diff ============================================================================== --- tomcat/connectors/trunk/jk/xdocs/miscellaneous/changelog.xml (original) +++ tomcat/connectors/trunk/jk/xdocs/miscellaneous/changelog.xml Fri Apr 11 02:13:07 2008 @@ -48,6 +48,15 @@ preprocessor defines. (rjung) </update> <fix> + Do not put a load balancer node into the error state if it has an + open channel. This fixes the bug where a new connection failing + because the node is busy caused already-open connections to lose + stickiness. This brings back the per-node busy counter and a + private state array for each request. We can mark the private state + as error so that failover works, while still operating and + reporting the node as OK if it has open working connections. (mturk) + </fix> + <fix> <bug>44738</bug>: Fix merging of JkOption ForwardURI* between virtual hosts. Patch contributed by Toshihiro Sasajima. (rjung) </fix> --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]