changelog.xml

mturk Fri, 11 Apr 2008 02:13:51 -0700

Author: mturk
Date: Fri Apr 11 02:13:07 2008
New Revision: 647085

URL: http://svn.apache.org/viewvc?rev=647085&view=rev
Log:
Fix lb node in-error setting (see changelog)


Modified:
    tomcat/connectors/trunk/jk/native/common/jk_lb_worker.c
    tomcat/connectors/trunk/jk/native/common/jk_lb_worker.h
    tomcat/connectors/trunk/jk/native/common/jk_shm.h
    tomcat/connectors/trunk/jk/xdocs/miscellaneous/changelog.xml

Modified: tomcat/connectors/trunk/jk/native/common/jk_lb_worker.c
URL: 
http://svn.apache.org/viewvc/tomcat/connectors/trunk/jk/native/common/jk_lb_worker.c?rev=647085&r1=647084&r2=647085&view=diff
==============================================================================
--- tomcat/connectors/trunk/jk/native/common/jk_lb_worker.c (original)
+++ tomcat/connectors/trunk/jk/native/common/jk_lb_worker.c Fri Apr 11 02:13:07 
2008
@@ -50,8 +50,8 @@
  * Note: activation == JK_LB_ACTIVATION_ACTIVE is equivalent to
  *       activation is none of JK_LB_ACTIVATION_STOPPED, 
JK_LB_ACTIVATION_DISABLED
  */
-#define JK_WORKER_USABLE(s, activation)   ((s)->state <= JK_LB_STATE_FORCE && 
activation == JK_LB_ACTIVATION_ACTIVE)
-#define JK_WORKER_USABLE_STICKY(s, activation)   ((s)->state <= 
JK_LB_STATE_BUSY && activation != JK_LB_ACTIVATION_STOPPED)
+#define JK_WORKER_USABLE(s, activation)   ((s) <= JK_LB_STATE_FORCE && 
activation == JK_LB_ACTIVATION_ACTIVE)
+#define JK_WORKER_USABLE_STICKY(s, activation)   ((s) <= JK_LB_STATE_BUSY && 
activation != JK_LB_ACTIVATION_STOPPED)
 
 static const char *lb_locking_type[] = {
     JK_LB_LOCK_TEXT_OPTIMISTIC,
@@ -657,7 +657,7 @@
 
 static int find_by_session(jk_ws_service_t *s,
                            lb_worker_t *p,
-                           const char *name,
+                           const char *name,                           
                            jk_logger_t *l)
 {
 
@@ -676,6 +676,7 @@
 static int find_best_bydomain(jk_ws_service_t *s,
                               lb_worker_t *p,
                               const char *domain,
+                              int *states,
                               jk_logger_t *l)
 {
     unsigned int i;
@@ -701,7 +702,7 @@
                      JK_LB_ACTIVATION_UNSET;
         if (activation == JK_LB_ACTIVATION_UNSET)
             activation = wr.activation;
-        if (JK_WORKER_USABLE(wr.s, activation)) {
+        if (JK_WORKER_USABLE(states[wr.i], activation)) {
             if (candidate < 0 || wr.distance < d ||
                 (wr.s->lb_value < curmin &&
                 wr.distance == d)) {
@@ -718,6 +719,7 @@
 
 static int find_best_byvalue(jk_ws_service_t *s,
                              lb_worker_t *p,
+                             int *states,
                              jk_logger_t *l)
 {
     unsigned int i;
@@ -746,7 +748,7 @@
         /* Take into calculation only the workers that are
          * not in error state, stopped, disabled or busy.
          */
-        if (JK_WORKER_USABLE(wr.s, activation)) {
+        if (JK_WORKER_USABLE(states[wr.i], activation)) {
             if (candidate < 0 || wr.distance < d ||
                 (wr.s->lb_value < curmin &&
                 wr.distance == d)) {
@@ -763,6 +765,7 @@
 static int find_bysession_route(jk_ws_service_t *s,
                                 lb_worker_t *p,
                                 const char *name,
+                                int *states,
                                 jk_logger_t *l)
 {
     int uses_domain  = 0;
@@ -771,7 +774,7 @@
     candidate = find_by_session(s, p, name, l);
     if (candidate < 0) {
         uses_domain = 1;
-        candidate = find_best_bydomain(s, p, name, l);
+        candidate = find_best_bydomain(s, p, name, states, l);
     }
     if (candidate >= 0) {
         lb_sub_worker_t wr = p->lb_workers[candidate];
@@ -783,7 +786,7 @@
                      JK_LB_ACTIVATION_UNSET;
         if (activation == JK_LB_ACTIVATION_UNSET)
             activation = wr.activation;
-        if (!JK_WORKER_USABLE_STICKY(wr.s, activation)) {
+        if (!JK_WORKER_USABLE_STICKY(states[wr.i], activation)) {
             /* We have a worker that is error state or stopped.
              * If it has a redirection set use that redirection worker.
              * This enables to safely remove the member from the
@@ -797,7 +800,7 @@
                 s->route = NULL;
             }
             else if (*wr.domain && !uses_domain) {
-                candidate = find_best_bydomain(s, p, wr.domain, l);
+                candidate = find_best_bydomain(s, p, wr.domain, states, l);
                 s->route = wr.domain;
             }
             if (candidate >= 0) {
@@ -807,7 +810,7 @@
                              JK_LB_ACTIVATION_UNSET;
                 if (activation == JK_LB_ACTIVATION_UNSET)
                     activation = wr.activation;
-                if (!JK_WORKER_USABLE_STICKY(wr.s, activation))
+                if (!JK_WORKER_USABLE_STICKY(states[wr.i], activation))
                     candidate = -1;
             }
         }
@@ -817,6 +820,7 @@
 
 static int find_failover_worker(jk_ws_service_t *s,
                                 lb_worker_t * p,
+                                int *states,
                                 jk_logger_t *l)
 {
     int rc = -1;
@@ -830,26 +834,28 @@
         }
     }
     if (redirect)
-        rc = find_bysession_route(s, p, redirect, l);
+        rc = find_bysession_route(s, p, redirect, states, l);
     return rc;
 }
 
 static int find_best_worker(jk_ws_service_t *s,
                             lb_worker_t * p,
+                            int *states,
                             jk_logger_t *l)
 {
     int rc = -1;
 
-    rc = find_best_byvalue(s, p, l);
+    rc = find_best_byvalue(s, p, states, l);
     /* By default use worker route as session route */
     if (rc < 0)
-        rc = find_failover_worker(s, p, l);
+        rc = find_failover_worker(s, p, states, l);
     return rc;
 }
 
 static lb_sub_worker_t *get_most_suitable_worker(jk_ws_service_t *s,
                                                  lb_worker_t * p,
                                                  char *sessionid,
+                                                 int *states,
                                                  jk_logger_t *l)
 {
     int rc = -1;
@@ -865,7 +871,7 @@
                          JK_LB_ACTIVATION_UNSET;
         if (activation == JK_LB_ACTIVATION_UNSET)
             activation = p->lb_workers[0].activation;
-        if (JK_WORKER_USABLE_STICKY(p->lb_workers[0].s, activation)) {
+        if (JK_WORKER_USABLE_STICKY(states[0], activation)) {
             if (activation != JK_LB_ACTIVATION_DISABLED) {
                 JK_TRACE_EXIT(l);
                 return p->lb_workers;
@@ -907,7 +913,7 @@
                            session_route);
 
                 /* We have a session route. Whow! */
-                rc = find_bysession_route(s, p, session_route, l);
+                rc = find_bysession_route(s, p, session_route, states, l);
                 if (rc >= 0) {
                     lb_sub_worker_t *wr = &(p->lb_workers[rc]);
                     if (p->lblock == JK_LB_LOCK_PESSIMISTIC)
@@ -940,7 +946,7 @@
             return NULL;
         }
     }
-    rc = find_best_worker(s, p, l);
+    rc = find_best_worker(s, p, states, l);
     if (p->lblock == JK_LB_LOCK_PESSIMISTIC)
         jk_shm_unlock();
     else {
@@ -1014,6 +1020,8 @@
     int recoverable = JK_TRUE;
     int rc = JK_UNSET;
     char *sessionid = NULL;
+    int  *states = NULL;
+    int i;
 
     JK_TRACE_ENTER(l);
 
@@ -1030,11 +1038,22 @@
 
     /* Set returned error to OK */
     *is_error = JK_HTTP_OK;
+    if (!(states = (int *)malloc(num_of_workers * sizeof(int)))) {
+        *is_error = JK_HTTP_SERVER_ERROR;
+        jk_log(l, JK_LOG_ERROR,
+               "Failed allocating private worker state memory");
+        JK_TRACE_EXIT(l);
+        return JK_SERVER_ERROR;        
+    }
 
     jk_shm_lock();
     if (p->worker->sequence != p->worker->s->h.sequence)
         jk_lb_pull(p->worker, l);
     jk_shm_unlock();
+    for (i = 0; i < num_of_workers; i++) {
+        /* Copy the shared state info */
+        states[i] = p->worker->lb_workers[i].s->state;
+    }
 
     /* set the recovery post, for LB mode */
     s->reco_buf = jk_b_new(s->pool);
@@ -1068,7 +1087,7 @@
 
     while (attempt <= num_of_workers && recoverable == JK_TRUE) {
         lb_sub_worker_t *rec =
-            get_most_suitable_worker(s, p->worker, sessionid, l);
+            get_most_suitable_worker(s, p->worker, sessionid, states, l);
         rc = JK_FALSE;
         *is_error = JK_HTTP_SERVER_BUSY;
         /* Do not reuse previous worker, because
@@ -1093,8 +1112,10 @@
 
             if (p->worker->lblock == JK_LB_LOCK_PESSIMISTIC)
                 jk_shm_lock();
-            if (rec->s->state == JK_LB_STATE_RECOVER)
-                rec->s->state = JK_LB_STATE_PROBE;
+            if (rec->s->state == JK_LB_STATE_RECOVER) {
+                rec->s->state  = JK_LB_STATE_PROBE;
+                states[rec->i] = JK_LB_STATE_PROBE;
+            }
             if (p->worker->lblock == JK_LB_LOCK_PESSIMISTIC)
                 jk_shm_unlock();
                        
@@ -1124,8 +1145,10 @@
                  */
                 if (p->worker->lblock == JK_LB_LOCK_PESSIMISTIC)
                     jk_shm_lock();
-                if (rec->s->state != JK_LB_STATE_ERROR)
-                    rec->s->state = JK_LB_STATE_BUSY;
+                if (rec->s->state != JK_LB_STATE_ERROR) {
+                    rec->s->state  = JK_LB_STATE_BUSY;
+                    states[rec->i] = JK_LB_STATE_BUSY;
+                }
                 if (p->worker->lblock == JK_LB_LOCK_PESSIMISTIC)
                     jk_shm_unlock();
                 jk_log(l, JK_LOG_INFO,
@@ -1146,6 +1169,7 @@
 
                 /* Increment the number of workers serving request */
                 p->worker->s->busy++;
+                rec->s->busy++;
                 if (p->worker->s->busy > p->worker->s->max_busy)
                     p->worker->s->max_busy = p->worker->s->busy;
                 if ( (p->worker->lbmethod == JK_LB_METHOD_REQUESTS) ||
@@ -1194,14 +1218,18 @@
                 /* When returning the endpoint mark the worker as not busy.
                  * We have at least one endpoint free
                  */
-                if (rec->s->state == JK_LB_STATE_BUSY)
-                    rec->s->state = JK_LB_STATE_OK;
+                if (rec->s->state == JK_LB_STATE_BUSY) {
+                    rec->s->state  = JK_LB_STATE_OK;
+                    states[rec->i] = JK_LB_STATE_OK;
+                }
                 /* Decrement the busy worker count.
                  * Check if the busy was reset to zero by graceful
                  * restart of the server.
                  */
                 if (p->worker->s->busy)
                     p->worker->s->busy--;
+                if (rec->s->busy)
+                    rec->s->busy--;
                 if (service_stat == JK_TRUE) {
                     rec->s->state = JK_LB_STATE_OK;
                     rec->s->error_time = 0;
@@ -1213,7 +1241,8 @@
                     * Client error !!!
                     * Since this is bad request do not fail over.
                     */
-                    rec->s->state = JK_LB_STATE_OK;
+                    rec->s->state  = JK_LB_STATE_OK;
+                    states[rec->i] = JK_LB_STATE_ERROR;
                     rec->s->error_time = 0;
                     rc = JK_CLIENT_ERROR;
                     recoverable = JK_FALSE;
@@ -1224,7 +1253,8 @@
                     * Don't mark the node as bad.
                     * Failing over to another node could help.
                     */
-                    rec->s->state = JK_LB_STATE_OK;
+                    rec->s->state  = JK_LB_STATE_OK;
+                    states[rec->i] = JK_LB_STATE_ERROR;
                     rec->s->error_time = 0;
                     rc = JK_FALSE;
                 }
@@ -1234,7 +1264,8 @@
                     * Don't mark the node as bad.
                     * Failing over to another node could help.
                     */
-                    rec->s->state = JK_LB_STATE_OK;
+                    rec->s->state  = JK_LB_STATE_OK;
+                    states[rec->i] = JK_LB_STATE_ERROR;
                     rec->s->error_time = 0;
                     rc = JK_FALSE;
                 }
@@ -1245,46 +1276,66 @@
                     * Failing over to another node could help.
                     */
                     rec->s->errors++;
-                    rec->s->state = JK_LB_STATE_ERROR;
+                    if (rec->s->busy) {
+                        rec->s->state = JK_LB_STATE_OK;
+                    }
+                    else {
+                        rec->s->state = JK_LB_STATE_ERROR;
+                    }
+                    states[rec->i] = JK_LB_STATE_ERROR;
                     rec->s->error_time = time(NULL);
                     rc = JK_FALSE;
                 }
                 else if (service_stat == JK_REPLY_TIMEOUT) {
                     if (aw->s->reply_timeouts > 
(unsigned)p->worker->max_reply_timeouts) {
                         /*
-                        * Service failed - to many reply timeouts
-                        * Take this node out of service.
-                        */
+                         * Service failed - too many reply timeouts
+                         * Take this node out of service.
+                         */
                         rec->s->errors++;
-                        rec->s->state = JK_LB_STATE_ERROR;
+                        if (rec->s->busy) {
+                            rec->s->state = JK_LB_STATE_OK;
+                        }
+                        else {
+                            rec->s->state = JK_LB_STATE_ERROR;
+                        }
+                        states[rec->i] = JK_LB_STATE_ERROR;
                         rec->s->error_time = time(NULL);
                     }
                     else {
-                    /*
-                    * XXX: if we want to be able to failover
-                    * to other nodes after a reply timeout,
-                    * but we do not put the timeout node into error,
-                    * how can we make sure, that we actually fail over
-                    * to other nodes?
-                    */
-                        rec->s->state = JK_LB_STATE_OK;
+                        /*
+                         * XXX: if we want to be able to failover
+                         * to other nodes after a reply timeout,
+                         * but we do not put the timeout node into error,
+                         * how can we make sure, that we actually fail over
+                         * to other nodes?
+                         */
+                        rec->s->state  = JK_LB_STATE_OK;
+                        states[rec->i] = JK_LB_STATE_ERROR;
                         rec->s->error_time = 0;
                     }
                     rc = JK_FALSE;
                 }
                 else {
                     /*
-                    * Service failed !!!
-                    * Time for fault tolerance (if possible)...
-                    */
+                     * Service failed !!!
+                     * Time for fault tolerance (if possible)...
+                     */
                     rec->s->errors++;
-                    rec->s->state = JK_LB_STATE_ERROR;
+                    if (rec->s->busy) {
+                        rec->s->state = JK_LB_STATE_OK;
+                    }
+                    else {
+                        rec->s->state = JK_LB_STATE_ERROR;
+                    }
+                    states[rec->i] = JK_LB_STATE_ERROR;
                     rec->s->error_time = time(NULL);
                     rc = JK_FALSE;
                 }
-                if (rec->s->state == JK_LB_STATE_ERROR)
+                if (states[rec->i] == JK_LB_STATE_ERROR)
                     jk_log(l, JK_LOG_INFO,
-                           "service failed, worker %s is in error state",
+                           "service failed, %sworker %s is in error state",
+                           rec->s->state == JK_LB_STATE_ERROR ? "entire " : "",
                            rec->name);
                 if (p->worker->lblock == JK_LB_LOCK_PESSIMISTIC)
                     jk_shm_unlock();
@@ -1368,6 +1419,8 @@
         lb_add_log_items(s, lb_last_log_names, prec, l);
     }
 
+    /* Free any private memory used */
+    free(states);
     JK_TRACE_EXIT(l);
     return rc;
 }
@@ -1434,6 +1487,8 @@
             for (i = 0; i < num_of_workers; i++) {
                 const char *s;
                 unsigned int ms;
+
+                p->lb_workers[i].i = i;
                 strncpy(p->lb_workers[i].name, worker_names[i],
                         JK_SHM_STR_SIZ);
                 strncpy(p->lb_workers[i].s->h.name, worker_names[i],

Modified: tomcat/connectors/trunk/jk/native/common/jk_lb_worker.h
URL: 
http://svn.apache.org/viewvc/tomcat/connectors/trunk/jk/native/common/jk_lb_worker.h?rev=647085&r1=647084&r2=647085&view=diff
==============================================================================
--- tomcat/connectors/trunk/jk/native/common/jk_lb_worker.h (original)
+++ tomcat/connectors/trunk/jk/native/common/jk_lb_worker.h Fri Apr 11 02:13:07 
2008
@@ -158,6 +158,8 @@
     int activation;
     /* Current lb factor */
     int lb_factor;
+    /* Current worker index */
+    int i;
     /* Current lb reciprocal factor */
     jk_uint64_t lb_mult;
 };

Modified: tomcat/connectors/trunk/jk/native/common/jk_shm.h
URL: 
http://svn.apache.org/viewvc/tomcat/connectors/trunk/jk/native/common/jk_shm.h?rev=647085&r1=647084&r2=647085&view=diff
==============================================================================
--- tomcat/connectors/trunk/jk/native/common/jk_shm.h (original)
+++ tomcat/connectors/trunk/jk/native/common/jk_shm.h Fri Apr 11 02:13:07 2008
@@ -124,6 +124,8 @@
     char    domain[JK_SHM_STR_SIZ+1];
     /* worker redirect route */
     char    redirect[JK_SHM_STR_SIZ+1];
+    /* Number of currently busy channels */
+    volatile int busy;
     /* worker distance */
     volatile int distance;
     /* current activation state (config) of the worker */

Modified: tomcat/connectors/trunk/jk/xdocs/miscellaneous/changelog.xml
URL: 
http://svn.apache.org/viewvc/tomcat/connectors/trunk/jk/xdocs/miscellaneous/changelog.xml?rev=647085&r1=647084&r2=647085&view=diff
==============================================================================
--- tomcat/connectors/trunk/jk/xdocs/miscellaneous/changelog.xml (original)
+++ tomcat/connectors/trunk/jk/xdocs/miscellaneous/changelog.xml Fri Apr 11 
02:13:07 2008
@@ -48,6 +48,15 @@
         preprocessor defines. (rjung)
       </update>
       <fix>
+        Do not put a loadbalancer node in error state if it still has an
+        open channel. This fixes the bug where a new connection failing due
+        to busyness caused already opened connections to lose stickiness.
+        This brings back the per-node busy counter and a private state array
+        for each request. We can mark the state as error for failover to
+        work while still operating and reporting the node as OK if there are
+        open working connections. (mturk)
+      </fix>
+      <fix>
         <bug>44738</bug>: Fix merging of JkOption ForwardURI* between virtual 
hosts.
         Patch contributed by Toshihiro Sasajima. (rjung)
       </fix>



---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

svn commit: r647085 - in /tomcat/connectors/trunk/jk: native/common/jk_lb_worker.c native/common/jk_lb_worker.h native/common/jk_shm.h xdocs/miscellaneous/changelog.xml

Reply via email to