Hello,

    The attached patch against trunk restores Squid's ability to retry
connections that fail due to a pconn race condition. Currently, we serve
an ERR_ZERO_SIZE_OBJECT "Bad Gateway" error in those cases.

This patch is meant to reflect previous discussions on this topic but is
simpler/tighter than my previous attempts that were posted as PREVIEW
patches.


Thank you,

Alex.

Retry requests that failed due to a persistent connection race
instead of replying with ERR_ZERO_SIZE_OBJECT "Bad Gateway".

The ERR_ZERO_SIZE_OBJECT errors were visible to the client when the
destination had only one address, because serverDestinations.shift()
emptied the list of destinations and startConnectionOrFail() then failed.

When FwdState starts to use a pinned connection, the connection is treated as
an idle persistent connection as far as race detection is concerned.
Currently, pinned connections cannot be reopened, repinned, and retried after
a pconn race. This will change when server-side bumped connections become
pinned.

It feels wrong that a failed serverConn may remain set while we are opening a
new connection to replace it. I added an XXX to mark that preexisting problem.

=== modified file 'src/forward.cc'
--- src/forward.cc	2012-02-10 03:30:02 +0000
+++ src/forward.cc	2012-02-17 00:54:18 +0000
@@ -80,40 +80,41 @@
 FwdState::abort(void* d)
 {
     FwdState* fwd = (FwdState*)d;
     Pointer tmp = fwd; // Grab a temporary pointer to keep the object alive during our scope.
 
     if (Comm::IsConnOpen(fwd->serverConnection())) {
         comm_remove_close_handler(fwd->serverConnection()->fd, fwdServerClosedWrapper, fwd);
     }
     fwd->serverDestinations.clean();
     fwd->self = NULL;
 }
 
 /**** PUBLIC INTERFACE ********************************************************/
 
 FwdState::FwdState(const Comm::ConnectionPointer &client, StoreEntry * e, HttpRequest * r)
 {
     debugs(17, 2, HERE << "Forwarding client request " << client << ", url=" << e->url() );
     entry = e;
     clientConn = client;
     request = HTTPMSGLOCK(r);
+    pconnRace = raceImpossible;
     start_t = squid_curtime;
     serverDestinations.reserve(Config.forward_max_tries);
     e->lock();
     EBIT_SET(e->flags, ENTRY_FWD_HDR_WAIT);
 }
 
 // Called once, right after object creation, when it is safe to set self
 void FwdState::start(Pointer aSelf)
 {
     // Protect ourselves from being destroyed when the only Server pointing
     // to us is gone (while we expect to talk to more Servers later).
     // Once we set self, we are responsible for clearing it when we do not
     // expect to talk to any servers.
     self = aSelf; // refcounted
 
     // We hope that either the store entry aborts or peer is selected.
     // Otherwise we are going to leak our object.
 
     entry->registerAbort(FwdState::abort, this);
 
@@ -308,40 +309,43 @@
         if (!err) {
             ErrorState *anErr = new ErrorState(ERR_CANNOT_FORWARD, HTTP_INTERNAL_SERVER_ERROR, request);
             anErr->xerrno = errno;
             fail(anErr);
         } // else use actual error from last connection attempt
         self = NULL;       // refcounted
     }
 }
 
 void
 FwdState::fail(ErrorState * errorState)
 {
     debugs(17, 3, HERE << err_type_str[errorState->type] << " \"" << httpStatusString(errorState->httpStatus) << "\"\n\t" << entry->url()  );
 
     delete err;
     err = errorState;
 
     if (!errorState->request)
         errorState->request = HTTPMSGLOCK(request);
 
+    if (pconnRace == racePossible && errorState->type == ERR_ZERO_SIZE_OBJECT)
+        pconnRace = raceHappened;
+
 #if USE_SSL
     if (errorState->type == ERR_SECURE_CONNECT_FAIL && errorState->detail)
         request->detailError(errorState->type, errorState->detail->errorNo());
     else
 #endif
         request->detailError(errorState->type, errorState->xerrno);
 }
 
 /**
  * Frees fwdState without closing FD or generating an abort
  */
 void
 FwdState::unregister(Comm::ConnectionPointer &conn)
 {
     debugs(17, 3, HERE << entry->url() );
     assert(serverConnection() == conn);
     assert(Comm::IsConnOpen(conn));
     comm_remove_close_handler(conn->fd, fwdServerClosedWrapper, this);
     serverConn = NULL;
 }
@@ -518,40 +522,44 @@
 
     default:
         return false;
     }
 
     return true;
 }
 
 void
 FwdState::serverClosed(int fd)
 {
     debugs(17, 2, HERE << "FD " << fd << " " << entry->url());
     retryOrBail();
 }
 
 void
 FwdState::retryOrBail()
 {
     if (checkRetry()) {
         debugs(17, 3, HERE << "re-forwarding (" << n_tries << " tries, " << (squid_curtime - start_t) << " secs)");
+        // we should retry the same destination if it failed due to pconn race
+        if (pconnRace == raceHappened)
+            debugs(17, 4, HERE << "retrying the same destination");
+        else
         serverDestinations.shift(); // last one failed. try another.
         startConnectionOrFail();
         return;
     }
 
     // TODO: should we call completed() here and move doneWithRetries there?
     doneWithRetries();
 
     if (self != NULL && !err && shutting_down) {
         ErrorState *anErr = new ErrorState(ERR_SHUTTING_DOWN, HTTP_SERVICE_UNAVAILABLE, request);
         errorAppendEntry(entry, anErr);
     }
 
     self = NULL;	// refcounted
 }
 
 // If the Server quits before nibbling at the request body, the body sender
 // will not know (so that we can retry). Call this if we will not retry. We
 // will notify the sender so that it does not get stuck waiting for space.
 void
@@ -810,97 +818,113 @@
     int ftimeout = Config.Timeout.forward - (squid_curtime - start_t);
     if (ftimeout < 0)
         ftimeout = 5;
 
     if (ftimeout < ctimeout)
         ctimeout = ftimeout;
 
     if (serverDestinations[0]->getPeer() && request->flags.sslBumped == true) {
         debugs(50, 4, "fwdConnectStart: Ssl bumped connections through parrent proxy are not allowed");
         ErrorState *anErr = new ErrorState(ERR_CANNOT_FORWARD, HTTP_SERVICE_UNAVAILABLE, request);
         fail(anErr);
         self = NULL; // refcounted
         return;
     }
 
     request->flags.pinned = 0; // XXX: what if the ConnStateData set this to flag existing credentials?
     // XXX: answer: the peer selection *should* catch it and give us only the pinned peer. so we reverse the =0 step below.
     // XXX: also, logs will now lie if pinning is broken and leads to an error message.
     if (serverDestinations[0]->peerType == PINNED) {
         ConnStateData *pinned_connection = request->pinnedConnection();
-        assert(pinned_connection);
+        // pinned_connection may become nil after a pconn race
+        if (pinned_connection)
         serverConn = pinned_connection->validatePinnedConnection(request, serverDestinations[0]->getPeer());
+        else
+            serverConn = NULL;
         if (Comm::IsConnOpen(serverConn)) {
 #if 0
             if (!serverConn->getPeer())
                 serverConn->peerType = HIER_DIRECT;
 #endif
             n_tries++;
             request->flags.pinned = 1;
             if (pinned_connection->pinnedAuth())
                 request->flags.auth = 1;
+            // the server may close the pinned connection before this request
+            pconnRace = racePossible;
             dispatch();
             return;
         }
         /* Failure. Fall back on next path */
         debugs(17,2,HERE << " Pinned connection " << pinned_connection << " not valid.");
         serverDestinations.shift();
         startConnectionOrFail();
         return;
     }
 
     // Use pconn to avoid opening a new connection.
     const char *host;
     if (serverDestinations[0]->getPeer()) {
         host = serverDestinations[0]->getPeer()->host;
     } else {
         host = request->GetHost();
     }
-    Comm::ConnectionPointer temp = fwdPconnPool->pop(serverDestinations[0], host, checkRetriable());
+
+    Comm::ConnectionPointer temp;
+    // Avoid pconns after races so that the same client does not suffer twice.
+    // This does not increase the total number of connections because we just
+    // closed the connection that failed the race. And re-pinning assumes this.
+    if (pconnRace != raceHappened)
+        temp = fwdPconnPool->pop(serverDestinations[0], host, checkRetriable());
+    
+    const bool openedPconn = Comm::IsConnOpen(temp);
+    pconnRace = openedPconn ? racePossible : raceImpossible;
 
     // if we found an open persistent connection to use. use it.
-    if (Comm::IsConnOpen(temp)) {
+    if (openedPconn) {
         serverConn = temp;
         debugs(17, 3, HERE << "reusing pconn " << serverConnection());
         n_tries++;
 
         if (!serverConnection()->getPeer())
             origin_tries++;
 
         comm_add_close_handler(serverConnection()->fd, fwdServerClosedWrapper, this);
 
         /* Update server side TOS and Netfilter mark on the connection. */
         if (Ip::Qos::TheConfig.isAclTosActive()) {
             temp->tos = GetTosToServer(request);
             Ip::Qos::setSockTos(temp, temp->tos);
         }
 #if SO_MARK
         if (Ip::Qos::TheConfig.isAclNfmarkActive()) {
             temp->nfmark = GetNfmarkToServer(request);
             Ip::Qos::setSockNfmark(temp, temp->nfmark);
         }
 #endif
 
         dispatch();
         return;
     }
 
+    // XXX: stale serverConn may remain set even though we are opening new conn
+
 #if URL_CHECKSUM_DEBUG
     entry->mem_obj->checkUrlChecksum();
 #endif
 
     /* Get the server side TOS and Netfilter mark to be set on the connection. */
     if (Ip::Qos::TheConfig.isAclTosActive()) {
         serverDestinations[0]->tos = GetTosToServer(request);
     }
 #if SO_MARK && USE_LIBCAP
     serverDestinations[0]->nfmark = GetNfmarkToServer(request);
     debugs(17, 3, "fwdConnectStart: got outgoing addr " << serverDestinations[0]->local << ", tos " << int(serverDestinations[0]->tos)
            << ", netfilter mark " << serverDestinations[0]->nfmark);
 #else
     serverDestinations[0]->nfmark = 0;
     debugs(17, 3, "fwdConnectStart: got outgoing addr " << serverDestinations[0]->local << ", tos " << int(serverDestinations[0]->tos));
 #endif
 
     calls.connector = commCbCall(17,3, "fwdConnectDoneWrapper", CommConnectCbPtrFun(fwdConnectDoneWrapper, this));
     Comm::ConnOpener *cs = new Comm::ConnOpener(serverDestinations[0], calls.connector, ctimeout);
     cs->setHost(host);

=== modified file 'src/forward.h'
--- src/forward.h	2011-08-20 15:57:06 +0000
+++ src/forward.h	2012-02-17 00:42:24 +0000
@@ -88,25 +88,29 @@
     time_t start_t;
     int n_tries;
     int origin_tries;
 
     // AsyncCalls which we set and may need cancelling.
     struct {
         AsyncCall::Pointer connector;  ///< a call linking us to the ConnOpener producing serverConn.
     } calls;
 
     struct {
         unsigned int connected_okay:1; ///< TCP link ever opened properly. This affects retry of POST,PUT,CONNECT,etc
         unsigned int dont_retry:1;
         unsigned int forward_completed:1;
     } flags;
 
     /** connections to open, in order, until successful */
     Comm::ConnectionList serverDestinations;
 
     Comm::ConnectionPointer serverConn; ///< a successfully opened connection to a server.
 
+    /// possible pconn race states
+    typedef enum { raceImpossible, racePossible, raceHappened } PconnRace;
+    PconnRace pconnRace; ///< current pconn race state
+
     // NP: keep this last. It plays with private/public
     CBDATA_CLASS2(FwdState);
 };
 
 #endif /* SQUID_FORWARD_H */

Reply via email to