[ https://issues.apache.org/jira/browse/TS-3794?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14639128#comment-14639128 ]

Sudheer Vinukonda commented on TS-3794:
---------------------------------------

Below is the patch I've tested functionally and under light load. I'm also working on testing it in prod.

{code}
diff --git a/iocore/net/SSLNetVConnection.cc b/iocore/net/SSLNetVConnection.cc
index e06f749..b122f5f 100644
--- a/iocore/net/SSLNetVConnection.cc
+++ b/iocore/net/SSLNetVConnection.cc
@@ -491,7 +491,7 @@ SSLNetVConnection::net_read_io(NetHandler *nh, EThread *lthread)
     if (ret == EVENT_ERROR) {
       this->read.triggered = 0;
       readSignalError(nh, err);
-    } else if (ret == SSL_HANDSHAKE_WANT_READ || ret == SSL_HANDSHAKE_WANT_ACCEPT || ret == EVENT_CONT) {
+    } else if (ret == SSL_HANDSHAKE_WANT_READ || ret == SSL_HANDSHAKE_WANT_ACCEPT) {
       if (SSLConfigParams::ssl_handshake_timeout_in > 0) {
        double handshake_time = ((double)(Thread::get_hrtime() - sslHandshakeBeginTime) / 1000000000);
        Debug("ssl", "ssl handshake for vc %p, took %.3f seconds, configured handshake_timer: %d", this, handshake_time,
diff --git a/proxy/http/HttpSM.cc b/proxy/http/HttpSM.cc
index 3f4dd72..4ac8d4a 100644
--- a/proxy/http/HttpSM.cc
+++ b/proxy/http/HttpSM.cc
@@ -614,7 +614,7 @@ HttpSM::state_read_client_request_header(int event, void *data)
   //   time we've been called.  The timeout had been set to
   //   the accept timeout by the HttpClientSession
   //
-  if (client_request_hdr_bytes == 0) {
+  if ((ua_buffer_reader->read_avail() > 0) && (client_request_hdr_bytes == 0)) {
     milestones[TS_MILESTONE_UA_FIRST_READ] = Thread::get_hrtime();
     ua_session->get_netvc()->set_inactivity_timeout(HRTIME_SECONDS(t_state.txn_conf->transaction_no_activity_timeout_in));
   }
diff --git a/proxy/http2/Http2ClientSession.cc b/proxy/http2/Http2ClientSession.cc
index 4e3b85e..0ea6fca 100644
--- a/proxy/http2/Http2ClientSession.cc
+++ b/proxy/http2/Http2ClientSession.cc
@@ -195,6 +195,13 @@ Http2ClientSession::do_io_close(int alerrno)
   DebugHttp2Ssn0("session closed");
 
   ink_assert(this->mutex->thread_holding == this_ethread());
+  if (client_vc) {
+    // clean up ssl's first byte iobuf
+    SSLNetVConnection *ssl_vc = dynamic_cast<SSLNetVConnection *>(client_vc);
+    if (ssl_vc) {
+      ssl_vc->set_ssl_iobuf(NULL);
+    }
+  }
  HTTP2_DECREMENT_THREAD_DYN_STAT(HTTP2_STAT_CURRENT_CLIENT_SESSION_COUNT, this->mutex->thread_holding);
  send_connection_event(&this->connection_state, HTTP2_SESSION_EVENT_FINI, this);
   do_api_callout(TS_HTTP_SSN_CLOSE_HOOK);
diff --git a/proxy/http2/Http2SessionAccept.cc b/proxy/http2/Http2SessionAccept.cc
index fbb25db..b917555 100644
--- a/proxy/http2/Http2SessionAccept.cc
+++ b/proxy/http2/Http2SessionAccept.cc
@@ -65,7 +65,15 @@ Http2SessionAccept::mainEvent(int event, void *data)
   ink_release_assert((event == NET_EVENT_ACCEPT) ? (data != 0) : (1));
 
   if (event == NET_EVENT_ACCEPT) {
-    this->accept(static_cast<NetVConnection *>(data), NULL, NULL);
+    NetVConnection *netvc = static_cast<NetVConnection *>(data);
+    SSLNetVConnection *ssl_vc = dynamic_cast<SSLNetVConnection *>(netvc);
+    MIOBuffer *iobuf = NULL;
+    IOBufferReader *reader = NULL;
+    if (ssl_vc) {
+      iobuf = ssl_vc->get_ssl_iobuf();
+      reader = ssl_vc->get_ssl_reader();
+    }
+    this->accept(netvc, iobuf, reader);
     return EVENT_CONT;
   }
 
diff --git a/proxy/spdy/SpdyClientSession.cc b/proxy/spdy/SpdyClientSession.cc
index 2f8720e..d60e7a5 100644
--- a/proxy/spdy/SpdyClientSession.cc
+++ b/proxy/spdy/SpdyClientSession.cc
@@ -143,6 +143,11 @@ SpdyClientSession::clear()
   this->mutex = NULL;
 
   if (vc) {
+    // clean up ssl's first byte iobuf
+    SSLNetVConnection *ssl_vc = dynamic_cast<SSLNetVConnection *>(vc);
+    if (ssl_vc) {
+      ssl_vc->set_ssl_iobuf(NULL);
+    }
     TSVConnClose(reinterpret_cast<TSVConn>(vc));
     vc = NULL;
   }
@@ -203,7 +208,7 @@ SpdyClientSession::start()
     {SPDYLAY_SETTINGS_INITIAL_WINDOW_SIZE, SPDYLAY_ID_FLAG_SETTINGS_NONE, spdy_initial_window_size}};
   int r;
 
-  if (TSIOBufferReaderAvail(this->req_reader) > 0) {
+  if (this->write_vio && (TSIOBufferReaderAvail(this->req_reader) > 0)) {
     spdy_process_read(TS_EVENT_VCONN_WRITE_READY, this);
   }
 
diff --git a/proxy/spdy/SpdySessionAccept.cc b/proxy/spdy/SpdySessionAccept.cc
index 98ff9ab..531ff8f 100644
--- a/proxy/spdy/SpdySessionAccept.cc
+++ b/proxy/spdy/SpdySessionAccept.cc
@@ -45,7 +45,15 @@ SpdySessionAccept::mainEvent(int event, void *edata)
 #if TS_HAS_SPDY
     SpdyClientSession *sm = SpdyClientSession::alloc();
     sm->version = this->version;
-    sm->new_connection(netvc, NULL, NULL, false);
+
+    SSLNetVConnection *ssl_vc = dynamic_cast<SSLNetVConnection *>(netvc);
+    MIOBuffer *iobuf = NULL;
+    IOBufferReader *reader = NULL;
+    if (ssl_vc) {
+      iobuf = ssl_vc->get_ssl_iobuf();
+      reader = ssl_vc->get_ssl_reader();
+    }
+    sm->new_connection(netvc, iobuf, reader, false);
 #else
     Error("accepted a SPDY session, but SPDY support is not available");
     netvc->do_io_close();

{code}
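
For clarity, the two accept-path hunks above (Http2SessionAccept and SpdySessionAccept) apply the same hand-off. Here's a minimal sketch of that shared pattern, using the get_ssl_iobuf()/get_ssl_reader() accessors from the patch; the factored-out helper itself is hypothetical, not part of the patch:

{code}
// Hypothetical helper, not part of the patch: both accept paths fetch any
// bytes the SSL layer buffered immediately after the handshake, so the new
// session starts with data whose epoll edge has already been consumed.
static void
get_ssl_handshake_buffer(NetVConnection *netvc, MIOBuffer *&iobuf, IOBufferReader *&reader)
{
  iobuf = NULL;
  reader = NULL;
  // Only SSL connections buffer the first post-handshake bytes.
  SSLNetVConnection *ssl_vc = dynamic_cast<SSLNetVConnection *>(netvc);
  if (ssl_vc) {
    iobuf = ssl_vc->get_ssl_iobuf();
    reader = ssl_vc->get_ssl_reader();
  }
}
{code}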

> Enhance post SSL handshake read race condition for SPDY and H2 scenarios.
> -------------------------------------------------------------------------
>
>                 Key: TS-3794
>                 URL: https://issues.apache.org/jira/browse/TS-3794
>             Project: Traffic Server
>          Issue Type: Bug
>          Components: Core
>    Affects Versions: 5.3.0
>            Reporter: Sudheer Vinukonda
>            Assignee: Sudheer Vinukonda
>             Fix For: 6.1.0
>
>
> This is a follow-up to TS-3714, extending the SSL post-handshake read fix to the SPDY and H2 scenarios.
> An example slow log showing a very high *ua_first_read*:
> {code}
> ERROR: [8624075] Slow Request: client_ip: xx.xx.xx.xxx url: http://xxxxxxxxxxxxxxxxxx.... status: 200 unique id: bytes: 86 fd: 0
> client state: 0 server state: 9
> ua_begin: 0.000 ua_first_read: 42.224 ua_read_header_done: 42.224
> cache_open_read_begin: 42.224 cache_open_read_end: 42.224
> dns_lookup_begin: 42.224 dns_lookup_end: 42.224
> server_connect: 42.224 server_first_read: 42.229 server_read_header_done: 42.229 server_close: 42.229
> ua_begin_write: 42.229 ua_close: 42.229 sm_finish: 42.229
> {code}
> Initially, I suspected that it might be caused by browsers connecting early, before sending any bytes to TS. However, this seems to be happening too frequently, and with an unrealistically high delay between *ua_begin* and *ua_first_read*, for that to be the explanation.
> I suspect it's caused by the code that disables the read temporarily before calling the *TXN_START* hook and re-enables it after the API call-out. The read disable is done via a 0-byte *do_io_read* on the client vc, but the problem is that a valid *mbuf* is still passed. Based on what I am seeing, this can actually enable the *read_vio* all the way down to *ssl_read_from_net*, for instance, if there's a race condition and bytes had already arrived from the client, producing an epoll read event. That call path then disables the read, since *ntodo* (nbytes) is 0, but the epoll event has already been consumed and may be lost until new i/o arrives from the client. I'm running further experiments to confirm this theory. In most cases, the read buffer already has some bytes by the time the http session and http sm are created, which makes it just work. But if there's a slight delay in receiving bytes after the connection is made, the epoll mechanism should deliver the read event; because of the way the temporary read disable is done, though, that event may be lost (this is because ATS uses epoll in edge-triggered mode).
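> To make the edge-triggered behavior concrete, here's a minimal standalone sketch (not ATS code, just the epoll semantics): once the edge fires and the pending byte is left unread, epoll delivers no further event until the client sends more data.
> {code}
> // Hypothetical demo, not ATS code: shows an edge-triggered epoll event
> // being consumed while the reader is "disabled", after which no new
> // event arrives until the peer writes again. Linux-only.
> #include <sys/epoll.h>
> #include <sys/socket.h>
> #include <unistd.h>
> #include <cstdio>
>
> int main()
> {
>   int sv[2];
>   socketpair(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0, sv);
>
>   int ep = epoll_create1(0);
>   epoll_event ev = {};
>   ev.events = EPOLLIN | EPOLLET; // edge-triggered, as ATS uses
>   ev.data.fd = sv[0];
>   epoll_ctl(ep, EPOLL_CTL_ADD, sv[0], &ev);
>
>   write(sv[1], "x", 1); // client byte arrives after the connection
>
>   epoll_event out;
>   printf("first wait:  %d event(s)\n", epoll_wait(ep, &out, 1, 100)); // 1
>
>   // Simulate the 0-byte do_io_read: the event was consumed above, but the
>   // pending byte is deliberately NOT read ("read disabled").
>
>   printf("second wait: %d event(s)\n", epoll_wait(ep, &out, 1, 100)); // 0:
>   // no new edge until the client sends more data, so the read stalls
>   return 0;
> }
> {code}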
> Some history on this issue -
> This issue has been a problem for a long time and affects both http and https requests. When it was first reported, our router operations team eventually closed it, indicating that disabling *accept* threads resolved it ([~yzlai] also reported similar observations and conclusions). It's possible that disabling accept threads slightly narrows the race condition window, but accept threads are very important to overall performance and throughput. I now notice that the issue still exists (regardless of whether accept threads are enabled or disabled) and am testing further to confirm it.


