[ 
https://issues.apache.org/jira/browse/TS-3455?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Luca Bruno updated TS-3455:
---------------------------
    Description: 
I've written a simple test case plugin for demonstrating this problem, not sure 
if it's a problem on my side, but that would also mean that the regex 
invalidate plugin would also abort().

What the plugin does: in LOOKUP_COMPLETE, if the cache status is FRESH then set 
it to STALE.

To reproduce:
1) First cacheable request to ATS gets cached
2) Request again the same url, the plugin triggers and set the cache to STALE. 
Then ATS does abort().

Plugin code:

{noformat}
#include <ts/ts.h>
#include <ts/remap.h>
#include <ts/experimental.h>
#include <stdlib.h>
#include <stdio.h>
#include <getopt.h>
#include <string.h>
#include <string>
#include <iterator>
#include <map>

const char PLUGIN_NAME[] = "maybebug";
static int Handler(TSCont cont, TSEvent event, void *edata);

struct PluginState {
  PluginState()
  {
    cont = TSContCreate(Handler, NULL);
    TSContDataSet(cont, this);
  }

  ~PluginState()
  {
    TSContDestroy(cont);
  }

  TSCont cont;
};

static int Handler(TSCont cont, TSEvent event, void* edata) {
  TSHttpTxn txn = (TSHttpTxn)edata;

  if (event == TS_EVENT_HTTP_CACHE_LOOKUP_COMPLETE) {
    int lookup_status;
    if (TS_SUCCESS == TSHttpTxnCacheLookupStatusGet(txn, &lookup_status)) {
      TSDebug(PLUGIN_NAME, "lookup complete: %d", lookup_status);

      if (lookup_status == TS_CACHE_LOOKUP_HIT_FRESH) {
        TSDebug(PLUGIN_NAME, "set stale");
        TSHttpTxnCacheLookupStatusSet(txn, TS_CACHE_LOOKUP_HIT_STALE);
      }
    }
  }

  TSHttpTxnReenable(txn, TS_EVENT_HTTP_CONTINUE);
  return TS_EVENT_NONE;
}

void TSPluginInit (int argc, const char *argv[]) {
  TSPluginRegistrationInfo info;
  info.plugin_name = strdup("cappello");
  info.vendor_name = strdup("foo");
  info.support_email = strdup("[email protected]");

  if (TSPluginRegister(TS_SDK_VERSION_3_0 , &info) != TS_SUCCESS) {
    TSError("Plugin registration failed");
  }

  PluginState* state = new PluginState();

  TSHttpHookAdd(TS_HTTP_CACHE_LOOKUP_COMPLETE_HOOK, state->cont);
}
{noformat}

Output:

{noformat}
[Mar 19 18:40:36.254] Server {0x7f6df0b4f740} DIAG: (maybebug) lookup complete: 0
[Mar 19 18:40:40.854] Server {0x7f6decfee700} DIAG: (maybebug) lookup complete: 
2
[Mar 19 18:40:40.854] Server {0x7f6decfee700} DIAG: (maybebug) set stale
FATAL: HttpTransact.cc:433: failed assert `s->pending_work == NULL`
traffic_server - STACK TRACE: 
/usr/local/lib/libtsutil.so.5(ink_fatal+0xa3)[0x7f6df072186d]
/usr/local/lib/libtsutil.so.5(_Z12ink_get_randv+0x0)[0x7f6df071f3a0]
traffic_server[0x60d0aa]
traffic_server(_ZN12HttpTransact22HandleCacheOpenReadHitEPNS_5StateE+0xf82)[0x619206]
...
{noformat}

What happens in gdb is that HandleCacheOpenReadHit is called twice in the same 
request. The first time s->pending_work is NULL, the second time it's not NULL.

The patch below fixes the problem:

{noformat}
diff --git a/proxy/http/HttpTransact.cc b/proxy/http/HttpTransact.cc
index 0078ef1..852f285 100644
--- a/proxy/http/HttpTransact.cc
+++ b/proxy/http/HttpTransact.cc
@@ -2641,11 +2641,6 @@ HttpTransact::HandleCacheOpenReadHit(State* s)
 //        ink_release_assert(s->current.request_to == PARENT_PROXY ||
 //                    s->http_config_param->no_dns_forward_to_parent != 0);
 
-        // Set ourselves up to handle pending revalidate issues
-        //  after the PP DNS lookup
-        ink_assert(s->pending_work == NULL);
-        s->pending_work = issue_revalidate;
-
         // We must be going a PARENT PROXY since so did
         //  origin server DNS lookup right after state Start
         //
@@ -2654,6 +2649,11 @@ HttpTransact::HandleCacheOpenReadHit(State* s)
         //  missing ip but we won't take down the system
         //
         if (s->current.request_to == PARENT_PROXY) {
+          // Set ourselves up to handle pending revalidate issues
+          //  after the PP DNS lookup
+          ink_assert(s->pending_work == NULL);
+          s->pending_work = issue_revalidate;
+
           TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, PPDNSLookup);
         } else if (s->current.request_to == ORIGIN_SERVER) {
           TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, OSDNSLookup);
{noformat}

  was:
I've written a simple test case plugin for demonstrating this problem, not sure 
if it's a problem on my side.

What the plugin does:

1) In READ_RESPONSE_HDR rename Cache-Control to X-Plugin-Cache-Control
2) In LOOKUP_COMPLETE, if it's a FRESH hit set the status to be STALE instead

So after the cache has been enabled, do a first request: this will be cached 
because there's no Cache-Control header.
Do a second request, now ATS will abort().

The issue is related to both the fact that Cache-Control is renamed and the 
cache set to be stale. Either of the two alone don't trigger the issue.

Plugin code:

{noformat}
#include <ts/ts.h>
#include <ts/remap.h>
#include <ts/experimental.h>
#include <stdlib.h>
#include <stdio.h>
#include <getopt.h>
#include <string.h>
#include <string>
#include <iterator>
#include <map>


// Constants and some declarations
const char PLUGIN_NAME[] = "maybebug";
static int Handler(TSCont cont, TSEvent event, void *edata);

struct PluginState {
  PluginState()
  {
    cont = TSContCreate(Handler, NULL);
    TSContDataSet(cont, this);
  }

  ~PluginState()
  {
    TSContDestroy(cont);
  }

  TSCont cont;
};

static int Handler(TSCont cont, TSEvent event, void* edata) {
  TSHttpTxn txn = (TSHttpTxn)edata;
  TSMBuffer mbuf;
  TSMLoc hdrp;

  if (event == TS_EVENT_HTTP_READ_RESPONSE_HDR) {
    TSDebug(PLUGIN_NAME, "read response");
    if (TS_SUCCESS == TSHttpTxnServerRespGet(txn, &mbuf, &hdrp)) {
      TSMLoc fieldp = TSMimeHdrFieldFind(mbuf, hdrp, "Cache-Control", 
sizeof("Cache-Control")-1);
      if (fieldp != TS_NULL_MLOC) {
        TSDebug(PLUGIN_NAME, "rename cache-control");
        TSMimeHdrFieldNameSet(mbuf, hdrp, fieldp, "X-Plugin-Cache-Control", 
sizeof("X-Plugin-Cache-Control")-1);
        TSHandleMLocRelease(mbuf, hdrp, fieldp);
      }
      TSHandleMLocRelease(mbuf, TS_NULL_MLOC, hdrp);
    }
  } else if (event == TS_EVENT_HTTP_CACHE_LOOKUP_COMPLETE) {
    int lookup_status;
    if (TS_SUCCESS != TSHttpTxnCacheLookupStatusGet(txn, &lookup_status)) {
      goto beach;
    }
    TSDebug(PLUGIN_NAME, "lookup complete: %d", lookup_status);

    if (lookup_status == TS_CACHE_LOOKUP_HIT_FRESH && 
TSHttpTxnCachedRespGet(txn, &mbuf, &hdrp) == TS_SUCCESS) {
      TSMLoc fieldp = TSMimeHdrFieldFind(mbuf, hdrp, "X-Plugin-Cache-Control", 
sizeof("X-Plugin-Cache-Control")-1);
      if (fieldp) {
        TSDebug(PLUGIN_NAME, "set stale");
        TSHandleMLocRelease(mbuf, hdrp, fieldp);
        TSHttpTxnCacheLookupStatusSet(txn, TS_CACHE_LOOKUP_HIT_STALE);
      }

      TSHandleMLocRelease(mbuf, TS_NULL_MLOC, hdrp);
    }
  }

beach:
  TSHttpTxnReenable(txn, TS_EVENT_HTTP_CONTINUE);
  return TS_EVENT_NONE;
}


TSReturnCode TSRemapInit(TSRemapInterface*  api, char* errbuf, int bufsz) {
  return TS_SUCCESS;
}


TSReturnCode TSRemapNewInstance(int argc, char* argv[], void** instance, char* 
errbuf, int errbuf_size) {
  TSDebug(PLUGIN_NAME, "new instance");

  PluginState* es = new PluginState();

  *instance = es;

  return TS_SUCCESS;
}


void TSRemapDeleteInstance(void* instance) {
  delete static_cast<PluginState*>(instance);
}


TSRemapStatus TSRemapDoRemap(void* instance, TSHttpTxn txn, TSRemapRequestInfo* 
rri) {
  PluginState* es = static_cast<PluginState*>(instance);

  TSHttpTxnHookAdd(txn, TS_HTTP_READ_RESPONSE_HDR_HOOK, es->cont);
  TSHttpTxnHookAdd(txn, TS_HTTP_CACHE_LOOKUP_COMPLETE_HOOK, es->cont);
  return TSREMAP_NO_REMAP;
}
{noformat}

Stacktrace from gdb:

{noformat}
#1  0x00007ffff54e53e0 in abort () from /lib/x86_64-linux-gnu/libc.so.6
#2  0x00007ffff7bb96fc in ink_die_die_die (retval=1) at ink_error.cc:43
#3  0x00007ffff7bb97c2 in ink_fatal_va(int, const char *, typedef __va_list_tag 
__va_list_tag *) (return_code=1, fmt=0x7ffff7bca490 "%s:%d: failed assert 
`%s`", ap=0x7ffff42813d8)
    at ink_error.cc:67
#4  0x00007ffff7bb986d in ink_fatal (return_code=1, 
message_format=0x7ffff7bca490 "%s:%d: failed assert `%s`") at ink_error.cc:75
#5  0x00007ffff7bb73a0 in _ink_assert (expression=0x7c6883 "s->pending_work == 
NULL", file=0x7c66cc "HttpTransact.cc", line=433) at ink_assert.cc:37
#6  0x000000000060d0aa in how_to_open_connection (s=0x7fffdacac9d0) at 
HttpTransact.cc:433
#7  0x0000000000619206 in HttpTransact::HandleCacheOpenReadHit 
(s=0x7fffdacac9d0) at HttpTransact.cc:2684
#8  0x00000000005ffd21 in HttpSM::call_transact_and_set_next_state 
(this=0x7fffdacac960, f=0) at HttpSM.cc:6954
#9  0x00000000005ebc20 in HttpSM::handle_api_return (this=0x7fffdacac960) at 
HttpSM.cc:1532
#10 0x00000000005eb9d5 in HttpSM::state_api_callout (this=0x7fffdacac960, 
event=0, data=0x0) at HttpSM.cc:1464
#11 0x00000000005f8bcb in HttpSM::do_api_callout_internal (this=0x7fffdacac960) 
at HttpSM.cc:4912
#12 0x0000000000606999 in HttpSM::do_api_callout (this=0x7fffdacac960) at 
HttpSM.cc:450
#13 0x00000000005ffed0 in HttpSM::set_next_state (this=0x7fffdacac960) at 
HttpSM.cc:6996
#14 0x00000000005ffe6a in HttpSM::call_transact_and_set_next_state 
(this=0x7fffdacac960, f=0) at HttpSM.cc:6962
#15 0x00000000005f5378 in HttpSM::do_hostdb_lookup (this=0x7fffdacac960) at 
HttpSM.cc:4043
#16 0x0000000000600874 in HttpSM::set_next_state (this=0x7fffdacac960) at 
HttpSM.cc:7092
#17 0x00000000005ffe6a in HttpSM::call_transact_and_set_next_state 
(this=0x7fffdacac960, f=0) at HttpSM.cc:6962
#18 0x00000000005ebc20 in HttpSM::handle_api_return (this=0x7fffdacac960) at 
HttpSM.cc:1532
#19 0x00000000005eb9d5 in HttpSM::state_api_callout (this=0x7fffdacac960, 
event=60000, data=0x0) at HttpSM.cc:1464
#20 0x00000000005eb0dc in HttpSM::state_api_callback (this=0x7fffdacac960, 
event=60000, data=0x0) at HttpSM.cc:1282
#21 0x0000000000531919 in TSHttpTxnReenable (txnp=0x7fffdacac960, 
event=TS_EVENT_HTTP_CONTINUE) at InkAPI.cc:5577
#22 0x00007fffef64a0cc in Handler (cont=0x1315040, 
event=TS_EVENT_HTTP_CACHE_LOOKUP_COMPLETE, edata=0x7fffdacac960) at bug.cxx:68
#23 0x0000000000525ebf in INKContInternal::handle_event (this=0x1315040, 
event=60015, edata=0x7fffdacac960) at InkAPI.cc:1008
#24 0x00000000005112d4 in Continuation::handleEvent (this=0x1315040, 
event=60015, data=0x7fffdacac960) at ../iocore/eventsystem/I_Continuation.h:146
#25 0x00000000005267de in APIHook::invoke (this=0x1399eb0, event=60015, 
edata=0x7fffdacac960) at InkAPI.cc:1227
#26 0x00000000005eb764 in HttpSM::state_api_callout (this=0x7fffdacac960, 
event=0, data=0x0) at HttpSM.cc:1389
#27 0x00000000005f8bcb in HttpSM::do_api_callout_internal (this=0x7fffdacac960) 
at HttpSM.cc:4912
#28 0x0000000000606999 in HttpSM::do_api_callout (this=0x7fffdacac960) at 
HttpSM.cc:450
#29 0x00000000005ffed0 in HttpSM::set_next_state (this=0x7fffdacac960) at 
HttpSM.cc:6996
#30 0x00000000005ffe6a in HttpSM::call_transact_and_set_next_state 
(this=0x7fffdacac960, f=0) at HttpSM.cc:6962
#31 0x00000000005ebc20 in HttpSM::handle_api_return (this=0x7fffdacac960) at 
HttpSM.cc:1532
#32 0x00000000005eb9d5 in HttpSM::state_api_callout (this=0x7fffdacac960, 
event=0, data=0x0) at HttpSM.cc:1464
#33 0x00000000005f8bcb in HttpSM::do_api_callout_internal (this=0x7fffdacac960) 
at HttpSM.cc:4912
#34 0x0000000000606999 in HttpSM::do_api_callout (this=0x7fffdacac960) at 
HttpSM.cc:450
#35 0x00000000005ffed0 in HttpSM::set_next_state (this=0x7fffdacac960) at 
HttpSM.cc:6996
#36 0x00000000005ffe6a in HttpSM::call_transact_and_set_next_state 
(this=0x7fffdacac960, f=0x6162d2 
<HttpTransact::HandleCacheOpenRead(HttpTransact::State*)>) at HttpSM.cc:6962
#37 0x00000000005ef48b in HttpSM::state_cache_open_read (this=0x7fffdacac960, 
event=1102, data=0x15ad510) at HttpSM.cc:2515
#38 0x00000000005ef9b8 in HttpSM::main_handler (this=0x7fffdacac960, 
event=1102, data=0x15ad510) at HttpSM.cc:2574
#39 0x00000000005112d4 in Continuation::handleEvent (this=0x7fffdacac960, 
event=1102, data=0x15ad510) at ../iocore/eventsystem/I_Continuation.h:146
#40 0x00000000005da01c in HttpCacheSM::state_cache_open_read 
(this=0x7fffdacae398, event=1102, data=0x15ad510) at HttpCacheSM.cc:118
#41 0x00000000005112d4 in Continuation::handleEvent (this=0x7fffdacae398, 
event=1102, data=0x15ad510) at ../iocore/eventsystem/I_Continuation.h:146
#42 0x000000000073686f in CacheVC::callcont (this=0x15ad510, event=1102) at 
P_CacheInternal.h:661
#43 0x00000000007349a9 in CacheVC::openReadStartEarliest (this=0x15ad510) at 
CacheRead.cc:955
#44 0x00000000005112d4 in Continuation::handleEvent (this=0x15ad510, 
event=3900, data=0x0) at ../iocore/eventsystem/I_Continuation.h:146
#45 0x0000000000713740 in CacheVC::handleReadDone (this=0x15ad510, event=3900, 
e=0x15ad698) at Cache.cc:2668
#46 0x00000000005112d4 in Continuation::handleEvent (this=0x15ad510, 
event=3900, data=0x15ad698) at ../iocore/eventsystem/I_Continuation.h:146
#47 0x0000000000718d11 in AIOCallbackInternal::io_complete (this=0x15ad698, 
event=1, data=0x1172c70) at ../../iocore/aio/P_AIO.h:124
#48 0x00000000005112d4 in Continuation::handleEvent (this=0x15ad698, event=1, 
data=0x1172c70) at ../iocore/eventsystem/I_Continuation.h:146
#49 0x000000000078ed3d in EThread::process_event (this=0x7ffff4688010, 
e=0x1172c70, calling_code=1) at UnixEThread.cc:144
#50 0x000000000078ef91 in EThread::execute (this=0x7ffff4688010) at 
UnixEThread.cc:195
#51 0x000000000078e272 in spawn_thread_internal (a=0x1130c40) at Thread.cc:88
#52 0x00007ffff6545b50 in start_thread () from 
/lib/x86_64-linux-gnu/libpthread.so.0
#53 0x00007ffff558c7bd in clone () from /lib/x86_64-linux-gnu/libc.so.6
#54 0x0000000000000000 in ?? ()
{noformat}

Debug output from the plugin

{noformat}
[Mar 19 11:27:29.173] Server {0x2afb87afae00} DIAG: (maybebug) new instance
[Mar 19 11:29:39.786] Server {0x2afb88608700} DIAG: (maybebug) lookup complete: 
1
[Mar 19 11:29:39.790] Server {0x2afb88608700} DIAG: (maybebug) read response
[Mar 19 11:29:40.240] Server {0x2afb88608700} DIAG: (maybebug) rename 
cache-control
[Mar 19 11:29:40.420] Server {0x2afb88507700} DIAG: (maybebug) lookup complete: 
2
[Mar 19 11:29:40.420] Server {0x2afb88507700} DIAG: (maybebug) set stale
{noformat}


> Renaming Cache-Control in read-response and marking the cache STALE in 
> lookup-complete causes abort()
> -----------------------------------------------------------------------------------------------------
>
>                 Key: TS-3455
>                 URL: https://issues.apache.org/jira/browse/TS-3455
>             Project: Traffic Server
>          Issue Type: Bug
>            Reporter: Luca Bruno
>
> I've written a simple test case plugin for demonstrating this problem, not 
> sure if it's a problem on my side, but that would also mean that the regex 
> invalidate plugin would also abort().
> What the plugin does: in LOOKUP_COMPLETE, if the cache status is FRESH then 
> set it to STALE.
> To reproduce:
> 1) First cacheable request to ATS gets cached
> 2) Request again the same url, the plugin triggers and set the cache to 
> STALE. Then ATS does abort().
> Plugin code:
> {noformat}
> #include <ts/ts.h>
> #include <ts/remap.h>
> #include <ts/experimental.h>
> #include <stdlib.h>
> #include <stdio.h>
> #include <getopt.h>
> #include <string.h>
> #include <string>
> #include <iterator>
> #include <map>
> const char PLUGIN_NAME[] = "maybebug";
> static int Handler(TSCont cont, TSEvent event, void *edata);
> struct PluginState {
>   PluginState()
>   {
>     cont = TSContCreate(Handler, NULL);
>     TSContDataSet(cont, this);
>   }
>   ~PluginState()
>   {
>     TSContDestroy(cont);
>   }
>   TSCont cont;
> };
> static int Handler(TSCont cont, TSEvent event, void* edata) {
>   TSHttpTxn txn = (TSHttpTxn)edata;
>   if (event == TS_EVENT_HTTP_CACHE_LOOKUP_COMPLETE) {
>     int lookup_status;
>     if (TS_SUCCESS == TSHttpTxnCacheLookupStatusGet(txn, &lookup_status)) {
>       TSDebug(PLUGIN_NAME, "lookup complete: %d", lookup_status);
>       if (lookup_status == TS_CACHE_LOOKUP_HIT_FRESH) {
>         TSDebug(PLUGIN_NAME, "set stale");
>         TSHttpTxnCacheLookupStatusSet(txn, TS_CACHE_LOOKUP_HIT_STALE);
>       }
>     }
>   }
>   TSHttpTxnReenable(txn, TS_EVENT_HTTP_CONTINUE);
>   return TS_EVENT_NONE;
> }
> void TSPluginInit (int argc, const char *argv[]) {
>   TSPluginRegistrationInfo info;
>   info.plugin_name = strdup("cappello");
>   info.vendor_name = strdup("foo");
>   info.support_email = strdup("[email protected]");
>   if (TSPluginRegister(TS_SDK_VERSION_3_0 , &info) != TS_SUCCESS) {
>     TSError("Plugin registration failed");
>   }
>   PluginState* state = new PluginState();
>   TSHttpHookAdd(TS_HTTP_CACHE_LOOKUP_COMPLETE_HOOK, state->cont);
> }
> {noformat}
> Output:
> {noformat}
> [Mar 19 18:40:36.254] Server {0x7f6df0b4f740} DIAG: (maybebug) lookup 
> complete: 0
> [Mar 19 18:40:40.854] Server {0x7f6decfee700} DIAG: (maybebug) lookup 
> complete: 2
> [Mar 19 18:40:40.854] Server {0x7f6decfee700} DIAG: (maybebug) set stale
> FATAL: HttpTransact.cc:433: failed assert `s->pending_work == NULL`
> traffic_server - STACK TRACE: 
> /usr/local/lib/libtsutil.so.5(ink_fatal+0xa3)[0x7f6df072186d]
> /usr/local/lib/libtsutil.so.5(_Z12ink_get_randv+0x0)[0x7f6df071f3a0]
> traffic_server[0x60d0aa]
> traffic_server(_ZN12HttpTransact22HandleCacheOpenReadHitEPNS_5StateE+0xf82)[0x619206]
> ...
> {noformat}
> What happens in gdb is that HandleCacheOpenReadHit is called twice in the 
> same request. The first time s->pending_work is NULL, the second time it's 
> not NULL.
> The patch below fixes the problem:
> {noformat}
> diff --git a/proxy/http/HttpTransact.cc b/proxy/http/HttpTransact.cc
> index 0078ef1..852f285 100644
> --- a/proxy/http/HttpTransact.cc
> +++ b/proxy/http/HttpTransact.cc
> @@ -2641,11 +2641,6 @@ HttpTransact::HandleCacheOpenReadHit(State* s)
>  //        ink_release_assert(s->current.request_to == PARENT_PROXY ||
>  //                    s->http_config_param->no_dns_forward_to_parent != 0);
>  
> -        // Set ourselves up to handle pending revalidate issues
> -        //  after the PP DNS lookup
> -        ink_assert(s->pending_work == NULL);
> -        s->pending_work = issue_revalidate;
> -
>          // We must be going a PARENT PROXY since so did
>          //  origin server DNS lookup right after state Start
>          //
> @@ -2654,6 +2649,11 @@ HttpTransact::HandleCacheOpenReadHit(State* s)
>          //  missing ip but we won't take down the system
>          //
>          if (s->current.request_to == PARENT_PROXY) {
> +          // Set ourselves up to handle pending revalidate issues
> +          //  after the PP DNS lookup
> +          ink_assert(s->pending_work == NULL);
> +          s->pending_work = issue_revalidate;
> +
>            TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, PPDNSLookup);
>          } else if (s->current.request_to == ORIGIN_SERVER) {
>            TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, OSDNSLookup);
> {noformat}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to