hans9500 opened a new issue, #12583:
URL: https://github.com/apache/trafficserver/issues/12583

   Hi ATS team,
   
   We encountered an issue where a core dump occurs due to repeated calls to 
CacheVC::scanObject after a failed cache deletion.
   I’d like to share our findings and ask for your insights on this behavior.
   
   ## Issue Description
   We implemented a **pattern-based purge** feature to delete cached objects.
   For example, executing the following command triggers a purge:
   ```
   curl -X PURGE "http://localhost:8080/image/*"; -o /dev/null
   ```
   
   When the scheduled time arrives, the pattern purge handler calls the related 
function, and within it, we invoke cacheProcessor.scan() to perform the cache 
deletion.
   
   After calling CacheVC::scanObject, the function attempts to delete the 
corresponding cache entry.
   However, the cache is **not actually deleted**, and CacheVC::scanObject is 
called again, repeatedly attempting to delete the **same file** in an infinite 
loop.
   
   In conclusion, this happens because the **stripe value becomes inconsistent 
(different)** between operations.
   
   ### Our custom function calls cacheProcessor.scan(). Details are described 
below.
   ```
   cacheProcessor.scan()
        => caches[CACHE_FRAG_TYPE_HTTP]->scan()
                => CacheVC::scanStripe()
                        => CacheVC::scanObject()
   ```
   
   ### For debugging, we inserted debug messages into the following functions:
   ```
   int
   dir_probe(const CacheKey *key, Stripe *stripe, Dir *result, Dir 
**last_collision)
   {
     ink_assert(stripe->mutex->thread_holding == this_ethread());
     int  s   = key->slice32(0) % stripe->segments;
     int  b   = key->slice32(1) % stripe->buckets;
     Dir *seg = stripe->dir_segment(s);
     Dir *e = nullptr, *p = nullptr, *collision = *last_collision;
     CHECK_DIR(d);
   #ifdef LOOP_CHECK_MODE
     if (dir_bucket_loop_fix(dir_bucket(b, seg), s, vol))
       return 0;
   #endif
   Lagain:
     e = dir_bucket(b, seg);
   
     // Added for debugging (+)
     DDbg(dbg_ctl_cache_stats, "==> stripe = %p, s = %d, b = %d, seg = %p, e = 
%p", stripe, s, b, seg, e);
     if (stripe) {
       DDbg(dbg_ctl_cache_stats, "stripe->fd = %d", stripe->fd);
     }
     if (e) {
       DDbg(dbg_ctl_cache_stats, "boffset %" PRId64 "", dir_offset(e));
     }
     // Added for debugging (-)
     ...
   ```
   
   ## Scenario
   1. When the scheduled time for the pattern purge arrives, 
`cacheProcessor.scan()` is called.
   Then, the `CacheVC::scanObject()` function is invoked.
   ```
   int
   CacheVC::scanObject(int /* event ATS_UNUSED */, Event * /* e ATS_UNUSED */)
   {
     Dbg(dbg_ctl_cache_scan_truss, "inside %p:scanObject", this);
   
     Doc  *doc    = nullptr;
     void *result = nullptr;
     int   hlen   = 0;
     char  hname[500];
     bool  hostinfo_copied         = false;
     off_t next_object_len         = 0;
     bool  might_need_overlap_read = false;
     ...
     while (static_cast<off_t>(reinterpret_cast<char *>(doc) - buf->data()) + 
next_object_len <
            static_cast<off_t>(io.aiocb.aio_nbytes)) {
       might_need_overlap_read = false;
       doc                     = reinterpret_cast<Doc *>(reinterpret_cast<char 
*>(doc) + next_object_len);
       next_object_len         = stripe->round_to_approx_size(doc->len);
       int  i;
       bool changed;
       ...
       while (true) {
         // dir_probe debug info. stripe is 0x7ff5a0057030
         // 
===========================================================================================================
         /*
         [Oct 15 13:06:30.191] [ET_NET 5] DIAG: <CacheDir.cc:559 (dir_probe)> 
(cache_stats) ==> stripe = 0x7ff5a0057030, s = 46, b = 5309, seg = 
0x7ff59b852430, e = 0x7ff59b8861b8
         [Oct 15 13:06:30.191] [ET_NET 5] DIAG: <CacheDir.cc:561 (dir_probe)> 
(cache_stats) stripe->fd = 32
         [Oct 15 13:06:30.191] [ET_NET 5] DIAG: <CacheDir.cc:564 (dir_probe)> 
(cache_stats) boffset 244180
         [Oct 15 13:06:30.191] [ET_NET 5] DIAG: <CacheDir.cc:591 (dir_probe)> 
(dir_probe_hit) found (key=3447329293) D829FB03 DD1078BE vol 32 bucket 5309 
boffset 244180
         */
         // 
===========================================================================================================
         if (!dir_probe(&doc->first_key, stripe, &dir, &last_collision)) {
           goto Lskip;
         }
         if (!dir_agg_valid(stripe, &dir) || !dir_head(&dir) ||
             (stripe->vol_offset(&dir) != io.aiocb.aio_offset + 
(reinterpret_cast<char *>(doc) - buf->data()))) {
           continue;
         }
         // Break occurs here and exits the loop.
         break;
       }
       ...
       // vector.count() is 1.
       changed         = false;
       hostinfo_copied = false;
       for (i = 0; i < vector.count(); i++) {
         if (!vector.get(i)->valid()) {
           goto Lskip;
         }
         if (!hostinfo_copied) {
           memccpy(hname, vector.get(i)->request_get()->host_get(&hlen), 0, 
500);
           hname[hlen] = 0;
           Dbg(dbg_ctl_cache_scan, "hostname = '%s', hostlen = %d", hname, 
hlen);
           // [Oct 15 13:06:30.191] [ET_NET 5] DIAG: <CacheVC.cc:940 
(scanObject)> (cache_scan) hostname = 'test.aplatform.com', hostlen = 18
           hostinfo_copied = true;
         }
         ...
         earliest_key = key;
         int result1  = 
_action.continuation->handleEvent(CACHE_EVENT_SCAN_OBJECT, vector.get(i));
         /*
         // We simply check whether the object belongs to the pattern purge 
target.
         // If it matches, return CACHE_SCAN_RESULT_DELETE.
         // url info: path [image/1B] — this URL belongs to the pattern purge 
target.
         */
         switch (result1) {
         case CACHE_SCAN_RESULT_CONTINUE:
           continue;
         case CACHE_SCAN_RESULT_DELETE: // Execution enters here.
           changed = true;
           vector.remove(i, true);
           i--;
           continue;
         ...
         }
       if (changed) {
         if (!vector.count()) {
           ink_assert(hostinfo_copied);
           SET_HANDLER(&CacheVC::scanRemoveDone);
           // force remove even if there is a writer
           cacheProcessor.remove(this, &doc->first_key, CACHE_FRAG_TYPE_HTTP, 
hname, hlen); // Execution enters here.
           return EVENT_CONT;
         } else {
           ...
         }
       }
   ```
   
   2. Starting from the call to `cacheProcessor.remove(this, &doc->first_key, 
CACHE_FRAG_TYPE_HTTP, hname, hlen)`, the following behavior occurs...
   ```
   Action *
   CacheProcessor::remove(Continuation *cont, const CacheKey *key, 
CacheFragType frag_type, const char *hostname, int host_len)
   {
     Dbg(dbg_ctl_cache_remove, "[CacheProcessor::remove] Issuing cache delete 
for %u", cache_hash(*key));
     // [Oct 15 13:06:30.192] [ET_NET 5] DIAG: <Cache.cc:854 (remove)> 
(cache_remove) [CacheProcessor::remove] Issuing cache delete for 3447329293
     return caches[frag_type]->remove(cont, key, frag_type, hostname, host_len);
   }
   
   Action *
   Cache::remove(Continuation *cont, const CacheKey *key, CacheFragType type, 
const char *hostname, int host_len)
   {
     if (!CacheProcessor::IsCacheReady(type)) {
       if (cont) {
         cont->handleEvent(CACHE_EVENT_REMOVE_FAILED, nullptr);
       }
       return ACTION_RESULT_DONE;
     }
   
     Ptr<ProxyMutex> mutex;
     if (!cont) {
       cont = new_CacheRemoveCont();
     }
   
     CACHE_TRY_LOCK(lock, cont->mutex, this_ethread());
     ink_assert(lock.is_locked());
   
     // 
===========================================================================================================
     // This is where the problem occurs.
     // The stripe pointer is returned as 0x7ff5a0071030 instead of 
0x7ff5a0057030.
     // 
===========================================================================================================
     Stripe *stripe = key_to_stripe(key, hostname, host_len);
     /*
     [Oct 15 13:06:30.192] [ET_NET 5] DIAG: <Cache.cc:1880 (key_to_stripe)> 
(cache_hosting) Generic volume: b8028a88d for host: test.aplatform.com
     */
     // coverity[var_decl]
     Dir result;
     dir_clear(&result); // initialized here, set result empty so we can 
recognize missed lock
     mutex = cont->mutex;
   
     CacheVC *c   = new_CacheVC(cont);
     c->vio.op    = VIO::NONE;
     c->frag_type = type;
     c->op_type   = static_cast<int>(CacheOpType::Remove);
     Metrics::Gauge::increment(cache_rsb.status[c->op_type].active);
     
Metrics::Gauge::increment(stripe->cache_vol->vol_rsb.status[c->op_type].active);
     c->first_key = c->key = *key;
     c->stripe             = stripe; // This is the problematic section.
     c->dir                = result;
     c->f.remove           = 1;
   
     SET_CONTINUATION_HANDLER(c, &CacheVC::removeEvent);
     int ret = c->removeEvent(EVENT_IMMEDIATE, nullptr); // This function is 
invoked.
     if (ret == EVENT_DONE) {
       return ACTION_RESULT_DONE;
     } else {
       return &c->_action;
     }
   }
   ```
   
   3. After calling `c->removeEvent(EVENT_IMMEDIATE, nullptr)`, the following 
behavior occurs…
   ```
   int
   CacheVC::removeEvent(int /* event ATS_UNUSED */, Event * /* e ATS_UNUSED */)
   {
     ...
     Lcollision:
       // check for collision
       // In the normal case, execution should enter the if-statement.
       // However, the stripe information has changed,
       // so the code proceeds to the next routine after the if-statement.
       /*
       [Oct 15 13:06:30.192] [ET_NET 5] DIAG: <CacheDir.cc:559 (dir_probe)> 
(cache_stats) ==> stripe = 0x7ff5a0071030, s = 46, b = 5309, seg = 
0x7ff596890430, e = 0x7ff5968c41b8
       [Oct 15 13:06:30.192] [ET_NET 5] DIAG: <CacheDir.cc:561 (dir_probe)> 
(cache_stats) stripe->fd = 32
       [Oct 15 13:06:30.192] [ET_NET 5] DIAG: <CacheDir.cc:564 (dir_probe)> 
(cache_stats) boffset 0
       [Oct 15 13:06:30.192] [ET_NET 5] DIAG: <CacheDir.cc:621 (dir_probe)> 
(dir_probe_miss) missed (key=3447329293) D829FB03 DD1078BE on vol 32 bucket 
5309 at 0x7ff596890430
       */
       if (dir_probe(&key, stripe, &dir, &last_collision) > 0) {
         int ret = do_read_call(&key);
         if (ret == EVENT_RETURN) {
           goto Lread;
         }
         return ret;
       } 
   
     Ldone:
       
Metrics::Counter::increment(cache_rsb.status[static_cast<int>(CacheOpType::Remove)].failure);
       
Metrics::Counter::increment(stripe->cache_vol->vol_rsb.status[static_cast<int>(CacheOpType::Remove)].failure);
       if (od) {
         stripe->close_write(this);
       }
     }
     ink_assert(!stripe || this_ethread() != stripe->mutex->thread_holding);
     // Execution reaches here.
     // At this point, _action.continuation->handleEvent refers to the 
CacheVC::scanRemoveDone function.
     _action.continuation->handleEvent(CACHE_EVENT_REMOVE_FAILED, (void 
*)-ECACHE_NO_DOC);
     goto Lfree;
   Lremoved:
     _action.continuation->handleEvent(CACHE_EVENT_REMOVE, nullptr);
   Lfree:
     return free_CacheVC(this);
   }
   ```
   
   4. The `CacheVC::scanRemoveDone` function does not handle any event. It 
simply calls `CacheVC::scanObject` again.
   ```
   int
   CacheVC::scanRemoveDone(int /* event ATS_UNUSED */, Event * /* e ATS_UNUSED 
*/)
   {
     Dbg(dbg_ctl_cache_scan_truss, "inside %p:scanRemoveDone", this);
     Dbg(dbg_ctl_cache_scan, "remove done.");
     alternate.destroy();
     SET_HANDLER(&CacheVC::scanObject);
     return handleEvent(EVENT_IMMEDIATE, nullptr); 
   }
   ```
   
   5. It goes back to step 1 and repeats the same routine. Eventually, a `core 
dump` occurs.
   
   ### Unfortunately, we don't have any debug logs related to the stripe at the 
time the cache was stored.
   ### Therefore, we don't know why the stripe value during storage differs 
from the stripe returned by key_to_stripe() when it is called later.
   
   ## Temporary Fix
   ### This function is only for verifying that cache deletion works correctly 
— performance was not considered.
   ### Please note that this is for reference only. `PURGE_ISSUE` is **not 
defined**.
   ```
   Action *
   Cache::remove(Continuation *cont, const CacheKey *key, CacheFragType type, 
const char *hostname, int host_len)
   {
     if (!CacheProcessor::IsCacheReady(type)) {
       if (cont) {
         cont->handleEvent(CACHE_EVENT_REMOVE_FAILED, nullptr);
       }
       return ACTION_RESULT_DONE;
     }
   
     Ptr<ProxyMutex> mutex;
     if (!cont) {
       cont = new_CacheRemoveCont();
     }
   
     CACHE_TRY_LOCK(lock, cont->mutex, this_ethread());
     ink_assert(lock.is_locked());
     #ifndef PURGE_ISSUE
     Stripe *stripe = nullptr;
     #else
     Stripe *stripe = key_to_stripe(key, hostname, host_len);
     #endif
     // coverity[var_decl]
     Dir result;
     dir_clear(&result); // initialized here, set result empty so we can 
recognize missed lock
     mutex = cont->mutex;
   
     #ifndef PURGE_ISSUE
     if (auto *vc = dynamic_cast<CacheVC *>(cont)) {
       if (vc->stripe) {
         stripe = vc->stripe;
         Dbg(dbg_ctl_cache_remove, "Reusing stripe from VC: %p", stripe);
       }
     }
   
     if (!stripe) {
       stripe = key_to_stripe(key, hostname, host_len);
       Dbg(dbg_ctl_cache_remove, "Newly calculated stripe: %p", stripe);
     }
     #endif
   
     CacheVC *c   = new_CacheVC(cont);
     c->vio.op    = VIO::NONE;
     c->frag_type = type;
     c->op_type   = static_cast<int>(CacheOpType::Remove);
     Metrics::Gauge::increment(cache_rsb.status[c->op_type].active);
     
Metrics::Gauge::increment(stripe->cache_vol->vol_rsb.status[c->op_type].active);
     c->first_key = c->key = *key;
     c->stripe             = stripe;
     c->dir                = result;
     c->f.remove           = 1;
   
     SET_CONTINUATION_HANDLER(c, &CacheVC::removeEvent);
     int ret = c->removeEvent(EVENT_IMMEDIATE, nullptr);
     if (ret == EVENT_DONE) {
       return ACTION_RESULT_DONE;
     } else {
       return &c->_action;
     }
   }
   ```
   
   ## ATS Information
   * OS: Ubuntu 22.04
   * ats version: 10.0.4
   * [hosting.config] : empty
   * [cache.config] : empty
   * [storage.config]
   ```
   /dev/sdb id=cache.disk.0
   ```
   * [volume.config]
   ```
   volume=1 scheme=http size=10%
   volume=2 scheme=http size=10%
   volume=3 scheme=http size=10%
   volume=4 scheme=http size=10%
   volume=5 scheme=http size=10%
   volume=6 scheme=http size=10%
   volume=7 scheme=http size=10%
   volume=8 scheme=http size=10%
   volume=9 scheme=http size=10%
   ```
   
   We’d appreciate it if you could review this behavior and let us know if this 
is a known issue or if we’re missing something.
   
   Thank you for your time and support!


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to