This is an automated email from the ASF dual-hosted git repository.

zwoop pushed a commit to branch 9.2.x
in repository https://gitbox.apache.org/repos/asf/trafficserver.git


The following commit(s) were added to refs/heads/9.2.x by this push:
     new 3ac7beba39 Make bad disk detection more robust (#10625)
3ac7beba39 is described below

commit 3ac7beba394a595f14af2c31cbe1155777acac51
Author: Mo Chen <[email protected]>
AuthorDate: Tue Oct 17 12:36:32 2023 -0500

    Make bad disk detection more robust (#10625)
    
    -Report disk I/O failure synchronously.
    -Fix a signed/unsigned bug when checking disk I/O result. -Rename a couple 
of symbols for clarity.
---
 iocore/aio/AIO.cc         | 10 +++++-----
 iocore/aio/I_AIO.h        |  2 +-
 iocore/aio/P_AIO.h        | 15 +++++++++------
 iocore/cache/Cache.cc     | 14 +++++++-------
 iocore/cache/CacheDir.cc  |  2 +-
 iocore/cache/CacheDisk.cc |  6 +++---
 iocore/cache/CacheVol.cc  |  2 +-
 iocore/cache/P_CacheVol.h |  4 ++--
 8 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/iocore/aio/AIO.cc b/iocore/aio/AIO.cc
index 20a02125c7..215b250847 100644
--- a/iocore/aio/AIO.cc
+++ b/iocore/aio/AIO.cc
@@ -53,8 +53,8 @@ int thread_is_created = 0;
 RecInt cache_config_threads_per_disk = 12;
 RecInt api_config_threads_per_disk   = 12;
 
-RecRawStatBlock *aio_rsb      = nullptr;
-Continuation *aio_err_callbck = nullptr;
+RecRawStatBlock *aio_rsb       = nullptr;
+Continuation *aio_err_callback = nullptr;
 // AIO Stats
 uint64_t aio_num_read      = 0;
 uint64_t aio_bytes_read    = 0;
@@ -140,9 +140,9 @@ new_AIOCallback()
 }
 
 void
-ink_aio_set_callback(Continuation *callback)
+ink_aio_set_err_callback(Continuation *callback)
 {
-  aio_err_callbck = callback;
+  aio_err_callback = callback;
 }
 
 void
@@ -401,7 +401,7 @@ cache_op(AIOCallbackInternal *op)
       res += err;
     }
     op->aio_result = res;
-    ink_assert(op->aio_result == (int64_t)a->aio_nbytes);
+    ink_assert(op->ok());
   }
   return 1;
 }
diff --git a/iocore/aio/I_AIO.h b/iocore/aio/I_AIO.h
index 6144e188f6..1a21dd32e7 100644
--- a/iocore/aio/I_AIO.h
+++ b/iocore/aio/I_AIO.h
@@ -137,7 +137,7 @@ struct DiskHandler : public Continuation {
 
 void ink_aio_init(ts::ModuleVersion version);
 int ink_aio_start();
-void ink_aio_set_callback(Continuation *error_callback);
+void ink_aio_set_err_callback(Continuation *error_callback);
 
 int ink_aio_read(AIOCallback *op,
                  int fromAPI = 0); // fromAPI is a boolean to indicate if this 
is from a API call such as upload proxy feature
diff --git a/iocore/aio/P_AIO.h b/iocore/aio/P_AIO.h
index 18a8690e38..adb58e1149 100644
--- a/iocore/aio/P_AIO.h
+++ b/iocore/aio/P_AIO.h
@@ -41,10 +41,10 @@ static constexpr ts::ModuleVersion 
AIO_MODULE_INTERNAL_VERSION{AIO_MODULE_PUBLIC
 TS_INLINE int
 AIOCallback::ok()
 {
-  return (off_t)aiocb.aio_nbytes == (off_t)aio_result;
+  return (aiocb.aio_nbytes == static_cast<size_t>(aio_result)) && (aio_result 
>= 0);
 }
 
-extern Continuation *aio_err_callbck;
+extern Continuation *aio_err_callback;
 
 #if AIO_MODE == AIO_MODE_NATIVE
 
@@ -110,13 +110,16 @@ AIOCallbackInternal::io_complete(int event, void *data)
 {
   (void)event;
   (void)data;
-  if (aio_err_callbck && !ok()) {
+  if (aio_err_callback && !ok()) {
     AIOCallback *err_op          = new AIOCallbackInternal();
     err_op->aiocb.aio_fildes     = this->aiocb.aio_fildes;
     err_op->aiocb.aio_lio_opcode = this->aiocb.aio_lio_opcode;
-    err_op->mutex                = aio_err_callbck->mutex;
-    err_op->action               = aio_err_callbck;
-    eventProcessor.schedule_imm(err_op);
+    err_op->mutex                = aio_err_callback->mutex;
+    err_op->action               = aio_err_callback;
+
+    // Take this lock in-line because we want to stop other I/O operations on 
this disk ASAP
+    SCOPED_MUTEX_LOCK(lock, aio_err_callback->mutex, this_ethread());
+    err_op->action.continuation->handleEvent(EVENT_NONE, err_op);
   }
   if (!action.cancelled) {
     action.continuation->handleEvent(AIO_EVENT_DONE, this);
diff --git a/iocore/cache/Cache.cc b/iocore/cache/Cache.cc
index 20671bd780..384ee354a0 100644
--- a/iocore/cache/Cache.cc
+++ b/iocore/cache/Cache.cc
@@ -605,7 +605,7 @@ CacheProcessor::start_internal(int flags)
   memset(sds, 0, sizeof(Span *) * gndisks);
 
   gndisks = 0;
-  ink_aio_set_callback(new AIO_Callback_handler());
+  ink_aio_set_err_callback(new AIO_failure_handler());
 
   config_volumes.read_config_file();
 
@@ -1335,7 +1335,7 @@ Vol::handle_dir_clear(int event, void *data)
 
   if (event == AIO_EVENT_DONE) {
     op = static_cast<AIOCallback *>(data);
-    if (static_cast<size_t>(op->aio_result) != op->aiocb.aio_nbytes) {
+    if (!op->ok()) {
       Warning("unable to clear cache directory '%s'", hash_text.get());
       disk->incrErrors(op);
       fd = -1;
@@ -1364,7 +1364,7 @@ Vol::handle_dir_read(int event, void *data)
   AIOCallback *op = static_cast<AIOCallback *>(data);
 
   if (event == AIO_EVENT_DONE) {
-    if (static_cast<size_t>(op->aio_result) != op->aiocb.aio_nbytes) {
+    if (!op->ok()) {
       Note("Directory read failed: clearing cache directory %s", 
this->hash_text.get());
       clear_dir();
       return EVENT_DONE;
@@ -1459,7 +1459,7 @@ Vol::handle_recover_from_data(int event, void * /* data 
ATS_UNUSED */)
       io.aiocb.aio_nbytes = (skip + len) - recover_pos;
     }
   } else if (event == AIO_EVENT_DONE) {
-    if (io.aiocb.aio_nbytes != static_cast<size_t>(io.aio_result)) {
+    if (!io.ok()) {
       Warning("disk read error on recover '%s', clearing", hash_text.get());
       disk->incrErrors(&io);
       goto Lclear;
@@ -1722,7 +1722,7 @@ Vol::handle_header_read(int event, void *data)
     for (auto &i : hf) {
       ink_assert(op != nullptr);
       i = static_cast<VolHeaderFooter *>(op->aiocb.aio_buf);
-      if (static_cast<size_t>(op->aio_result) != op->aiocb.aio_nbytes) {
+      if (!op->ok()) {
         Note("Header read failed: clearing cache directory %s", 
this->hash_text.get());
         clear_dir();
         return EVENT_DONE;
@@ -2004,7 +2004,7 @@ CacheProcessor::has_online_storage() const
 }
 
 int
-AIO_Callback_handler::handle_disk_failure(int /* event ATS_UNUSED */, void 
*data)
+AIO_failure_handler::handle_disk_failure(int /* event ATS_UNUSED */, void 
*data)
 {
   /* search for the matching file descriptor */
   if (!CacheProcessor::cache_ready) {
@@ -2435,7 +2435,7 @@ CacheVC::removeEvent(int /* event ATS_UNUSED */, Event * 
/* e ATS_UNUSED */)
       goto Lcollision;
     }
     // check read completed correct FIXME: remove bad vols
-    if (static_cast<size_t>(io.aio_result) != io.aiocb.aio_nbytes) {
+    if (!io.ok()) {
       goto Ldone;
     }
     {
diff --git a/iocore/cache/CacheDir.cc b/iocore/cache/CacheDir.cc
index b59a8e809c..e98b03ce82 100644
--- a/iocore/cache/CacheDir.cc
+++ b/iocore/cache/CacheDir.cc
@@ -1086,7 +1086,7 @@ Lrestart:
 
   if (event == AIO_EVENT_DONE) {
     // AIO Thread
-    if (io.aio_result != static_cast<int64_t>(io.aiocb.aio_nbytes)) {
+    if (!io.ok()) {
       Warning("vol write error during directory sync '%s'", 
gvol[vol_idx]->hash_text.get());
       event = EVENT_NONE;
       goto Ldone;
diff --git a/iocore/cache/CacheDisk.cc b/iocore/cache/CacheDisk.cc
index c9b1c75d3a..abb89f3c0f 100644
--- a/iocore/cache/CacheDisk.cc
+++ b/iocore/cache/CacheDisk.cc
@@ -147,7 +147,7 @@ CacheDisk::clearDone(int event, void * /* data ATS_UNUSED 
*/)
 {
   ink_assert(event == AIO_EVENT_DONE);
 
-  if (io.aiocb.aio_nbytes != static_cast<size_t>(io.aio_result)) {
+  if (!io.ok()) {
     Warning("Could not clear disk header for disk %s: declaring disk bad", 
path);
     incrErrors(&io);
     SET_DISK_BAD(this);
@@ -163,7 +163,7 @@ CacheDisk::openStart(int event, void * /* data ATS_UNUSED 
*/)
 {
   ink_assert(event == AIO_EVENT_DONE);
 
-  if (io.aiocb.aio_nbytes != static_cast<size_t>(io.aio_result)) {
+  if (!io.ok()) {
     Warning("could not read disk header for disk %s: declaring disk bad", 
path);
 
     // the header could have random values by the AIO read error
@@ -237,7 +237,7 @@ CacheDisk::syncDone(int event, void * /* data ATS_UNUSED */)
 {
   ink_assert(event == AIO_EVENT_DONE);
 
-  if (io.aiocb.aio_nbytes != static_cast<size_t>(io.aio_result)) {
+  if (!io.ok()) {
     Warning("Error writing disk header for disk %s:disk bad", path);
     incrErrors(&io);
     SET_DISK_BAD(this);
diff --git a/iocore/cache/CacheVol.cc b/iocore/cache/CacheVol.cc
index 9d45c88002..fb7e93770f 100644
--- a/iocore/cache/CacheVol.cc
+++ b/iocore/cache/CacheVol.cc
@@ -202,7 +202,7 @@ CacheVC::scanObject(int /* event ATS_UNUSED */, Event * /* 
e ATS_UNUSED */)
     goto Lread;
   }
 
-  if (static_cast<size_t>(io.aio_result) != io.aiocb.aio_nbytes) {
+  if (!io.ok()) {
     result = (void *)-ECACHE_READ_FAIL;
     goto Ldone;
   }
diff --git a/iocore/cache/P_CacheVol.h b/iocore/cache/P_CacheVol.h
index b2f69b13ce..15c02e2556 100644
--- a/iocore/cache/P_CacheVol.h
+++ b/iocore/cache/P_CacheVol.h
@@ -278,10 +278,10 @@ struct Vol : public Continuation {
   ~Vol() override { ats_free(agg_buffer); }
 };
 
-struct AIO_Callback_handler : public Continuation {
+struct AIO_failure_handler : public Continuation {
   int handle_disk_failure(int event, void *data);
 
-  AIO_Callback_handler() : Continuation(new_ProxyMutex()) { 
SET_HANDLER(&AIO_Callback_handler::handle_disk_failure); }
+  AIO_failure_handler() : Continuation(new_ProxyMutex()) { 
SET_HANDLER(&AIO_failure_handler::handle_disk_failure); }
 };
 
 struct CacheVol {

Reply via email to