This is an automated email from the ASF dual-hosted git repository.
zwoop pushed a commit to branch 9.2.x
in repository https://gitbox.apache.org/repos/asf/trafficserver.git
The following commit(s) were added to refs/heads/9.2.x by this push:
new 3ac7beba39 Make bad disk detection more robust (#10625)
3ac7beba39 is described below
commit 3ac7beba394a595f14af2c31cbe1155777acac51
Author: Mo Chen <[email protected]>
AuthorDate: Tue Oct 17 12:36:32 2023 -0500
Make bad disk detection more robust (#10625)
-Report disk I/O failure synchronously.
-Fix a signed/unsigned bug when checking disk I/O result. -Rename a couple
of symbols for clarity.
---
iocore/aio/AIO.cc | 10 +++++-----
iocore/aio/I_AIO.h | 2 +-
iocore/aio/P_AIO.h | 15 +++++++++------
iocore/cache/Cache.cc | 14 +++++++-------
iocore/cache/CacheDir.cc | 2 +-
iocore/cache/CacheDisk.cc | 6 +++---
iocore/cache/CacheVol.cc | 2 +-
iocore/cache/P_CacheVol.h | 4 ++--
8 files changed, 29 insertions(+), 26 deletions(-)
diff --git a/iocore/aio/AIO.cc b/iocore/aio/AIO.cc
index 20a02125c7..215b250847 100644
--- a/iocore/aio/AIO.cc
+++ b/iocore/aio/AIO.cc
@@ -53,8 +53,8 @@ int thread_is_created = 0;
RecInt cache_config_threads_per_disk = 12;
RecInt api_config_threads_per_disk = 12;
-RecRawStatBlock *aio_rsb = nullptr;
-Continuation *aio_err_callbck = nullptr;
+RecRawStatBlock *aio_rsb = nullptr;
+Continuation *aio_err_callback = nullptr;
// AIO Stats
uint64_t aio_num_read = 0;
uint64_t aio_bytes_read = 0;
@@ -140,9 +140,9 @@ new_AIOCallback()
}
void
-ink_aio_set_callback(Continuation *callback)
+ink_aio_set_err_callback(Continuation *callback)
{
- aio_err_callbck = callback;
+ aio_err_callback = callback;
}
void
@@ -401,7 +401,7 @@ cache_op(AIOCallbackInternal *op)
res += err;
}
op->aio_result = res;
- ink_assert(op->aio_result == (int64_t)a->aio_nbytes);
+ ink_assert(op->ok());
}
return 1;
}
diff --git a/iocore/aio/I_AIO.h b/iocore/aio/I_AIO.h
index 6144e188f6..1a21dd32e7 100644
--- a/iocore/aio/I_AIO.h
+++ b/iocore/aio/I_AIO.h
@@ -137,7 +137,7 @@ struct DiskHandler : public Continuation {
void ink_aio_init(ts::ModuleVersion version);
int ink_aio_start();
-void ink_aio_set_callback(Continuation *error_callback);
+void ink_aio_set_err_callback(Continuation *error_callback);
int ink_aio_read(AIOCallback *op,
int fromAPI = 0); // fromAPI is a boolean to indicate if this
is from a API call such as upload proxy feature
diff --git a/iocore/aio/P_AIO.h b/iocore/aio/P_AIO.h
index 18a8690e38..adb58e1149 100644
--- a/iocore/aio/P_AIO.h
+++ b/iocore/aio/P_AIO.h
@@ -41,10 +41,10 @@ static constexpr ts::ModuleVersion
AIO_MODULE_INTERNAL_VERSION{AIO_MODULE_PUBLIC
TS_INLINE int
AIOCallback::ok()
{
- return (off_t)aiocb.aio_nbytes == (off_t)aio_result;
+ return (aiocb.aio_nbytes == static_cast<size_t>(aio_result)) && (aio_result
>= 0);
}
-extern Continuation *aio_err_callbck;
+extern Continuation *aio_err_callback;
#if AIO_MODE == AIO_MODE_NATIVE
@@ -110,13 +110,16 @@ AIOCallbackInternal::io_complete(int event, void *data)
{
(void)event;
(void)data;
- if (aio_err_callbck && !ok()) {
+ if (aio_err_callback && !ok()) {
AIOCallback *err_op = new AIOCallbackInternal();
err_op->aiocb.aio_fildes = this->aiocb.aio_fildes;
err_op->aiocb.aio_lio_opcode = this->aiocb.aio_lio_opcode;
- err_op->mutex = aio_err_callbck->mutex;
- err_op->action = aio_err_callbck;
- eventProcessor.schedule_imm(err_op);
+ err_op->mutex = aio_err_callback->mutex;
+ err_op->action = aio_err_callback;
+
+ // Take this lock in-line because we want to stop other I/O operations on
this disk ASAP
+ SCOPED_MUTEX_LOCK(lock, aio_err_callback->mutex, this_ethread());
+ err_op->action.continuation->handleEvent(EVENT_NONE, err_op);
}
if (!action.cancelled) {
action.continuation->handleEvent(AIO_EVENT_DONE, this);
diff --git a/iocore/cache/Cache.cc b/iocore/cache/Cache.cc
index 20671bd780..384ee354a0 100644
--- a/iocore/cache/Cache.cc
+++ b/iocore/cache/Cache.cc
@@ -605,7 +605,7 @@ CacheProcessor::start_internal(int flags)
memset(sds, 0, sizeof(Span *) * gndisks);
gndisks = 0;
- ink_aio_set_callback(new AIO_Callback_handler());
+ ink_aio_set_err_callback(new AIO_failure_handler());
config_volumes.read_config_file();
@@ -1335,7 +1335,7 @@ Vol::handle_dir_clear(int event, void *data)
if (event == AIO_EVENT_DONE) {
op = static_cast<AIOCallback *>(data);
- if (static_cast<size_t>(op->aio_result) != op->aiocb.aio_nbytes) {
+ if (!op->ok()) {
Warning("unable to clear cache directory '%s'", hash_text.get());
disk->incrErrors(op);
fd = -1;
@@ -1364,7 +1364,7 @@ Vol::handle_dir_read(int event, void *data)
AIOCallback *op = static_cast<AIOCallback *>(data);
if (event == AIO_EVENT_DONE) {
- if (static_cast<size_t>(op->aio_result) != op->aiocb.aio_nbytes) {
+ if (!op->ok()) {
Note("Directory read failed: clearing cache directory %s",
this->hash_text.get());
clear_dir();
return EVENT_DONE;
@@ -1459,7 +1459,7 @@ Vol::handle_recover_from_data(int event, void * /* data
ATS_UNUSED */)
io.aiocb.aio_nbytes = (skip + len) - recover_pos;
}
} else if (event == AIO_EVENT_DONE) {
- if (io.aiocb.aio_nbytes != static_cast<size_t>(io.aio_result)) {
+ if (!io.ok()) {
Warning("disk read error on recover '%s', clearing", hash_text.get());
disk->incrErrors(&io);
goto Lclear;
@@ -1722,7 +1722,7 @@ Vol::handle_header_read(int event, void *data)
for (auto &i : hf) {
ink_assert(op != nullptr);
i = static_cast<VolHeaderFooter *>(op->aiocb.aio_buf);
- if (static_cast<size_t>(op->aio_result) != op->aiocb.aio_nbytes) {
+ if (!op->ok()) {
Note("Header read failed: clearing cache directory %s",
this->hash_text.get());
clear_dir();
return EVENT_DONE;
@@ -2004,7 +2004,7 @@ CacheProcessor::has_online_storage() const
}
int
-AIO_Callback_handler::handle_disk_failure(int /* event ATS_UNUSED */, void
*data)
+AIO_failure_handler::handle_disk_failure(int /* event ATS_UNUSED */, void
*data)
{
/* search for the matching file descriptor */
if (!CacheProcessor::cache_ready) {
@@ -2435,7 +2435,7 @@ CacheVC::removeEvent(int /* event ATS_UNUSED */, Event *
/* e ATS_UNUSED */)
goto Lcollision;
}
// check read completed correct FIXME: remove bad vols
- if (static_cast<size_t>(io.aio_result) != io.aiocb.aio_nbytes) {
+ if (!io.ok()) {
goto Ldone;
}
{
diff --git a/iocore/cache/CacheDir.cc b/iocore/cache/CacheDir.cc
index b59a8e809c..e98b03ce82 100644
--- a/iocore/cache/CacheDir.cc
+++ b/iocore/cache/CacheDir.cc
@@ -1086,7 +1086,7 @@ Lrestart:
if (event == AIO_EVENT_DONE) {
// AIO Thread
- if (io.aio_result != static_cast<int64_t>(io.aiocb.aio_nbytes)) {
+ if (!io.ok()) {
Warning("vol write error during directory sync '%s'",
gvol[vol_idx]->hash_text.get());
event = EVENT_NONE;
goto Ldone;
diff --git a/iocore/cache/CacheDisk.cc b/iocore/cache/CacheDisk.cc
index c9b1c75d3a..abb89f3c0f 100644
--- a/iocore/cache/CacheDisk.cc
+++ b/iocore/cache/CacheDisk.cc
@@ -147,7 +147,7 @@ CacheDisk::clearDone(int event, void * /* data ATS_UNUSED
*/)
{
ink_assert(event == AIO_EVENT_DONE);
- if (io.aiocb.aio_nbytes != static_cast<size_t>(io.aio_result)) {
+ if (!io.ok()) {
Warning("Could not clear disk header for disk %s: declaring disk bad",
path);
incrErrors(&io);
SET_DISK_BAD(this);
@@ -163,7 +163,7 @@ CacheDisk::openStart(int event, void * /* data ATS_UNUSED
*/)
{
ink_assert(event == AIO_EVENT_DONE);
- if (io.aiocb.aio_nbytes != static_cast<size_t>(io.aio_result)) {
+ if (!io.ok()) {
Warning("could not read disk header for disk %s: declaring disk bad",
path);
// the header could have random values by the AIO read error
@@ -237,7 +237,7 @@ CacheDisk::syncDone(int event, void * /* data ATS_UNUSED */)
{
ink_assert(event == AIO_EVENT_DONE);
- if (io.aiocb.aio_nbytes != static_cast<size_t>(io.aio_result)) {
+ if (!io.ok()) {
Warning("Error writing disk header for disk %s:disk bad", path);
incrErrors(&io);
SET_DISK_BAD(this);
diff --git a/iocore/cache/CacheVol.cc b/iocore/cache/CacheVol.cc
index 9d45c88002..fb7e93770f 100644
--- a/iocore/cache/CacheVol.cc
+++ b/iocore/cache/CacheVol.cc
@@ -202,7 +202,7 @@ CacheVC::scanObject(int /* event ATS_UNUSED */, Event * /*
e ATS_UNUSED */)
goto Lread;
}
- if (static_cast<size_t>(io.aio_result) != io.aiocb.aio_nbytes) {
+ if (!io.ok()) {
result = (void *)-ECACHE_READ_FAIL;
goto Ldone;
}
diff --git a/iocore/cache/P_CacheVol.h b/iocore/cache/P_CacheVol.h
index b2f69b13ce..15c02e2556 100644
--- a/iocore/cache/P_CacheVol.h
+++ b/iocore/cache/P_CacheVol.h
@@ -278,10 +278,10 @@ struct Vol : public Continuation {
~Vol() override { ats_free(agg_buffer); }
};
-struct AIO_Callback_handler : public Continuation {
+struct AIO_failure_handler : public Continuation {
int handle_disk_failure(int event, void *data);
- AIO_Callback_handler() : Continuation(new_ProxyMutex()) {
SET_HANDLER(&AIO_Callback_handler::handle_disk_failure); }
+ AIO_failure_handler() : Continuation(new_ProxyMutex()) {
SET_HANDLER(&AIO_failure_handler::handle_disk_failure); }
};
struct CacheVol {