Hi Kang,
On Tue, Jun 10, 2025 at 11:55 AM Kang Yang <kang.y...@oss.qualcomm.com> wrote: > > In rare cases, ath10k may lose connection with the PCIe bus due to > some unknown reasons, which could further lead to system crashes during > resuming due to watchdog timeout: > > ath10k_pci 0000:01:00.0: wmi command 20486 timeout, restarting hardware > ath10k_pci 0000:01:00.0: already restarting > ath10k_pci 0000:01:00.0: failed to stop WMI vdev 0: -11 > ath10k_pci 0000:01:00.0: failed to stop vdev 0: -11 > ieee80211 phy0: PM: **** DPM device timeout **** > Call Trace: > panic+0x125/0x315 > dpm_watchdog_set+0x54/0x54 > dpm_watchdog_handler+0x57/0x57 > call_timer_fn+0x31/0x13c > > At this point, all WMI commands will timeout and attempt to restart > device. So set a threshold for consecutive restart failures. If the > threshold is exceeded, consider the hardware is unreliable and all > ath10k operations should be skipped to avoid system crash. > > fail_cont_count and pending_recovery are atomic variables, and > do not involve complex conditional logic. Therefore, even if recovery > check and reconfig complete are executed concurrently, the recovery > mechanism will not be broken. > > Tested-on: QCA6174 hw3.2 PCI WLAN.RM.4.4.1-00288-QCARMSWPZ-1 > > Signed-off-by: Kang Yang <kang.y...@oss.qualcomm.com> > --- > drivers/net/wireless/ath/ath10k/core.c | 50 +++++++++++++++++++++++--- > drivers/net/wireless/ath/ath10k/core.h | 11 ++++-- > drivers/net/wireless/ath/ath10k/mac.c | 7 +++- > drivers/net/wireless/ath/ath10k/wmi.c | 6 ++++ > 4 files changed, 65 insertions(+), 9 deletions(-) > > diff --git a/drivers/net/wireless/ath/ath10k/core.c > b/drivers/net/wireless/ath/ath10k/core.c > index fe3a8f4a1cc1..f925a3cf9ebd 100644 > --- a/drivers/net/wireless/ath/ath10k/core.c > +++ b/drivers/net/wireless/ath/ath10k/core.c > @@ -4,6 +4,7 @@ > * Copyright (c) 2011-2017 Qualcomm Atheros, Inc. > * Copyright (c) 2018-2019, The Linux Foundation. All rights reserved. > * Copyright (c) 2021-2024 Qualcomm Innovation Center, Inc. All rights > reserved. > + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. > */ > > #include <linux/module.h> > @@ -2491,12 +2492,51 @@ static int ath10k_init_hw_params(struct ath10k *ar) > return 0; > } > > +static bool ath10k_core_needs_recovery(struct ath10k *ar) > +{ > + long time_left; > + > + /* Sometimes the recovery will fail and then the next all recovery > fail, > + * so avoid infinite recovery. > + */ > + if (atomic_read(&ar->fail_cont_count) >= > ATH10K_RECOVERY_MAX_FAIL_COUNT) { > + ath10k_err(ar, "consecutive fail %d times, will shutdown > driver!", > + atomic_read(&ar->fail_cont_count)); > + ar->state = ATH10K_STATE_WEDGED; > + return false; > + } > + > + ath10k_dbg(ar, ATH10K_DBG_BOOT, "total recovery count: %d", You don't need a newline. > + ++ar->recovery_count); > + > + if (atomic_read(&ar->pending_recovery)) { > + /* Sometimes it happened another recovery work before the > previous one > + * completed, then the second recovery work will destroy the > previous > + * one, thus below is to avoid that. > + */ > + time_left = wait_for_completion_timeout(&ar->driver_recovery, > + > ATH10K_RECOVERY_TIMEOUT_HZ); > + if (time_left) { > + ath10k_warn(ar, "previous recovery succeeded, skip > this!\n"); > + return false; > + } > + > + /* Record the continuous recovery fail count when recovery > failed. */ > + atomic_inc(&ar->fail_cont_count); > + > + /* Avoid having multiple recoveries at the same time. */ > + return false; > + } > + > + atomic_inc(&ar->pending_recovery); > + > + return true; > +} > + > void ath10k_core_start_recovery(struct ath10k *ar) > { > - if (test_and_set_bit(ATH10K_FLAG_RESTARTING, &ar->dev_flags)) { > - ath10k_warn(ar, "already restarting\n"); > + if (!ath10k_core_needs_recovery(ar)) > return; > - } > > queue_work(ar->workqueue, &ar->restart_work); > } > @@ -2532,6 +2572,8 @@ static void ath10k_core_restart(struct work_struct > *work) > struct ath10k *ar = container_of(work, struct ath10k, restart_work); > int ret; > > + reinit_completion(&ar->driver_recovery); > + > set_bit(ATH10K_FLAG_CRASH_FLUSH, &ar->dev_flags); > > /* Place a barrier to make sure the compiler doesn't reorder > @@ -2596,8 +2638,6 @@ static void ath10k_core_restart(struct work_struct > *work) > if (ret) > ath10k_warn(ar, "failed to send firmware crash dump via > devcoredump: %d", > ret); > - > - complete(&ar->driver_recovery); > } > > static void ath10k_core_set_coverage_class_work(struct work_struct *work) > diff --git a/drivers/net/wireless/ath/ath10k/core.h > b/drivers/net/wireless/ath/ath10k/core.h > index 446dca74f06a..06ac95707531 100644 > --- a/drivers/net/wireless/ath/ath10k/core.h > +++ b/drivers/net/wireless/ath/ath10k/core.h > @@ -4,6 +4,7 @@ > * Copyright (c) 2011-2017 Qualcomm Atheros, Inc. > * Copyright (c) 2018-2019, The Linux Foundation. All rights reserved. > * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. > + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. > */ > > #ifndef _CORE_H_ > @@ -87,6 +88,8 @@ > IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER) > #define ATH10K_ITER_RESUME_FLAGS (IEEE80211_IFACE_ITER_RESUME_ALL |\ > IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER) > +#define ATH10K_RECOVERY_TIMEOUT_HZ (5 * HZ) > +#define ATH10K_RECOVERY_MAX_FAIL_COUNT 4 > > struct ath10k; > > @@ -865,9 +868,6 @@ enum ath10k_dev_flags { > /* Per Station statistics service */ > ATH10K_FLAG_PEER_STATS, > > - /* Indicates that ath10k device is during recovery process and not > complete */ > - ATH10K_FLAG_RESTARTING, > - > /* protected by conf_mutex */ > ATH10K_FLAG_NAPI_ENABLED, > }; > @@ -1211,6 +1211,11 @@ struct ath10k { > struct work_struct bundle_tx_work; > struct work_struct tx_complete_work; > > + atomic_t pending_recovery; > + u8 recovery_count; No reason to be a u8 IMO, use unsigned int. > + /* continuous recovery fail count */ > + atomic_t fail_cont_count; > +