Hello,

Please find my comments below:

On 6/16/2026 11:01 AM, Mathieu Poirier wrote:
> On Wed, Jun 10, 2026 at 11:27:11AM -0700, Tanmay Shah wrote:
>> Remote processor will report the crash reason via the resource table
>> and notify the host via mailbox notification. The host checks this
>> crash reason on every mailbox notification from the remote and report
>> to the rproc core framework. Then the rproc core framework will start
>> the recovery process.
>>
>> Signed-off-by: Tanmay Shah <[email protected]>
>> ---
>>
>> changes in v5
>>   - use local variable to access crash report pointer
>>   - End crash report string with '\0' without relying on fw
>>
>>  drivers/remoteproc/xlnx_r5_remoteproc.c | 73 ++++++++++++++++++++++++-
>>  1 file changed, 72 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/remoteproc/xlnx_r5_remoteproc.c 
>> b/drivers/remoteproc/xlnx_r5_remoteproc.c
>> index 3349d1877751..86afff9f3b40 100644
>> --- a/drivers/remoteproc/xlnx_r5_remoteproc.c
>> +++ b/drivers/remoteproc/xlnx_r5_remoteproc.c
>> @@ -112,6 +112,10 @@ struct rsc_tbl_data {
>>      const uintptr_t rsc_tbl;
>>  } __packed;
>>  
>> +enum xlnx_rproc_fw_rsc {
>> +    XLNX_RPROC_FW_CRASH_REASON = RSC_VENDOR_START,
> 
> I would call this XLNX_RPROC_FW_CRASH_REPORT
> 

Ack.

>> +};
>> +
>>  /*
>>   * Hardcoded TCM bank values. This will stay in driver to maintain backward
>>   * compatibility with device-tree that does not have TCM information.
>> @@ -131,9 +135,27 @@ static const struct mem_bank_data 
>> zynqmp_tcm_banks_lockstep[] = {
>>      {0xffe30000UL, 0x30000, 0x10000UL, PD_R5_1_BTCM, "btcm1"},
>>  };
>>  
>> +#define CRASH_REASON_STR_LEN 16
>> +
>> +/**
>> + * struct xlnx_rproc_crash_report - resource to know crash status and reason
>> + *
>> + * @version: version of this resource
>> + * @crash_state: if true, the rproc is notifying crash, time to recover
>> + * @crash_reason: number to describe reason of crash
>> + * @crash_reason_str: short string description of crash reason
>> + */
>> +struct xlnx_rproc_crash_report {
>> +    u8 version;
>> +    u8 crash_state;
>> +    u8 crash_reason;
> 
> Do you need 2 variables?  Would it be possible for @crash_reason != 0 to
> indicate a crash, making @cash_state irrelevant?
> 

Actually initially I had defined crash_reason only, but when I looked at
how crash reason/type is defined in the kernel, I got idea to keep
crash_state separate than crash_reason:

https://git.kernel.org/pub/scm/linux/kernel/git/remoteproc/linux.git/tree/include/linux/remoteproc.h?h=for-next#n183

enum rproc_crash_type {
        RPROC_MMUFAULT,
        RPROC_WATCHDOG,
        RPROC_FATAL_ERROR,
};

So, if crash_reason is defined like above, then it's easy to make
mistake and not define no_crash = 0. Different firmware projects can
treat crash_reason differently. I would like to keep crash_state and
crash_reason separate and not impose policy on firmware projects on how
to define crash_reason.

>> +    char crash_reason_str[CRASH_REASON_STR_LEN];
>> +} __packed;
>> +
>>  /**
>>   * struct zynqmp_r5_core - remoteproc core's internal data
>>   *
>> + * @crash_report: rproc crash state and reason
>>   * @rsc_tbl_va: resource table virtual address
>>   * @sram: Array of sram memories assigned to this core
>>   * @num_sram: number of sram for this core
>> @@ -147,6 +169,7 @@ static const struct mem_bank_data 
>> zynqmp_tcm_banks_lockstep[] = {
>>   * @ipi: pointer to mailbox information
>>   */
>>  struct zynqmp_r5_core {
>> +    struct xlnx_rproc_crash_report *crash_report;
>>      void __iomem *rsc_tbl_va;
>>      struct zynqmp_sram_bank *sram;
>>      int num_sram;
>> @@ -204,11 +227,27 @@ static int event_notified_idr_cb(int id, void *ptr, 
>> void *data)
>>   */
>>  static void handle_event_notified(struct work_struct *work)
>>  {
>> +    struct xlnx_rproc_crash_report *report;
>> +    struct zynqmp_r5_core *r5_core;
>>      struct mbox_info *ipi;
>>      struct rproc *rproc;
>>  
>>      ipi = container_of(work, struct mbox_info, mbox_work);
>>      rproc = ipi->r5_core->rproc;
>> +    r5_core = ipi->r5_core;
>> +    report = r5_core->crash_report;
>> +
>> +    /* report crash only if expected */
>> +    if (report && report->crash_state) {
>> +            if (rproc->state == RPROC_ATTACHED || rproc->state == 
>> RPROC_RUNNING) {
>> +                    report->crash_reason_str[CRASH_REASON_STR_LEN - 1] = 
>> '\0';
>> +                    dev_warn(&rproc->dev, "crash reason id: %d %s\n",
>> +                             report->crash_reason, 
>> report->crash_reason_str);
>> +                    rproc_report_crash(rproc, RPROC_FATAL_ERROR);
>> +                    report->crash_state = false;
>> +                    return;
>> +            }
>> +    }
>>  
>>      /*
>>       * We only use IPI for interrupt. The RPU firmware side may or may
>> @@ -448,6 +487,13 @@ static int zynqmp_r5_rproc_stop(struct rproc *rproc)
>>      if (ret)
>>              dev_err(r5_core->dev, "core force power down failed\n");
>>  
>> +    /*
>> +     * Clear attach on recovery flag during stop operation. The next state
>> +     * of the remote processor is expected to be "Running" state. In this
>> +     * state boot recovery method must take place over attach on recovery.
>> +     */
>> +    test_and_clear_bit(RPROC_FEAT_ATTACH_ON_RECOVERY, rproc->features);
> 
> Why is there a need to play set/clear the RPROC_FEAT_ATTACH_ON_RECOVERY flag
> here and in zynqmp_r5_attach() and zynqmp_r5_detach()?  I think we talked 
> about
> that before but can't remember the outcome of that conversation.
> 

The remote processor can go through following life cycle:

attach() -> stop() -> start(). When this happens, ATTACH_ON_RECOVERY is
not valid anymore. At this point, it should be BOOT_RECOVERY which is
indicated by clearing ATTACH_RECOVERY flag. That is why it is important
to clear this bit on stop().

Now in detach() it is not needed. I am just clearing the flag as part of
a cleanup sequence.

 >> +
>>      return ret;
>>  }
>>  
>> @@ -869,6 +915,9 @@ static int zynqmp_r5_get_rsc_table_va(struct 
>> zynqmp_r5_core *r5_core)
>>  
>>  static int zynqmp_r5_attach(struct rproc *rproc)
>>  {
>> +    /* Enable attach on recovery method. Clear it during rproc stop. */
>> +    rproc_set_feature(rproc, RPROC_FEAT_ATTACH_ON_RECOVERY);
>> +
>>      dev_dbg(&rproc->dev, "rproc %d attached\n", rproc->index);
>>  
>>      return 0;
>> @@ -883,9 +932,30 @@ static int zynqmp_r5_detach(struct rproc *rproc)
>>       */
>>      zynqmp_r5_rproc_kick(rproc, 0);
>>  
>> +    clear_bit(RPROC_FEAT_ATTACH_ON_RECOVERY, rproc->features);
>> +
>>      return 0;
>>  }
>>  
>> +static int zynqmp_r5_handle_rsc(struct rproc *rproc, u32 rsc_type, void 
>> *rsc,
>> +                            int offset, int avail)
>> +{
>> +    struct zynqmp_r5_core *r5_core = rproc->priv;
>> +    void *rsc_offset = (r5_core->rsc_tbl_va + offset);
>> +
> 
>         if (rsc_type != XLNX_RPROC_FW_CRASH_REASON)
>                 return RSC_IGNORED;
> 

Ack.

>> +    if (rsc_type == XLNX_RPROC_FW_CRASH_REASON) {
>> +            r5_core->crash_report = rsc_offset;
>> +            /* reset all values */
>> +            r5_core->crash_report->crash_state = false;
>> +            r5_core->crash_report->crash_reason = 0;
>> +            r5_core->crash_report->crash_reason_str[0] = '\0';
>> +    } else {
>> +            return RSC_IGNORED;
>> +    }
>> +
>> +    return RSC_HANDLED;
>> +}
>> +
>>  static const struct rproc_ops zynqmp_r5_rproc_ops = {
>>      .prepare        = zynqmp_r5_rproc_prepare,
>>      .unprepare      = zynqmp_r5_rproc_unprepare,
>> @@ -900,6 +970,7 @@ static const struct rproc_ops zynqmp_r5_rproc_ops = {
>>      .get_loaded_rsc_table = zynqmp_r5_get_loaded_rsc_table,
>>      .attach         = zynqmp_r5_attach,
>>      .detach         = zynqmp_r5_detach,
>> +    .handle_rsc     = zynqmp_r5_handle_rsc,
>>  };
>>  
>>  /**
>> @@ -939,7 +1010,7 @@ static struct zynqmp_r5_core 
>> *zynqmp_r5_alloc_rproc_core(struct device *cdev)
>>  
>>      rproc_coredump_set_elf_info(r5_rproc, ELFCLASS32, EM_ARM);
>>  
>> -    r5_rproc->recovery_disabled = true;
>> +    r5_rproc->recovery_disabled = false;
> 
> I'm good with the overall architecture of this set.
> 

Okay. If above comments looks good, I will send v6 soon.

> Regards,
> Mathieu
> 
>>      r5_rproc->has_iommu = false;
>>      r5_rproc->auto_boot = false;
>>  
>> -- 
>> 2.34.1
>>


Reply via email to