On Thu, 22 May 2025, Bjorn Helgaas wrote: > From: Bjorn Helgaas <bhelg...@google.com> > > Previously aer_get_device_error_info() and aer_print_error() took a pointer > to struct aer_err_info and a pointer to a pci_dev. Typically the pci_dev > was one of the elements of the aer_err_info.dev[] array (DPC was an > exception, where the dev[] array was unused). > > Convert aer_get_device_error_info() and aer_print_error() to take an index > into the aer_err_info.dev[] array instead. A future patch will add > per-device ratelimit information, so the index makes it convenient to find > the ratelimit associated with the device. > > To accommodate DPC, set info->dev[0] to the DPC port before using these > interfaces. > > Signed-off-by: Bjorn Helgaas <bhelg...@google.com> > --- > drivers/pci/pci.h | 4 ++-- > drivers/pci/pcie/aer.c | 33 +++++++++++++++++++++++---------- > drivers/pci/pcie/dpc.c | 8 ++++++-- > 3 files changed, 31 insertions(+), 14 deletions(-) > > diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h > index 1a9bfc708757..e1a28215967f 100644 > --- a/drivers/pci/pci.h > +++ b/drivers/pci/pci.h > @@ -605,8 +605,8 @@ struct aer_err_info { > struct pcie_tlp_log tlp; /* TLP Header */ > }; > > -int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info > *info); > -void aer_print_error(struct pci_dev *dev, struct aer_err_info *info); > +int aer_get_device_error_info(struct aer_err_info *info, int i); > +void aer_print_error(struct aer_err_info *info, int i); > > int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2, > unsigned int tlp_len, bool flit, > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c > index 787a953fb331..237741e66d28 100644 > --- a/drivers/pci/pcie/aer.c > +++ b/drivers/pci/pcie/aer.c > @@ -705,12 +705,18 @@ static void aer_print_source(struct pci_dev *dev, > struct aer_err_info *info, > found ? "" : " (no details found"); > } > > -void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) > +void aer_print_error(struct aer_err_info *info, int i) > { > - int layer, agent; > - int id = pci_dev_id(dev); > + struct pci_dev *dev; > + int layer, agent, id; > const char *level = info->level; > > + if (i >= AER_MAX_MULTI_ERR_DEVICES) > + return;
Are these OoB checks actually indication of a logic error in the caller side which would perhaps warrant using if (WARN_ON_ONCE(i >= AER_MAX_MULTI_ERR_DEVICES)) ? Reviewed-by: Ilpo Järvinen <ilpo.jarvi...@linux.intel.com> > + > + dev = info->dev[i]; > + id = pci_dev_id(dev); > + > pci_dev_aer_stats_incr(dev, info); > trace_aer_event(pci_name(dev), (info->status & ~info->mask), > info->severity, info->tlp_header_valid, &info->tlp); > @@ -1193,19 +1199,26 @@ EXPORT_SYMBOL_GPL(aer_recover_queue); > > /** > * aer_get_device_error_info - read error status from dev and store it to > info > - * @dev: pointer to the device expected to have an error record > * @info: pointer to structure to store the error record > + * @i: index into info->dev[] > * > * Return: 1 on success, 0 on error. > * > * Note that @info is reused among all error devices. Clear fields properly. > */ > -int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) > +int aer_get_device_error_info(struct aer_err_info *info, int i) > { > - int type = pci_pcie_type(dev); > - int aer = dev->aer_cap; > + struct pci_dev *dev; > + int type, aer; > u32 aercc; > > + if (i >= AER_MAX_MULTI_ERR_DEVICES) > + return 0; > + > + dev = info->dev[i]; > + aer = dev->aer_cap; > + type = pci_pcie_type(dev); > + > /* Must reset in this function */ > info->status = 0; > info->tlp_header_valid = 0; > @@ -1257,11 +1270,11 @@ static inline void aer_process_err_devices(struct > aer_err_info *e_info) > > /* Report all before handling them, to not lose records by reset etc. */ > for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) { > - if (aer_get_device_error_info(e_info->dev[i], e_info)) > - aer_print_error(e_info->dev[i], e_info); > + if (aer_get_device_error_info(e_info, i)) > + aer_print_error(e_info, i); > } > for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) { > - if (aer_get_device_error_info(e_info->dev[i], e_info)) > + if (aer_get_device_error_info(e_info, i)) > handle_error_source(e_info->dev[i], e_info); > } > } > diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c > index 7ae1590ea1da..fc18349614d7 100644 > --- a/drivers/pci/pcie/dpc.c > +++ b/drivers/pci/pcie/dpc.c > @@ -253,6 +253,10 @@ static int dpc_get_aer_uncorrect_severity(struct pci_dev > *dev, > info->severity = AER_NONFATAL; > > info->level = KERN_ERR; > + > + info->dev[0] = dev; > + info->error_dev_num = 1; > + > return 1; > } > > @@ -270,8 +274,8 @@ void dpc_process_error(struct pci_dev *pdev) > pci_warn(pdev, "containment event, status:%#06x: unmasked > uncorrectable error detected\n", > status); > if (dpc_get_aer_uncorrect_severity(pdev, &info) && > - aer_get_device_error_info(pdev, &info)) { > - aer_print_error(pdev, &info); > + aer_get_device_error_info(&info, 0)) { > + aer_print_error(&info, 0); > pci_aer_clear_nonfatal_status(pdev); > pci_aer_clear_fatal_status(pdev); > } > -- i.