Hello Jon, Bjorn, On Thu, May 22, 2025 at 06:21:24PM -0500, Bjorn Helgaas wrote: > @@ -790,6 +818,9 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, > trace_aer_event(pci_name(dev), (status & ~mask), > aer_severity, tlp_header_valid, &aer->header_log); > > + if (!aer_ratelimit(dev, info.severity)) > + return;
I am seeing a kernel NULL pointer dereference in aer_ratelimit(), where dev->aer_info is NULL. This is happening on the final 6.16 commit in Linus' tree. Here is the dmesg: {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 0 {1}[Hardware Error]: It has been corrected by h/w and requires no further action {1}[Hardware Error]: event severity: corrected {1}[Hardware Error]: Error 0, type: corrected {1}[Hardware Error]: section_type: PCIe error {1}[Hardware Error]: port_type: 4, root port {1}[Hardware Error]: version: 3.0 {1}[Hardware Error]: command: 0x0540, status: 0x0010 {1}[Hardware Error]: device_id: 0000:00:00.0 {1}[Hardware Error]: slot: 0 {1}[Hardware Error]: secondary_bus: 0x00 {1}[Hardware Error]: vendor_id: 0x8086, device_id: 0x2020 {1}[Hardware Error]: class_code: 060000 {1}[Hardware Error]: aer_cor_status: 0x00001000, aer_cor_mask: 0x00002000 {1}[Hardware Error]: aer_uncor_status: 0x00000000, aer_uncor_mask: 0x00100000 {1}[Hardware Error]: aer_uncor_severity: 0x000e3030 {1}[Hardware Error]: TLP Header: 00000000 00000000 00000000 00000000 BUG: kernel NULL pointer dereference, address: 0000000000000264 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD d4c32b067 P4D d4c32b067 PUD 4e4b84067 PMD 0 Oops: Oops: 0000 [#1] SMP CPU: 0 UID: 0 PID: 1553927 Comm: kworker/0:0 Kdump: loaded Tainted: G S E 6.16.0-0_fbk0_rc1_0_geb76fb6facf5 #1 NONE Tainted: [S]=CPU_OUT_OF_SPEC, [E]=UNSIGNED_MODULE Hardware name: Quanta Delta Lake MP 29F0EMA01C0/Delta Lake-Class1, BIOS F0E_3A21 06/27/2024 Workqueue: events aer_recover_work_func RIP: 0010:___ratelimit+0xc/0x1b0 Code: 48 8b 3d ef aa 59 02 e8 72 75 31 ff ff 8b 10 78 54 84 75 da 31 c0 5b c3 cc cc cc cc cc cc 55 41 57 41 56 41 54 53 50 48 89 fb <4c> 63 7f 04 4d 85 ff 0f 9e c0 8b 6f 08 85 ed 0f 9e c1 08 c1 80 f9 RSP: 0018:ffffc90028337d10 EFLAGS: 00010206 RAX: ffffc900011f9174 RBX: 0000000000000260 RCX: 0000000000000000 RDX: 0000000000001000 RSI: ffffffff827ea3c0 RDI: 0000000000000260 
RBP: 0000000000000002 R08: 8080808080808080 R09: fefefefefefefeff R10: 000073746e657665 R11: 8080000000000000 R12: 0000000000001000 R13: 0000000000000002 R14: ffff8882877da000 R15: ffffc900011f9158 FS: 0000000000000000(0000) GS:ffff8890b1cf9000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000264 CR3: 00000003cee6a004 CR4: 00000000007726f0 PKRU: 55555554 Call Trace: <TASK> pci_print_aer+0x141/0x360 aer_recover_work_func+0xb5/0x130 process_scheduled_works+0x1a4/0x360 worker_thread+0x2df/0x3b0 kthread+0x1d2/0x1f0 ? pr_cont_work+0x1c0/0x1c0 ? finish_task_switch+0x213/0x2e0 ? kthread_blkcg+0x30/0x30 ret_from_fork+0x69/0xe0 ? kthread_blkcg+0x30/0x30 ret_from_fork_asm+0x11/0x20 </TASK> Here is the decoded stack from the crash dump: >>> trace #0 ___ratelimit (lib/ratelimit.c:33:17) #1 aer_ratelimit (drivers/pci/pcie/aer.c:0:2) #2 pci_print_aer (drivers/pci/pcie/aer.c:923:7) #3 aer_recover_work_func (drivers/pci/pcie/aer.c:1298:3) #4 process_one_work (kernel/workqueue.c:3238:2) #5 process_scheduled_works (kernel/workqueue.c:3321:3) #6 worker_thread (kernel/workqueue.c:3402:4) #7 kthread (kernel/kthread.c:464:9) #8 ret_from_fork (arch/x86/kernel/process.c:148:3) #9 ret_from_fork_asm+0x11/0x16 (arch/x86/entry/entry_64.S:245) And aer_info is NULL: >>> trace[2]['dev'].aer_info (struct aer_info *)0x0 So, somehow the PCI device was released at this point? 
Here is the state of the device at the crash time: >>> trace[2]['dev'] *(struct pci_dev *)0xffff8882877da000 = { .bus_list = (struct list_head){ .next = (struct list_head *)0xffff8882877db000, .prev = (struct list_head *)0xffff888287674428, }, .bus = (struct pci_bus *)0xffff888287674400, .subordinate = (struct pci_bus *)0x0, .sysdata = (void *)0xffff888284c94b38, .procent = (struct proc_dir_entry *)0xffff88828a515980, .slot = (struct pci_slot *)0x0, .devfn = (unsigned int)0, .vendor = (unsigned short)32902, .device = (unsigned short)8224, .subsystem_vendor = (unsigned short)32902, .subsystem_device = (unsigned short)0, .class = (unsigned int)393216, .revision = (u8)11, .hdr_type = (u8)0, .aer_cap = (u16)0, .aer_info = (struct aer_info *)0x0, .rcec_ea = (struct rcec_ea *)0x0, .rcec = (struct pci_dev *)0x0, .devcap = (u32)32768, .rebar_cap = (u16)0, .pcie_cap = (u8)144, .msi_cap = (u8)0, .msix_cap = (u8)0, .pcie_mpss = (u8)0, .rom_base_reg = (u8)48, .pin = (u8)1, .pcie_flags_reg = (u16)66, .dma_alias_mask = (unsigned long *)0x0, .driver = (struct pci_driver *)0x0, .dma_mask = (u64)4294967295, .dma_parms = (struct device_dma_parameters){ .max_segment_size = (unsigned int)65536, .min_align_mask = (unsigned int)0, .segment_boundary_mask = (unsigned long)4294967295, }, .current_state = (pci_power_t)0, .pm_cap = (u8)224, .pme_support = (unsigned int)0, .pme_poll = (unsigned int)0, .pinned = (unsigned int)0, .config_rrs_sv = (unsigned int)0, .imm_ready = (unsigned int)0, .d1_support = (unsigned int)0, .d2_support = (unsigned int)0, .no_d1d2 = (unsigned int)0, .no_d3cold = (unsigned int)0, .bridge_d3 = (unsigned int)1, .d3cold_allowed = (unsigned int)1, .mmio_always_on = (unsigned int)1, .wakeup_prepared = (unsigned int)0, .skip_bus_pm = (unsigned int)0, .ignore_hotplug = (unsigned int)0, .hotplug_user_indicators = (unsigned int)0, .clear_retrain_link = (unsigned int)0, .d3hot_delay = (unsigned int)10, .d3cold_delay = (unsigned int)100, .l1ss = (u16)0, .link_state = (struct 
pcie_link_state *)0x0, .ltr_path = (unsigned int)0, .pasid_no_tlp = (unsigned int)0, .eetlp_prefix_max = (unsigned int)0, .error_state = (pci_channel_state_t)1, .dev = (struct device){ .kobj = (struct kobject){ .name = (const char *)0xffff888287645e50 = "0000:00:00.0", .entry = (struct list_head){ .next = (struct list_head *)0xffff8882877db0d0, .prev = (struct list_head *)0xffff888287674520, }, .parent = (struct kobject *)0xffff888287674000, .kset = (struct kset *)0xffff888281f19240, .ktype = (const struct kobj_type *)device_ktype+0x0 = 0xffffffff82a41fd0, .sd = (struct kernfs_node *)0xffff888287859220, .kref = (struct kref){ .refcount = (refcount_t){ .refs = (atomic_t){ .counter = (int)5, }, }, }, .state_initialized = (unsigned int)1, .state_in_sysfs = (unsigned int)1, .state_add_uevent_sent = (unsigned int)1, .state_remove_uevent_sent = (unsigned int)0, .uevent_suppress = (unsigned int)0, }, .parent = (struct device *)0xffff888287674000, .p = (struct device_private *)0xffff88828771e000, .init_name = (const char *)0x0, .type = (const struct device_type *)pci_dev_type+0x0 = 0xffffffff82a03fe0, .bus = (const struct bus_type *)pci_bus_type+0x0 = 0xffffffff82a057f8, .driver = (struct device_driver *)0x0, .platform_data = (void *)0x0, .driver_data = (void *)0x0, .mutex = (struct mutex){ .owner = (atomic_long_t){ .counter = (s64)0, }, .wait_lock = (raw_spinlock_t){ .raw_lock = (arch_spinlock_t){ .val = (atomic_t){ .counter = (int)0, }, .locked = (u8)0, .pending = (u8)0, .locked_pending = (u16)0, .tail = (u16)0, }, }, .osq = (struct optimistic_spin_queue){ .tail = (atomic_t){ .counter = (int)0, }, }, .wait_list = (struct list_head){ .next = (struct list_head *)0xffff8882877da158, .prev = (struct list_head *)0xffff8882877da158, }, }, .links = (struct dev_links_info){ .suppliers = (struct list_head){ .next = (struct list_head *)0xffff8882877da168, .prev = (struct list_head *)0xffff8882877da168, }, .consumers = (struct list_head){ .next = (struct list_head 
*)0xffff8882877da178, .prev = (struct list_head *)0xffff8882877da178, }, .defer_sync = (struct list_head){ .next = (struct list_head *)0xffff8882877da188, .prev = (struct list_head *)0xffff8882877da188, }, .status = (enum dl_dev_state)DL_DEV_NO_DRIVER, }, .power = (struct dev_pm_info){ .power_state = (pm_message_t){ .event = (int)0, }, .can_wakeup = (bool)0, .async_suspend = (bool)1, .in_dpm_list = (bool)0, .is_prepared = (bool)0, .is_suspended = (bool)0, .is_noirq_suspended = (bool)0, .is_late_suspended = (bool)0, .no_pm = (bool)0, .early_init = (bool)1, .direct_complete = (bool)0, .driver_flags = (u32)0, .lock = (spinlock_t){ .rlock = (struct raw_spinlock){ .raw_lock = (arch_spinlock_t){ .val = (atomic_t){ .counter = (int)0, }, .locked = (u8)0, .pending = (u8)0, .locked_pending = (u16)0, .tail = (u16)0, }, }, }, .should_wakeup = (bool)0, .subsys_data = (struct pm_subsys_data *)0x0, .set_latency_tolerance = (void (*)(struct device *, s32))0x0, .qos = (struct dev_pm_qos *)0x0, }, .pm_domain = (struct dev_pm_domain *)0x0, .msi = (struct dev_msi_info){ .domain = (struct irq_domain *)0xffff888281000000, .data = (struct msi_device_data *)0x0, }, .dma_mask = (u64 *)0xffff8882877da088, .coherent_dma_mask = (u64)4294967295, .bus_dma_limit = (u64)0, .dma_range_map = (const struct bus_dma_region *)0x0, .dma_parms = (struct device_dma_parameters *)0xffff8882877da090, .dma_pools = (struct list_head){ .next = (struct list_head *)0xffff8882877da210, .prev = (struct list_head *)0xffff8882877da210, }, .cma_area = (struct cma *)0x0, .dma_io_tlb_mem = (struct io_tlb_mem *)io_tlb_default_mem.llvm.9097096581342952386+0x0 = 0xffffffff8477f850, .archdata = (struct dev_archdata){}, .of_node = (struct device_node *)0x0, .fwnode = (struct fwnode_handle *)0xffff888287484010, .numa_node = (int)0, .devt = (dev_t)0, .id = (u32)0, .devres_lock = (spinlock_t){ .rlock = (struct raw_spinlock){ .raw_lock = (arch_spinlock_t){ .val = (atomic_t){ .counter = (int)0, }, .locked = (u8)0, .pending = 
(u8)0, .locked_pending = (u16)0, .tail = (u16)0, }, }, }, .devres_head = (struct list_head){ .next = (struct list_head *)0xffff8882877da250, .prev = (struct list_head *)0xffff8882877da250, }, .class = (const struct class *)0x0, .groups = (const struct attribute_group **)0x0, .release = (void (*)(struct device *))pci_release_dev+0x0 = 0xffffffff81b11680, .iommu_group = (struct iommu_group *)0x0, .iommu = (struct dev_iommu *)0x0, .physical_location = (struct device_physical_location *)0x0, .removable = (enum device_removable)DEVICE_REMOVABLE_NOT_SUPPORTED, .offline_disabled = (bool)0, .offline = (bool)0, .of_node_reused = (bool)0, .state_synced = (bool)0, .can_match = (bool)0, .dma_skip_sync = (bool)0, .dma_iommu = (bool)0, }, .cfg_size = (int)4096, .irq = (unsigned int)0, .resource = (struct resource [17]){ { .start = (resource_size_t)0, .end = (resource_size_t)0, .name = (const char *)0xffff888287645e50 = "0000:00:00.0", .flags = (unsigned long)0, .desc = (unsigned long)0, .parent = (struct resource *)0x0, .sibling = (struct resource *)0x0, .child = (struct resource *)0x0, }, { .start = (resource_size_t)0, .end = (resource_size_t)0, .name = (const char *)0xffff888287645e50 = "0000:00:00.0", .flags = (unsigned long)0, .desc = (unsigned long)0, .parent = (struct resource *)0x0, .sibling = (struct resource *)0x0, .child = (struct resource *)0x0, }, { .start = (resource_size_t)0, .end = (resource_size_t)0, .name = (const char *)0xffff888287645e50 = "0000:00:00.0", .flags = (unsigned long)0, .desc = (unsigned long)0, .parent = (struct resource *)0x0, .sibling = (struct resource *)0x0, .child = (struct resource *)0x0, }, { .start = (resource_size_t)0, .end = (resource_size_t)0, .name = (const char *)0xffff888287645e50 = "0000:00:00.0", .flags = (unsigned long)0, .desc = (unsigned long)0, .parent = (struct resource *)0x0, .sibling = (struct resource *)0x0, .child = (struct resource *)0x0, }, { .start = (resource_size_t)0, .end = (resource_size_t)0, .name = (const char 
*)0xffff888287645e50 = "0000:00:00.0", .flags = (unsigned long)0, .desc = (unsigned long)0, .parent = (struct resource *)0x0, .sibling = (struct resource *)0x0, .child = (struct resource *)0x0, }, { .start = (resource_size_t)0, .end = (resource_size_t)0, .name = (const char *)0xffff888287645e50 = "0000:00:00.0", .flags = (unsigned long)0, .desc = (unsigned long)0, .parent = (struct resource *)0x0, .sibling = (struct resource *)0x0, .child = (struct resource *)0x0, }, { .start = (resource_size_t)0, .end = (resource_size_t)0, .name = (const char *)0xffff888287645e50 = "0000:00:00.0", .flags = (unsigned long)0, .desc = (unsigned long)0, .parent = (struct resource *)0x0, .sibling = (struct resource *)0x0, .child = (struct resource *)0x0, }, }, .driver_exclusive_resource = (struct resource){ .start = (resource_size_t)0, .end = (resource_size_t)18446744073709551615, .name = (const char *)0xffffffff82901265 = "PCI Exclusive", .flags = (unsigned long)0, .desc = (unsigned long)0, .parent = (struct resource *)0x0, .sibling = (struct resource *)0x0, .child = (struct resource *)0x0, }, .transparent = (unsigned int)0, .io_window = (unsigned int)0, .pref_window = (unsigned int)0, .pref_64_window = (unsigned int)0, .multifunction = (unsigned int)0, .is_busmaster = (unsigned int)0, .no_msi = (unsigned int)0, .no_64bit_msi = (unsigned int)0, .block_cfg_access = (unsigned int)0, .broken_parity_status = (unsigned int)0, .irq_reroute_variant = (unsigned int)0, .msi_enabled = (unsigned int)0, .msix_enabled = (unsigned int)0, .ari_enabled = (unsigned int)0, .ats_enabled = (unsigned int)0, .pasid_enabled = (unsigned int)0, .pri_enabled = (unsigned int)0, .tph_enabled = (unsigned int)0, .is_managed = (unsigned int)0, .is_msi_managed = (unsigned int)0, .needs_freset = (unsigned int)0, .state_saved = (unsigned int)0, .is_physfn = (unsigned int)0, .is_virtfn = (unsigned int)0, .is_hotplug_bridge = (unsigned int)0, .shpc_managed = (unsigned int)0, .is_thunderbolt = (unsigned int)0, .untrusted 
= (unsigned int)0, .external_facing = (unsigned int)0, .broken_intx_masking = (unsigned int)0, .io_window_1k = (unsigned int)0, .irq_managed = (unsigned int)0, .non_compliant_bars = (unsigned int)0, .is_probed = (unsigned int)0, .link_active_reporting = (unsigned int)1, .no_vf_scan = (unsigned int)0, .no_command_memory = (unsigned int)0, .rom_bar_overlap = (unsigned int)0, .rom_attr_enabled = (unsigned int)0, .non_mappable_bars = (unsigned int)0, .dev_flags = (pci_dev_flags_t)0, .enable_cnt = (atomic_t){ .counter = (int)0, }, .pcie_cap_lock = (spinlock_t){ .rlock = (struct raw_spinlock){ .raw_lock = (arch_spinlock_t){ .val = (atomic_t){ .counter = (int)0, }, .locked = (u8)0, .pending = (u8)0, .locked_pending = (u16)0, .tail = (u16)0, }, }, }, .saved_config_space = (u32 [16]){}, .saved_cap_space = (struct hlist_head){ .first = (struct hlist_node *)0xffff8882867cf300, }, .res_attr = (struct bin_attribute *[17]){}, .res_attr_wc = (struct bin_attribute *[17]){}, .broken_cmd_compl = (unsigned int)0, .ptm_cap = (u16)0, .ptm_root = (unsigned int)0, .ptm_enabled = (unsigned int)0, .ptm_granularity = (u8)0, .msix_base = (void *)0x0, .msi_lock = (raw_spinlock_t){ .raw_lock = (arch_spinlock_t){ .val = (atomic_t){ .counter = (int)0, }, .locked = (u8)0, .pending = (u8)0, .locked_pending = (u16)0, .tail = (u16)0, }, }, .vpd = (struct pci_vpd){ .lock = (struct mutex){ .owner = (atomic_long_t){ .counter = (s64)0, }, .wait_lock = (raw_spinlock_t){ .raw_lock = (arch_spinlock_t){ .val = (atomic_t){ .counter = (int)0, }, .locked = (u8)0, .pending = (u8)0, .locked_pending = (u16)0, .tail = (u16)0, }, }, .osq = (struct optimistic_spin_queue){ .tail = (atomic_t){ .counter = (int)0, }, }, .wait_list = (struct list_head){ .next = (struct list_head *)0xffff8882877da8b0, .prev = (struct list_head *)0xffff8882877da8b0, }, }, .len = (unsigned int)0, .cap = (u8)0, }, .dpc_cap = (u16)0, .dpc_rp_extensions = (unsigned int)0, .dpc_rp_log_size = (u8)0, .link_bwctrl = (struct pcie_bwctrl_data *)0x0, 
.sriov = (struct pci_sriov *)0x0, .physfn = (struct pci_dev *)0x0, .ats_cap = (u16)0, .ats_stu = (u8)0, .pri_cap = (u16)0, .pri_reqs_alloc = (u32)0, .pasid_required = (unsigned int)0, .pasid_cap = (u16)0, .pasid_features = (u16)0, .p2pdma = (struct pci_p2pdma *)0x0, .doe_mbs = (struct xarray){ .xa_lock = (spinlock_t){ .rlock = (struct raw_spinlock){ .raw_lock = (arch_spinlock_t){ .val = (atomic_t){ .counter = (int)0, }, .locked = (u8)0, .pending = (u8)0, .locked_pending = (u16)0, .tail = (u16)0, }, }, }, .xa_flags = (gfp_t)0, .xa_head = (void *)0x0, }, .acs_cap = (u16)0, .supported_speeds = (u8)14, .rom = (phys_addr_t)0, .romlen = (size_t)0, .driver_override = (const char *)0x0, .priv_flags = (unsigned long)129, .reset_methods = (u8 [8]){}, }