Hi Zhanpeng,

> 
> In order to use SSE within PMU drivers, register an SSE handler for the
> local PMU event. Reuse the existing overflow IRQ handler and pass
> appropriate pt_regs. Add a config option RISCV_PMU_SBI_SSE to select event
> delivery via SSE events.
> 
> When the SSE path is used, also honor the return value from
> perf_event_overflow(). If perf core throttles or disables an event, do not
> immediately restart the overflowed counters from the SSE handler.
> 
> Signed-off-by: Clément Léger <[email protected]>
> Co-developed-by: Zhanpeng Zhang <[email protected]>
> Signed-off-by: Zhanpeng Zhang <[email protected]>
> ---
>  drivers/perf/Kconfig           | 10 +++++
>  drivers/perf/riscv_pmu.c       | 23 +++++++++++
>  drivers/perf/riscv_pmu_sbi.c   | 78 ++++++++++++++++++++++++++++++++-----
>  include/linux/perf/riscv_pmu.h |  5 +++
>  4 files changed, 104 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
> index ab90932fc2d0..b6c58475091c 100644
> --- a/drivers/perf/Kconfig
> +++ b/drivers/perf/Kconfig
> @@ -105,6 +105,16 @@ config RISCV_PMU_SBI
>         full perf feature support i.e. counter overflow, privilege mode
>         filtering, counter configuration.
>  
> +config RISCV_PMU_SBI_SSE
> +     depends on RISCV_PMU && RISCV_SBI_SSE
> +     bool "RISC-V PMU SSE events"
> +     default n
> +     help
> +       Say y if you want to use SSE events to deliver PMU interrupts. This
> +       provides a way to profile the kernel at any level by using NMI-like
> +       SSE events. Since SSE events can be intrusive, this option allows
> +       selecting them only when needed.
> +
>  config STARFIVE_STARLINK_PMU
>       depends on ARCH_STARFIVE || COMPILE_TEST
>       depends on 64BIT
> diff --git a/drivers/perf/riscv_pmu.c b/drivers/perf/riscv_pmu.c
> index 8e3cd0f35336..a48e286d9394 100644
> --- a/drivers/perf/riscv_pmu.c
> +++ b/drivers/perf/riscv_pmu.c
> @@ -13,6 +13,7 @@
>  #include <linux/irqdesc.h>
>  #include <linux/perf/riscv_pmu.h>
>  #include <linux/printk.h>
> +#include <linux/riscv_sbi_sse.h>
>  #include <linux/smp.h>
>  #include <linux/sched_clock.h>
>  
> @@ -254,6 +255,24 @@ void riscv_pmu_start(struct perf_event *event, int flags)
>       perf_event_update_userpage(event);
>  }
>  
> +#ifdef CONFIG_RISCV_PMU_SBI_SSE
> +static void riscv_pmu_disable(struct pmu *pmu)
> +{
> +     struct riscv_pmu *rvpmu = to_riscv_pmu(pmu);
> +
> +     if (rvpmu->sse_evt)
> +             sse_event_disable_local(rvpmu->sse_evt);
> +}
> +
> +static void riscv_pmu_enable(struct pmu *pmu)
> +{
> +     struct riscv_pmu *rvpmu = to_riscv_pmu(pmu);
> +
> +     if (rvpmu->sse_evt)
> +             sse_event_enable_local(rvpmu->sse_evt);
> +}
> +#endif
> +
>  static int riscv_pmu_add(struct perf_event *event, int flags)
>  {
>       struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
> @@ -411,6 +430,10 @@ struct riscv_pmu *riscv_pmu_alloc(void)
>               .event_mapped   = riscv_pmu_event_mapped,
>               .event_unmapped = riscv_pmu_event_unmapped,
>               .event_idx      = riscv_pmu_event_idx,
> +#ifdef CONFIG_RISCV_PMU_SBI_SSE
> +             .pmu_enable     = riscv_pmu_enable,
> +             .pmu_disable    = riscv_pmu_disable,
> +#endif
>               .add            = riscv_pmu_add,
>               .del            = riscv_pmu_del,
>               .start          = riscv_pmu_start,
> diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
> index 385af5e6e6d0..ac10ebd73c7c 100644
> --- a/drivers/perf/riscv_pmu_sbi.c
> +++ b/drivers/perf/riscv_pmu_sbi.c
> @@ -17,6 +17,7 @@
>  #include <linux/irqdomain.h>
>  #include <linux/of_irq.h>
>  #include <linux/of.h>
> +#include <linux/riscv_sbi_sse.h>
>  #include <linux/cpu_pm.h>
>  #include <linux/sched/clock.h>
>  #include <linux/soc/andes/irq.h>
> @@ -1038,10 +1039,10 @@ static void pmu_sbi_start_overflow_mask(struct 
> riscv_pmu *pmu,
>               pmu_sbi_start_ovf_ctrs_sbi(cpu_hw_evt, ctr_ovf_mask);
>  }
>  
> -static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
> +static irqreturn_t pmu_sbi_ovf_handler(struct cpu_hw_events *cpu_hw_evt,
> +                                    struct pt_regs *regs, bool from_sse)
>  {
>       struct perf_sample_data data;
> -     struct pt_regs *regs;
>       struct hw_perf_event *hw_evt;
>       union sbi_pmu_ctr_info *info;
>       int lidx, hidx, fidx;
> @@ -1049,7 +1050,7 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void 
> *dev)
>       struct perf_event *event;
> +     int ev_overflow = 0;
>       u64 overflow;
>       u64 overflowed_ctrs = 0;
> -     struct cpu_hw_events *cpu_hw_evt = dev;
>       u64 start_clock = sched_clock();
>       struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr;
>  
> @@ -1059,13 +1059,15 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void 
> *dev)
>       /* Firmware counter don't support overflow yet */
>       fidx = find_first_bit(cpu_hw_evt->used_hw_ctrs, RISCV_MAX_COUNTERS);
>       if (fidx == RISCV_MAX_COUNTERS) {
> -             csr_clear(CSR_SIP, BIT(riscv_pmu_irq_num));
> +             if (!from_sse)
> +                     csr_clear(CSR_SIP, BIT(riscv_pmu_irq_num));
>               return IRQ_NONE;
>       }
>  
>       event = cpu_hw_evt->events[fidx];
>       if (!event) {
> -             ALT_SBI_PMU_OVF_CLEAR_PENDING(riscv_pmu_irq_mask);
> +             if (!from_sse)
> +                     ALT_SBI_PMU_OVF_CLEAR_PENDING(riscv_pmu_irq_mask);
>               return IRQ_NONE;
>       }
>  
> @@ -1080,16 +1082,16 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void 
> *dev)
>  
>       /*
>        * Overflow interrupt pending bit should only be cleared after stopping
> -      * all the counters to avoid any race condition.
> +      * all the counters to avoid any race condition. When using SSE,
> +      * interrupt is cleared when stopping counters.
>        */
> -     ALT_SBI_PMU_OVF_CLEAR_PENDING(riscv_pmu_irq_mask);
> +     if (!from_sse)
> +             ALT_SBI_PMU_OVF_CLEAR_PENDING(riscv_pmu_irq_mask);
>  
>       /* No overflow bit is set */
>       if (!overflow)
>               return IRQ_NONE;
>  
> -     regs = get_irq_regs();
> -
>       for_each_set_bit(lidx, cpu_hw_evt->used_hw_ctrs, RISCV_MAX_COUNTERS) {
>               struct perf_event *event = cpu_hw_evt->events[lidx];
>  
> @@ -1133,18 +1136,65 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void 
> *dev)
>                        * TODO: We will need to stop the guest counters once
>                        * virtualization support is added.
>                        */
> -                     perf_event_overflow(event, &data, regs);
> +                     ev_overflow |= perf_event_overflow(event, &data, regs);
>               }
>               /* Reset the state as we are going to start the counter after 
> the loop */
>               hw_evt->state = 0;
>       }
>  
> -     pmu_sbi_start_overflow_mask(pmu, overflowed_ctrs);
> +     if (!ev_overflow || !from_sse)
> +             pmu_sbi_start_overflow_mask(pmu, overflowed_ctrs);
> +
>       perf_sample_event_took(sched_clock() - start_clock);
>  
>       return IRQ_HANDLED;
>  }
>  
> +static irqreturn_t pmu_sbi_ovf_irq_handler(int irq, void *dev)
> +{
> +     return pmu_sbi_ovf_handler(dev, get_irq_regs(), false);
> +}
> +
> +#ifdef CONFIG_RISCV_PMU_SBI_SSE
> +static int pmu_sbi_ovf_sse_handler(u32 evt, void *arg, struct pt_regs *regs)
> +{
> +     struct cpu_hw_events __percpu *hw_events = arg;
> +     struct cpu_hw_events *hw_event = raw_cpu_ptr(hw_events);
> +
> +     pmu_sbi_ovf_handler(hw_event, regs, true);
> +
> +     return 0;
> +}
> +
> +static int pmu_sbi_setup_sse(struct riscv_pmu *pmu)
> +{
> +     int ret;
> +     struct sse_event *evt;
> +     struct cpu_hw_events __percpu *hw_events = pmu->hw_events;
> +
> +     evt = sse_event_register(SBI_SSE_EVENT_LOCAL_PMU_OVERFLOW, 0,
> +                              pmu_sbi_ovf_sse_handler, hw_events);
> +     if (IS_ERR(evt))
> +             return PTR_ERR(evt);
> +
> +     ret = sse_event_enable(evt);
> +     if (ret) {
> +             sse_event_unregister(evt);
> +             return ret;
> +     }
> +
> +     pr_info("using SSE for PMU event delivery\n");
> +     pmu->sse_evt = evt;
> +
> +     return ret;
> +}
> +#else
> +static int pmu_sbi_setup_sse(struct riscv_pmu *pmu)
> +{
> +     return -EOPNOTSUPP;
> +}
> +#endif
> +
>  static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node)
>  {
>       struct riscv_pmu *pmu = hlist_entry_safe(node, struct riscv_pmu, node);
> @@ -1195,6 +1242,10 @@ static int pmu_sbi_setup_irqs(struct riscv_pmu *pmu, 
> struct platform_device *pde
>       struct cpu_hw_events __percpu *hw_events = pmu->hw_events;
>       struct irq_domain *domain = NULL;
>  
> +     ret = pmu_sbi_setup_sse(pmu);
> +     if (!ret)
> +             return 0;
> +
>       if (riscv_isa_extension_available(NULL, SSCOFPMF)) {
>               riscv_pmu_irq_num = RV_IRQ_PMU;
>               riscv_pmu_use_irq = true;
> @@ -1229,7 +1280,7 @@ static int pmu_sbi_setup_irqs(struct riscv_pmu *pmu, 
> struct platform_device *pde
>               return -ENODEV;
>       }
>  
> -     ret = request_percpu_irq(riscv_pmu_irq, pmu_sbi_ovf_handler, 
> "riscv-pmu", hw_events);
> +     ret = request_percpu_irq(riscv_pmu_irq, pmu_sbi_ovf_irq_handler, 
> "riscv-pmu", hw_events);
>       if (ret) {
>               pr_err("registering percpu irq failed [%d]\n", ret);
>               return ret;
> diff --git a/include/linux/perf/riscv_pmu.h b/include/linux/perf/riscv_pmu.h
> index f82a28040594..08fdcf6baf4e 100644
> --- a/include/linux/perf/riscv_pmu.h
> +++ b/include/linux/perf/riscv_pmu.h
> @@ -28,6 +28,8 @@
>  
>  #define RISCV_PMU_CONFIG1_GUEST_EVENTS 0x1
>  
> +struct sse_event;
> +
>  struct cpu_hw_events {
>       /* currently enabled events */
>       int                     n_events;
> @@ -54,6 +56,9 @@ struct riscv_pmu {
>       char            *name;
>  
>       irqreturn_t     (*handle_irq)(int irq_num, void *dev);
> +#ifdef CONFIG_RISCV_PMU_SBI_SSE
> +     struct sse_event *sse_evt;
> +#endif
>  
>       unsigned long   cmask;
>       u64             (*ctr_read)(struct perf_event *event);
> -- 
> 2.50.1 (Apple Git-155)

We found that with RISCV_PMU_SBI_SSE enabled, running 'perf record -g
-F 999 ls' occasionally fails with a kernel oops. With a more complex
workload (e.g. x264), the failure reproduces 100% of the time. With
RISCV_PMU_SBI_SSE disabled, the same commands run normally, so we
suspect an issue in the PMU_SBI_SSE functionality. Have you seen this
problem? Do you have any troubleshooting suggestions?
Below is some fault information:
[root@localhost ~]# perf record -g -F 999 ls
[ 9478.867438] BUG: spinlock bad magic on CPU#2, ls/1046
[ 9478.871534] Unable to handle kernel paging request at virtual address 
ffffffc6004940d0
[ 9478.872804] Oops [#1]
[ 9478.873359] Modules linked in: xt_MASQUERADE xfrm_user xfrm_algo iptable_nat 
xt_addrtype
iptable_filter ip_tables xt_conntrack x_tables nf_nat nf_conntrack 
nf_defrag_ipv6 nf_defrag_ipv4
libcrc32c br_netfilter bridge stp llc overlay virtio_gpu virtio_dma_buf 
drm_shmem_helper
drm_client_lib drm_kms_helper nls_iso8859_1 drm drm_panel_orientation_quirks 
backlight configfs efivarfs
[ 9478.877513] CPU: 2 UID: 0 PID: 1046 Comm: ls Not tainted 
6.13.0-00006-ge94d0b11d044-dirty #7
[ 9478.878544] Hardware name: QEMU QEMU Virtual Machine, BIOS 2.7 2/2/2022
[ 9478.879324] epc : string+0x48/0xe6
[ 9478.880100]  ra : vsnprintf+0xea/0x3b4
[ 9478.880697] epc : ffffffff809e39ae ra : ffffffff809e656e sp : 
ffffffc600493360
[ 9478.881331]  gp : ffffffff81516480 tp : ffffffd68d280000 t0 : 
ffffffc600493312
[ 9478.881952]  t1 : ffffffffffffffff t2 : 6e697073203a4755 s0 : 
ffffffc6004933a0
[ 9478.882557]  s1 : ffffffffffffffff a0 : ffffffc6004934d7 a1 : 
0000000000000000
[ 9478.883355]  a2 : ffffffc6004934a8 a3 : ffffffc6004930d0 a4 : 
ffffffc6004940d0
[ 9478.883988]  a5 : ffffffc7004934d6 a6 : ffffffffffffe000 a7 : 
0000000000000004
[ 9478.884600]  s2 : ffffffc6004934d7 s3 : ffffffc6004934a8 s4 : 
ffffffff80f24ea4
[ 9478.885420]  s5 : ffffffff000000ff s6 : ffffffff8100153e s7 : 
0000000000ffffff
[ 9478.886056]  s8 : ffffffff8100153c s9 : 0000000000000007 s10: 
0000000000000002
[ 9478.886859]  s11: ffffffc6004935f0 t3 : 0000000000000004 t4 : 
ffffffff80e592d8
[ 9478.887493]  t5 : ffffffff8152cb20 t6 : ffffffc60049307a
[ 9478.887946] status: 0000000200000100 badaddr: ffffffc6004940d0 cause: 
000000000000000d
[ 9478.888886] [<ffffffff809e39ae>] string+0x48/0xe6
[ 9478.889342] [<ffffffff809e656e>] vsnprintf+0xea/0x3b4
[ 9478.889795] [<ffffffff8007c2ce>] vprintk_store+0x108/0x3d2
[ 9478.890459] [<ffffffff8007d0ce>] vprintk_emit+0x82/0x218
[ 9478.890932] [<ffffffff8007d27a>] vprintk_default+0x16/0x1e
[ 9478.891416] [<ffffffff8007e270>] vprintk+0x1e/0x3c
[ 9478.892011] [<ffffffff8000332a>] _printk+0x32/0x50
[ 9478.892441] [<ffffffff80002f84>] spin_dump+0x5e/0x6e
[ 9478.892865] [<ffffffff80077bf4>] do_raw_spin_unlock+0x130/0x132
[ 9478.893542] [<ffffffff809f50e0>] _raw_spin_unlock+0x10/0x22
[ 9478.894031] [<ffffffff80166118>] filemap_map_pages+0x314/0x434
[ 9478.894541] [<ffffffff8019f6ea>] __handle_mm_fault+0x9ac/0xd50
[ 9478.895556] Code: 97aa a809 7463 00c5 0023 00d5 0505 2585 0663 00f5 (4683) 
0007 
[ 9478.896729] ---[ end trace 0000000000000000 ]---
[ 9478.897637] note: ls[1046] exited with irqs disabled
[ 9478.898653] note: ls[1046] exited with preempt_count 4

[root@localhost x264]# perf record -g -F 999 ./x264 -o output_static.mkv 
./Kimono_1920x1080_24.yuv
--input-res 1920x1080 --fps 24 --preset faster --vbv-maxrate 2400 --vbv-bufsize 
4800 --ref 1
--aq-mode 2 --aq-strength 1.2 --qcomp 0.8 --lookahead-threads 12 --ipratio 1.0 
--bframes 3
--rc-lookahead 6 --crf 28 --keyint 60 --scenecut 0 --frames 200                 
                                                                                
     
[   63.382672] Unable to handle kernel paging request at virtual address 
0000003feb0ff5d8                                                               
[   63.383325] Oops [#1]                                                        
                                                                        
[   63.383395] Modules linked in: xt_MASQUERADE xfrm_user xfrm_algo iptable_nat 
xt_addrtype
iptable_filter ip_tables xt_conntrack x_tables nf_nat nf_conntrack 
nf_defrag_ipv6 nf_defrag_ipv4
libcrc32c br_netfilter bridge stp llc overlay virtio_gpu virtio_dma_buf 
drm_shmem_helper
drm_client_lib drm_kms_helper nls_iso8859_1 drm configfs 
drm_panel_orientation_quirks
backlight efivarfs                                                              
            
[   63.385401] CPU: 3 UID: 0 PID: 580 Comm: x264 Not tainted 
6.13.0-00006-ge94d0b11d044-dirty #7                                             
   
[   63.385521] Hardware name: QEMU QEMU Virtual Machine, BIOS 2.7 2/2/2022      
                                                                        
[   63.385732] epc : walk_stackframe+0x5c/0x11a                                 
                                                                        
[   63.386470]  ra : walk_stackframe+0xda/0x11a                                 
                                                                        
[   63.386537] epc : ffffffff80013b5c ra : ffffffff80013bda sp : 
ffffffc600493a40                                                                
       
[   63.386579]  gp : ffffffff81516480 tp : ffffffd684b33300 t0 : 
ffffffd7febf9c20                                                                
       
[   63.386617]  t1 : 000000003b9aca00 t2 : ffffffd7febe8080 s0 : 
ffffffc600493a90                                                                
       
[   63.386654]  s1 : ffffffc600493ee0 a0 : 0000000000000001 a1 : 
ffffffff809f5a48                                                                
       
[   63.386692]  a2 : 0000000000000002 a3 : 0000003feb0ff5d0 a4 : 
ffffffffffffc000                                                                
       
[   63.386729]  a5 : 0000000000003fff a6 : ffffffff81552558 a7 : 
ffffffd698fb8000
[   63.386769]  s2 : 0000003feb100000 s3 : 0000003feb0ff5e0 s4 : 
ffffffff8001886a
[   63.386806]  s5 : ffffffc600493ac0 s6 : ffffffff809f5a48 s7 : 
0000000000000000
[   63.386844]  s8 : ffffffc600493ac0 s9 : 0000000000000000 s10: 
000000000007fff8
[   63.386886]  s11: ffffffd698fb8000 t3 : 0000000000000015 t4 : 
00000000000003e7
[   63.386932]  t5 : 0000000000452bf5 t6 : ffffffc600493f70
[   63.386965] status: 0000000200000100 badaddr: 0000003feb0ff5d8 cause: 
000000000000000d
[   63.387069] [<ffffffff80013b5c>] walk_stackframe+0x5c/0x11a
[   63.387122] [<ffffffff80018902>] perf_callchain_kernel+0x28/0x34
[   63.387154] [<ffffffff80160294>] get_perf_callchain+0x88/0x18e
[   63.387189] [<ffffffff8015bcf2>] perf_callchain+0x52/0x6e
[   63.387219] [<ffffffff8015c0f4>] perf_prepare_sample+0x3e6/0x730
[   63.387249] [<ffffffff8015c8d6>] perf_event_output_forward+0x56/0xb8
[   63.387280] [<ffffffff8015c51a>] __perf_event_overflow+0xdc/0x2e2
[   63.387310] [<ffffffff8015d954>] perf_event_overflow+0x12/0x1a
[   63.387341] [<ffffffff807e15e0>] pmu_sbi_ovf_handler+0x684/0x736
[   63.387383] [<ffffffff807e16ee>] pmu_sbi_ovf_sse_handler+0x26/0x30
[   63.387417] [<ffffffff807b73a0>] sse_handle_event+0x16/0x48
[   63.387449] [<ffffffff8001933a>] do_sse+0x76/0xa8
[   63.387478] [<ffffffff800195ac>] handle_sse+0xc0/0x162
[   63.387748] Code: 7933 00e9 6463 0539 f693 0079 e2a1 e062 8693 ff09 (bc03) 
0086 
[   63.388072] ---[ end trace 0000000000000000 ]---
[   63.388547] Kernel panic - not syncing: Fatal exception in interrupt
[   63.388870] SMP: stopping secondary CPUs

Regards
Yangsusheng

Reply via email to