Hi Andrii and Oleg.

This patch sent by me two weeks ago also aim to optimize the performance of 
uprobe
on arm64. I notice recent discussions on the performance and scalability of 
uprobes
within the mailing list. Considering this interest, I've added you and other 
relevant
maintainers to the CC list for broader visibility and potential collaboration.

Thanks.

在 2024/7/27 17:44, Liao Chang 写道:
> The profiling result of single-thread model of selftests bench reveals
> performance bottlenecks in find_uprobe() and caches_clean_inval_pou() on
> ARM64. On my local testing machine, 5% of CPU time is consumed by
> find_uprobe() for trig-uprobe-ret, while caches_clean_inval_pou() take
> about 34% of CPU time for trig-uprobe-nop and trig-uprobe-push.
> 
> This patch introduce struct uprobe_breakpoint to track previously
> allocated insn_slot for frequently hit uprobe. it effectively reduce the
> need for redundant insn_slot writes and subsequent expensive cache
> flush, especially on architecture like ARM64. This patch has been tested
> on Kunpeng916 (Hi1616), 4 NUMA nodes, 64 cores@ 2.4GHz. The selftest
> bench and Redis GET/SET benchmark result below reveal obivious
> performance gain.
> 
> before-opt
> ----------
> trig-uprobe-nop:  0.371 ± 0.001M/s (0.371M/prod)
> trig-uprobe-push: 0.370 ± 0.001M/s (0.370M/prod)
> trig-uprobe-ret:  1.637 ± 0.001M/s (1.647M/prod)
> trig-uretprobe-nop:  0.331 ± 0.004M/s (0.331M/prod)
> trig-uretprobe-push: 0.333 ± 0.000M/s (0.333M/prod)
> trig-uretprobe-ret:  0.854 ± 0.002M/s (0.854M/prod)
> Redis SET (RPS) uprobe: 42728.52
> Redis GET (RPS) uprobe: 43640.18
> Redis SET (RPS) uretprobe: 40624.54
> Redis GET (RPS) uretprobe: 41180.56
> 
> after-opt
> ---------
> trig-uprobe-nop:  0.916 ± 0.001M/s (0.916M/prod)
> trig-uprobe-push: 0.908 ± 0.001M/s (0.908M/prod)
> trig-uprobe-ret:  1.855 ± 0.000M/s (1.855M/prod)
> trig-uretprobe-nop:  0.640 ± 0.000M/s (0.640M/prod)
> trig-uretprobe-push: 0.633 ± 0.001M/s (0.633M/prod)
> trig-uretprobe-ret:  0.978 ± 0.003M/s (0.978M/prod)
> Redis SET (RPS) uprobe: 43939.69
> Redis GET (RPS) uprobe: 45200.80
> Redis SET (RPS) uretprobe: 41658.58
> Redis GET (RPS) uretprobe: 42805.80
> 
> While some uprobes might still need to share the same insn_slot, this
> patch compare the instructions in the resued insn_slot with the
> instructions execute out-of-line firstly to decides allocate a new one
> or not.
> 
> Additionally, this patch use a rbtree associated with each thread that
> hit uprobes to manage these allocated uprobe_breakpoint data. Due to the
> rbtree of uprobe_breakpoints has smaller node, better locality and less
> contention, it result in faster lookup times compared to find_uprobe().
> 
> The other part of this patch are some necessary memory management for
> uprobe_breakpoint data. A uprobe_breakpoint is allocated for each newly
> hit uprobe that doesn't already have a corresponding node in rbtree. All
> uprobe_breakpoints will be freed when thread exit.
> 
> Signed-off-by: Liao Chang <liaocha...@huawei.com>
> ---
>  include/linux/uprobes.h |   3 +
>  kernel/events/uprobes.c | 246 +++++++++++++++++++++++++++++++++-------
>  2 files changed, 211 insertions(+), 38 deletions(-)
> 
> diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> index f46e0ca0169c..04ee465980af 100644
> --- a/include/linux/uprobes.h
> +++ b/include/linux/uprobes.h
> @@ -78,6 +78,9 @@ struct uprobe_task {
>  
>       struct return_instance          *return_instances;
>       unsigned int                    depth;
> +
> +     struct rb_root                  breakpoints_tree;
> +     rwlock_t                        breakpoints_treelock;
>  };
>  
>  struct return_instance {
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 2c83ba776fc7..3f1a6dc2a327 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -33,6 +33,7 @@
>  #define MAX_UPROBE_XOL_SLOTS         UINSNS_PER_PAGE
>  
>  static struct rb_root uprobes_tree = RB_ROOT;
> +
>  /*
>   * allows us to skip the uprobe_mmap if there are no uprobe events active
>   * at this time.  Probably a fine grained per inode count is better?
> @@ -886,6 +887,174 @@ static bool filter_chain(struct uprobe *uprobe,
>       return ret;
>  }
>  
> +static struct uprobe_task *get_utask(void);
> +static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr);
> +static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int 
> *is_swbp);
> +
> +struct uprobe_breakpoint {
> +     struct rb_node          rb_node;
> +     refcount_t              ref;
> +     unsigned long           slot;
> +     unsigned long           vaddr;
> +     struct uprobe           *uprobe;
> +};
> +
> +static void put_ubp(struct uprobe_breakpoint *ubp)
> +{
> +     if (refcount_dec_and_test(&ubp->ref)) {
> +             put_uprobe(ubp->uprobe);
> +             kfree(ubp);
> +     }
> +}
> +
> +static struct uprobe_breakpoint *get_ubp(struct uprobe_breakpoint *ubp)
> +{
> +     refcount_inc(&ubp->ref);
> +     return ubp;
> +}
> +
> +#define __node_2_ubp(node) rb_entry((node), struct uprobe_breakpoint, 
> rb_node)
> +
> +struct __ubp_key {
> +     unsigned long bp_vaddr;
> +};
> +
> +static int ubp_cmp(const unsigned long bp_vaddr,
> +                const struct uprobe_breakpoint *ubp)
> +{
> +     if (bp_vaddr < ubp->vaddr)
> +             return -1;
> +
> +     if (bp_vaddr > ubp->vaddr)
> +             return 1;
> +
> +     return 0;
> +}
> +
> +static int __ubp_cmp_key(const void *k, const struct rb_node *b)
> +{
> +     const struct __ubp_key *key = k;
> +
> +     return ubp_cmp(key->bp_vaddr, __node_2_ubp(b));
> +}
> +
> +static int __ubp_cmp(struct rb_node *a, const struct rb_node *b)
> +{
> +     const struct uprobe_breakpoint *ubp = __node_2_ubp(a);
> +
> +     return ubp_cmp(ubp->vaddr, __node_2_ubp(b));
> +}
> +
> +static void delete_breakpoint(struct uprobe_task *utask)
> +{
> +     struct rb_node *node;
> +
> +     write_lock(&utask->breakpoints_treelock);
> +     node = rb_first(&utask->breakpoints_tree);
> +     while (node) {
> +             rb_erase(node, &utask->breakpoints_tree);
> +             write_unlock(&utask->breakpoints_treelock);
> +
> +             put_ubp(__node_2_ubp(node));
> +
> +             write_lock(&utask->breakpoints_treelock);
> +             node = rb_next(node);
> +     }
> +     write_unlock(&utask->breakpoints_treelock);
> +}
> +
> +static struct uprobe_breakpoint *alloc_breakpoint(struct uprobe_task *utask,
> +                                               struct uprobe *uprobe,
> +                                               unsigned long bp_vaddr)
> +{
> +     struct uprobe_breakpoint *ubp;
> +     struct rb_node *node;
> +
> +     ubp = kzalloc(sizeof(struct uprobe_breakpoint), GFP_KERNEL);
> +     if (!ubp)
> +             return NULL;
> +
> +     ubp->vaddr = bp_vaddr;
> +     ubp->uprobe = uprobe;
> +     /* get access + creation ref */
> +     refcount_set(&ubp->ref, 2);
> +     ubp->slot = UINSNS_PER_PAGE;
> +
> +     write_lock(&utask->breakpoints_treelock);
> +     node = rb_find_add(&ubp->rb_node, &utask->breakpoints_tree, __ubp_cmp);
> +     write_unlock(&utask->breakpoints_treelock);
> +
> +     /* Two or more threads hit the same breakpoint */
> +     if (node) {
> +             WARN_ON(uprobe != __node_2_ubp(node)->upobre);

A stupid typo.

s/upobre/uprobe/g

> +             kfree(ubp);
> +             return get_ubp(__node_2_ubp(node));
> +     }
> +
> +     return ubp;
> +}
> +
> +static struct uprobe_breakpoint *find_breakpoint(struct uprobe_task *utask,
> +                                              unsigned long bp_vaddr)
> +{
> +     struct rb_node *node;
> +     struct __ubp_key key = {
> +             .bp_vaddr = bp_vaddr,
> +     };
> +
> +     read_lock(&utask->breakpoints_treelock);
> +     node = rb_find(&key, &utask->breakpoints_tree, __ubp_cmp_key);
> +     read_unlock(&utask->breakpoints_treelock);
> +
> +     if (node)
> +             return get_ubp(__node_2_ubp(node));
> +
> +     return NULL;
> +}
> +
> +static struct uprobe_breakpoint *find_active_breakpoint(struct pt_regs *regs,
> +                                                     unsigned long bp_vaddr)
> +{
> +     struct uprobe_task *utask = get_utask();
> +     struct uprobe_breakpoint *ubp;
> +     struct uprobe *uprobe;
> +     int is_swbp;
> +
> +     if (unlikely(!utask))
> +             return NULL;
> +
> +     ubp = find_breakpoint(utask, bp_vaddr);
> +     if (ubp)
> +             return ubp;
> +
> +     uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
> +     if (!uprobe) {
> +             if (is_swbp > 0) {
> +                     /* No matching uprobe; signal SIGTRAP. */
> +                     force_sig(SIGTRAP);
> +             } else {
> +                     /*
> +                      * Either we raced with uprobe_unregister() or we can't
> +                      * access this memory. The latter is only possible if
> +                      * another thread plays with our ->mm. In both cases
> +                      * we can simply restart. If this vma was unmapped we
> +                      * can pretend this insn was not executed yet and get
> +                      * the (correct) SIGSEGV after restart.
> +                      */
> +                     instruction_pointer_set(regs, bp_vaddr);
> +             }
> +             return NULL;
> +     }
> +
> +     ubp = alloc_breakpoint(utask, uprobe, bp_vaddr);
> +     if (!ubp) {
> +             put_uprobe(uprobe);
> +             return NULL;
> +     }
> +
> +     return ubp;
> +}
> +
>  static int
>  install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
>                       struct vm_area_struct *vma, unsigned long vaddr)
> @@ -1576,9 +1745,8 @@ void uprobe_dup_mmap(struct mm_struct *oldmm, struct 
> mm_struct *newmm)
>  /*
>   *  - search for a free slot.
>   */
> -static unsigned long xol_take_insn_slot(struct xol_area *area)
> +static __always_inline int xol_take_insn_slot(struct xol_area *area)
>  {
> -     unsigned long slot_addr;
>       int slot_nr;
>  
>       do {
> @@ -1590,34 +1758,48 @@ static unsigned long xol_take_insn_slot(struct 
> xol_area *area)
>                       slot_nr = UINSNS_PER_PAGE;
>                       continue;
>               }
> -             wait_event(area->wq, (atomic_read(&area->slot_count) < 
> UINSNS_PER_PAGE));
> +             wait_event(area->wq,
> +                        (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
>       } while (slot_nr >= UINSNS_PER_PAGE);
>  
> -     slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
> -     atomic_inc(&area->slot_count);
> +     return slot_nr;
> +}
> +
> +static __always_inline unsigned long
> +choose_insn_slot(struct xol_area *area, struct uprobe_breakpoint *ubp)
> +{
> +     if ((ubp->slot == UINSNS_PER_PAGE) ||
> +         test_and_set_bit(ubp->slot, area->bitmap)) {
> +             ubp->slot = xol_take_insn_slot(area);
> +     }
>  
> -     return slot_addr;
> +     atomic_inc(&area->slot_count);
> +     return area->vaddr + ubp->slot * UPROBE_XOL_SLOT_BYTES;
>  }
>  
>  /*
>   * xol_get_insn_slot - allocate a slot for xol.
>   * Returns the allocated slot address or 0.
>   */
> -static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
> +static unsigned long xol_get_insn_slot(struct uprobe_breakpoint *ubp)
>  {
>       struct xol_area *area;
>       unsigned long xol_vaddr;
> +     struct uprobe *uprobe = ubp->uprobe;
>  
>       area = get_xol_area();
>       if (!area)
>               return 0;
>  
> -     xol_vaddr = xol_take_insn_slot(area);
> +     xol_vaddr = choose_insn_slot(area, ubp);
>       if (unlikely(!xol_vaddr))
>               return 0;
>  
> -     arch_uprobe_copy_ixol(area->pages[0], xol_vaddr,
> -                           &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
> +     if (memcmp((void *)xol_vaddr, &uprobe->arch.ixol,
> +                sizeof(uprobe->arch.ixol)))
> +             arch_uprobe_copy_ixol(area->pages[0], xol_vaddr,
> +                                   &uprobe->arch.ixol,
> +                                   sizeof(uprobe->arch.ixol));

Perhaps, should i move memcmp() to the arch_uprobe_copy_ixol() provided by the 
architecture
code?

>  
>       return xol_vaddr;
>  }
> @@ -1717,8 +1899,7 @@ void uprobe_free_utask(struct task_struct *t)
>       if (!utask)
>               return;
>  
> -     if (utask->active_uprobe)
> -             put_uprobe(utask->active_uprobe);
> +     delete_breakpoint(utask);
>  
>       ri = utask->return_instances;
>       while (ri)
> @@ -1739,8 +1920,11 @@ void uprobe_free_utask(struct task_struct *t)
>   */
>  static struct uprobe_task *get_utask(void)
>  {
> -     if (!current->utask)
> +     if (!current->utask) {
>               current->utask = kzalloc(sizeof(struct uprobe_task), 
> GFP_KERNEL);
> +             current->utask->breakpoints_tree = RB_ROOT;
> +             rwlock_init(&current->utask->breakpoints_treelock);
> +     }
>       return current->utask;
>  }
>  
> @@ -1921,7 +2105,8 @@ static void prepare_uretprobe(struct uprobe *uprobe, 
> struct pt_regs *regs)
>  
>  /* Prepare to single-step probed instruction out of line. */
>  static int
> -pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long 
> bp_vaddr)
> +pre_ssout(struct uprobe *uprobe, struct pt_regs *regs,
> +       struct uprobe_breakpoint *ubp)
>  {
>       struct uprobe_task *utask;
>       unsigned long xol_vaddr;
> @@ -1931,12 +2116,12 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs 
> *regs, unsigned long bp_vaddr)
>       if (!utask)
>               return -ENOMEM;
>  
> -     xol_vaddr = xol_get_insn_slot(uprobe);
> +     xol_vaddr = xol_get_insn_slot(ubp);
>       if (!xol_vaddr)
>               return -ENOMEM;
>  
>       utask->xol_vaddr = xol_vaddr;
> -     utask->vaddr = bp_vaddr;
> +     utask->vaddr = ubp->vaddr;
>  
>       err = arch_uprobe_pre_xol(&uprobe->arch, regs);
>       if (unlikely(err)) {
> @@ -2182,32 +2367,19 @@ bool __weak arch_uretprobe_is_alive(struct 
> return_instance *ret, enum rp_check c
>   */
>  static void handle_swbp(struct pt_regs *regs)
>  {
> +     struct uprobe_breakpoint *ubp;
>       struct uprobe *uprobe;
>       unsigned long bp_vaddr;
> -     int is_swbp;
>  
>       bp_vaddr = uprobe_get_swbp_addr(regs);
>       if (bp_vaddr == get_trampoline_vaddr())
>               return handle_trampoline(regs);
>  
> -     uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
> -     if (!uprobe) {
> -             if (is_swbp > 0) {
> -                     /* No matching uprobe; signal SIGTRAP. */
> -                     force_sig(SIGTRAP);
> -             } else {
> -                     /*
> -                      * Either we raced with uprobe_unregister() or we can't
> -                      * access this memory. The latter is only possible if
> -                      * another thread plays with our ->mm. In both cases
> -                      * we can simply restart. If this vma was unmapped we
> -                      * can pretend this insn was not executed yet and get
> -                      * the (correct) SIGSEGV after restart.
> -                      */
> -                     instruction_pointer_set(regs, bp_vaddr);
> -             }
> +     ubp = find_active_breakpoint(regs, bp_vaddr);
> +     if (!ubp)
>               return;
> -     }
> +
> +     uprobe = ubp->uprobe;
>  
>       /* change it in advance for ->handler() and restart */
>       instruction_pointer_set(regs, bp_vaddr);
> @@ -2241,12 +2413,11 @@ static void handle_swbp(struct pt_regs *regs)
>       if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
>               goto out;
>  
> -     if (!pre_ssout(uprobe, regs, bp_vaddr))
> -             return;
> +     pre_ssout(uprobe, regs, ubp);
>  
>       /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
>  out:
> -     put_uprobe(uprobe);
> +     put_ubp(ubp);
>  }
>  
>  /*
> @@ -2266,7 +2437,6 @@ static void handle_singlestep(struct uprobe_task 
> *utask, struct pt_regs *regs)
>       else
>               WARN_ON_ONCE(1);
>  
> -     put_uprobe(uprobe);
>       utask->active_uprobe = NULL;
>       utask->state = UTASK_RUNNING;
>       xol_free_insn_slot(current);

-- 
BR
Liao, Chang

Reply via email to