On 31.08.2021 14:28, Nikita Yushchenko wrote:
> High order allocation detector monitors allocations of order greater
> than zero, and generates an uevent if a configured number of allocations
> happen within configured time.
> 
> In return to this uevent, userspace can enable event tracing. If a
> stream of high-order allocations continues, the trace could help to
> detect the code path causing them.
> 
> HOAD has a sysfs control interface, at /sys/kernel/mm/hoad/control:
> - "enable ORDER COUNT MSECS"
>   Sets up monitoring allocations of order ORDER: if COUNT such
>   allocations are detected within MSECS, uevent is sent. Then further
>   uevents is suspended, to avoid userspace races.
> - "disable ORDER"
>   Stops monitoring allocations of order ORDER.
> - "resume [delay-msecs]"
>   Allow sending a new uevent, either immediately or after the given
>   delay.
> 
> The uevent is generated with ACTION="change", SUBSYSTEM="hoad", ORDER
> set to thr order of the allocation that has caused the uevent.
> 
> Also HOAD provides a tracepoint named "hoad", under kmem/ group, that
> could be used for tracing. This tracepoint hits on every allocation of
> order greater or equal to minimal order for which monitoring is enabled.
> 
> https://jira.sw.ru/browse/PSBM-92088
> Signed-off-by: Nikita Yushchenko <[email protected]>
> ---
>  include/trace/events/kmem.h |  12 ++
>  mm/page_alloc.c             | 257 ++++++++++++++++++++++++++++++++++++
>  2 files changed, 269 insertions(+)
> 
> diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
> index 9cb647609df3..b425c6856bfd 100644
> --- a/include/trace/events/kmem.h
> +++ b/include/trace/events/kmem.h
> @@ -305,6 +305,18 @@ TRACE_EVENT(mm_page_alloc_extfrag,
>               __entry->alloc_migratetype == __entry->fallback_migratetype)
>  );
>  
> +TRACE_EVENT(hoad,
> +     TP_PROTO(int order),
> +     TP_ARGS(order),
> +     TP_STRUCT__entry(
> +             __field(int, order)
> +     ),
> +     TP_fast_assign(
> +             __entry->order = order;
> +     ),
> +     TP_printk("order=%d", __entry->order)
> +);
> +
>  #endif /* _TRACE_KMEM_H */
>  
>  /* This part must be outside protection */
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 1ae193b26a1d..2a0d459bbaa3 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3533,6 +3533,261 @@ static __always_inline void warn_high_order(int 
> order, gfp_t gfp_mask)
>       }
>  }
>  
> +struct hoad_order_info {
> +     unsigned long interval;
> +     int max_allocs;
> +     atomic_t counter;
> +     unsigned long since_jiffies;
> +     struct delayed_work reset_counter_work;
> +};
> +
> +static struct hoad_order_info *hoad_table[MAX_ORDER];
> +static DEFINE_MUTEX(hoad_mutex);
> +static struct kobject *hoad_kobj;
> +static int hoad_uevent_order;
> +static unsigned long hoad_resume_jiffies;
> +static int hoad_trace_min_order;
> +
> +static void hoad_reset_counter(struct work_struct *work)
> +{
> +     struct hoad_order_info *hoi = container_of(work,
> +                     struct hoad_order_info, reset_counter_work.work);
> +
> +     atomic_set(&hoi->counter, 0);

Do we really need work to do such simple action? Why plain timer is not enough?

> +}
> +
> +static void hoad_send_uevent(struct work_struct *work)
> +{
> +     char order_string[16];
> +     char *envp[] = { order_string, NULL };

Please, use new lines after declarations.

> +     sprintf(order_string, "ORDER=%d", hoad_uevent_order);
> +     kobject_uevent_env(hoad_kobj, KOBJ_CHANGE, envp);
> +}
> +static DECLARE_WORK(hoad_send_uevent_work, hoad_send_uevent);
> +
> +static void hoad_resume(struct work_struct *work)
> +{
> +     hoad_uevent_order = 0;

The same is here: do we really need work here?

> +}
> +static DECLARE_DELAYED_WORK(hoad_resume_work, hoad_resume);
> +
> +static void hoad_notice_alloc(int order, gfp_t gfp)
> +{
> +     struct hoad_order_info *hoi;
> +     int count;
> +     bool hit = false;
> +
> +     if (gfp & (__GFP_NORETRY | __GFP_ORDER_NOWARN))
> +             return;
> +
> +     if (order >= hoad_trace_min_order)
> +             trace_hoad(order);
> +
> +     rcu_read_lock();
> +     hoi = rcu_dereference(hoad_table[order]);
> +     if (hoi) {
> +             count = atomic_inc_return(&hoi->counter);
> +             if (count == 1) {
> +                     hoi->since_jiffies = jiffies;
> +                     schedule_delayed_work(&hoi->reset_counter_work,
> +                                     hoi->interval);
> +             }
> +             hit = (count == hoi->max_allocs);
> +     }
> +     rcu_read_unlock();
> +
> +     if (hit) {
> +             if (cmpxchg(&hoad_uevent_order, 0, order) == 0)
> +                     schedule_work(&hoad_send_uevent_work);
> +     }
> +}
> +
> +static void hoad_install_order_info(int order, struct hoad_order_info *hoi)
> +{
> +     struct hoad_order_info *oldhoi;
> +     int i;
> +
> +     mutex_lock(&hoad_mutex);
> +     oldhoi = hoad_table[order];
> +     rcu_assign_pointer(hoad_table[order], hoi);
> +     for (i = 1; i < MAX_ORDER; i++) {
> +             if (hoad_table[i])
> +                     break;
> +     }
> +     hoad_trace_min_order = i;
> +     mutex_unlock(&hoad_mutex);
> +
> +     if (oldhoi) {
> +             synchronize_rcu();
> +             cancel_delayed_work_sync(&oldhoi->reset_counter_work);

Why do we completely cancel this? We possible updated single order entity,
don't other orders still need to be reset?

> +             kfree(oldhoi);
> +     }
> +}
> +
> +static int hoad_enable_for_order(int order, int max_allocs,
> +             unsigned int interval_msecs)
> +{
> +     struct hoad_order_info *hoi;
> +     unsigned long interval;
> +
> +     if (order < 1 || order >= MAX_ORDER)
> +             return -EINVAL;
> +     if (max_allocs < 1)
> +             return -EINVAL;
> +     interval = msecs_to_jiffies(interval_msecs);
> +     if (interval < 1)
> +             return -EINVAL;
> +
> +     hoi = kzalloc(sizeof(*hoi), GFP_KERNEL);
> +     if (!hoi)
> +             return -ENOMEM;
> +     hoi->interval = interval;
> +     hoi->max_allocs = max_allocs;
> +     INIT_DELAYED_WORK(&hoi->reset_counter_work, hoad_reset_counter);
> +
> +     hoad_install_order_info(order, hoi);
> +     return 0;
> +}
> +
> +static int hoad_disable_for_order(int order)
> +{
> +     if (order < 1 || order >= MAX_ORDER)
> +             return -EINVAL;
> +
> +     hoad_install_order_info(order, NULL);
> +     return 0;
> +}
> +
> +static ssize_t hoad_control_show(struct kobject *kobj,
> +             struct kobj_attribute *attr, char *buf)
> +{
> +     char *p = buf;
> +     int order;
> +     struct hoad_order_info *hoi;
> +     int counter;
> +     long d;
> +     unsigned int msecs;
> +
> +     rcu_read_lock();
> +     for (order = 1; order < MAX_ORDER; order++) {
> +             hoi = rcu_dereference(hoad_table[order]);
> +             if (hoi) {
> +                     counter = atomic_read(&hoi->counter);
> +                     msecs = counter ?
> +                             jiffies_to_msecs(jiffies - hoi->since_jiffies) :
> +                             0;
> +                     p += sprintf(p, "order %u: %u/%u in %u/%u msecs\n",
> +                                     order, counter, hoi->max_allocs,
> +                                     msecs, jiffies_to_msecs(hoi->interval));

Any guarantees all printed in this function data fits provided @buf?

> +             }
> +     }
> +     rcu_read_unlock();
> +     if (hoad_uevent_order) {
> +             p += sprintf(p, "event generation suspended");
> +             d = (long)(hoad_resume_jiffies - jiffies);
> +             if (d > 0) {
> +                     p += sprintf(p, ", resume in ");
> +                     msecs = jiffies_to_msecs(d);
> +                     if (msecs >= 1000 * 60 * 60 * 2)
> +                             p += sprintf(p, "%u hours",
> +                                             msecs / (1000 * 60 * 60));
> +                     else if (msecs > 1000 * 60 * 2)
> +                             p += sprintf(p, "%u minutes",
> +                                             msecs / (1000 * 60));
> +                     else
> +                             p += sprintf(p, "%u seconds",
> +                                             (msecs + 999) / 1000);
> +             }
> +             *p++ = '\n';
> +     }
> +
> +     return p - buf;
> +}
> +
> +static ssize_t hoad_control_store(struct kobject *kobj,
> +             struct kobj_attribute *attr, const char *buf, size_t len)
> +{
> +     char *p, *q;
> +     int order, max_allocs, ret;
> +     unsigned int msecs;
> +     unsigned long d;
> +     char c;
> +
> +     if (len == 0)
> +             return 0;
> +     p = kstrdup(buf, GFP_KERNEL);
> +     if (!p)
> +             return -ENOMEM;
> +     q = strim(p);
> +     if (*q == '\0') {
> +             ret = 0;
> +             goto out;
> +     }
> +
> +     if (sscanf(q, "enable %u %u %u%c",
> +                             &order, &max_allocs, &msecs, &c) == 3)
> +             ret = hoad_enable_for_order(order, max_allocs, msecs);
> +     else if (sscanf(q, "disable %u%c", &order, &c) == 1)
> +             ret = hoad_disable_for_order(order);
> +     else if (sscanf(q, "resume %u%c", &msecs, &c) == 1) {
> +             if (msecs > 1000 * 60 * 60 * 24 * 5)

Every magic number should be described with #define.

> +                     ret = -EINVAL;
> +             else {
> +do_resume:
> +                     d = msecs_to_jiffies(msecs);
> +                     hoad_resume_jiffies = jiffies + d;
> +                     cancel_delayed_work_sync(&hoad_resume_work);
> +                     schedule_delayed_work(&hoad_resume_work, d);

It looks like we may use mod_delayed_work(system_wq, ...) here instead of two 
calls above.

> +                     ret = 0;
> +             }
> +     }
> +     else if (!strcmp(q, "resume")) {

Please, move this "else if" up to the same line with "}" bracket

> +             msecs = 0;
> +             goto do_resume;
> +     } else
> +             ret = -EINVAL;

In case of you have "{}" brackets in a single branch of if-else, then every if, 
else if, else
branch has to have "{}" brackets even if it's one liner.

> +
> +out:
> +     kfree(p);
> +     return ret ? ret : len;
> +}
> +
> +static struct kobj_attribute hoad_control_attr = {
> +     .attr.name = "control",
> +     .attr.mode = S_IRUSR | S_IWUSR,
> +     .show = hoad_control_show,
> +     .store = hoad_control_store,
> +};
> +
> +static int hoad_init(void)
> +{
> +     int ret;
> +
> +     /* To be able to generate uevents, need a kobject with kset defined.
> +      *
> +      * To avoid extra depth inside sysfs, create a kset and use it's
> +      * internal kobject, by setting it's 'kset' field to itself.
> +      */
> +     struct kset *kset = kset = kset_create_and_add("hoad", NULL, mm_kobj);

1)kset = kset is here.
2)New line is needed after declaration.

> +     if (!kset)
> +             return -ENOMEM;
> +     hoad_kobj = &kset->kobj;
> +     hoad_kobj->kset = kset;
> +
> +     ret = sysfs_create_file(hoad_kobj, &hoad_control_attr.attr);
> +     if (ret) {
> +             hoad_kobj->kset = NULL;
> +             hoad_kobj = NULL;
> +             kset_put(kset);
> +             return ret;
> +     }

Why not:

        ret = sysfs_create_file(hoad_kobj, &hoad_control_attr.attr);
        if (ret) {
                kset_put(kset);
                return ret;
        }

        hoad_kobj = &kset->kobj;
        hoad_kobj->kset = kset;

?

Also, as I see in kernel code, the pair bracket for kset_create_and_add() is 
kset_unregister().
Why we use kset_put() instead?

> +
> +     hoad_trace_min_order = MAX_ORDER;
> +     hoad_resume_jiffies = jiffies;
> +     return 0;
> +}
> +late_initcall(hoad_init);
> +
>  /*
>   * This is the 'heart' of the zoned buddy allocator.
>   */
> @@ -3557,6 +3812,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int 
> order,
>               !(current->flags & PF_MEMALLOC));
>  
>       warn_high_order(order, gfp_mask);
> +     if (order > 0)
> +             hoad_notice_alloc(order, gfp_mask);
>  
>       if (should_fail_alloc_page(gfp_mask, order))
>               return NULL;
> 

_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to