Re: [PATCH net-next 2/6] page_frag: unify gfp bits for order 3 page allocation

2024-01-08 Thread Alexander Duyck
On Mon, Jan 8, 2024 at 12:25 AM Yunsheng Lin  wrote:
>
> On 2024/1/5 23:35, Alexander H Duyck wrote:
> > On Wed, 2024-01-03 at 17:56 +0800, Yunsheng Lin wrote:
> >> Currently there are three page frag implementations
> >> which all try to allocate an order 3 page; if that
> >> fails, they fall back to allocating an order 0 page,
> >> and each of them allows the order 3 page allocation to
> >> fail under certain conditions by using specific gfp bits.
> >>
> >> The gfp bits for order 3 page allocation differ between
> >> the implementations: __GFP_NOMEMALLOC is or'd in to forbid
> >> access to the emergency reserves in __page_frag_cache_refill(),
> >> but not in the other implementations, while __GFP_DIRECT_RECLAIM
> >> is masked off to avoid direct reclaim in skb_page_frag_refill(),
> >> but is not masked off in __page_frag_cache_refill().
> >>
> >> This patch unifies the gfp bits used by the different
> >> implementations by or'ing in __GFP_NOMEMALLOC and masking off
> >> __GFP_DIRECT_RECLAIM for the order 3 page allocation to avoid
> >> putting unnecessary pressure on mm.
> >>
> >> Signed-off-by: Yunsheng Lin 
> >> CC: Alexander Duyck 
> >> ---
> >>  drivers/vhost/net.c | 2 +-
> >>  mm/page_alloc.c | 4 ++--
> >>  net/core/sock.c | 2 +-
> >>  3 files changed, 4 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> >> index f2ed7167c848..e574e21cc0ca 100644
> >> --- a/drivers/vhost/net.c
> >> +++ b/drivers/vhost/net.c
> >> @@ -670,7 +670,7 @@ static bool vhost_net_page_frag_refill(struct 
> >> vhost_net *net, unsigned int sz,
> >>  /* Avoid direct reclaim but allow kswapd to wake */
> >>  pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
> >>__GFP_COMP | __GFP_NOWARN |
> >> -  __GFP_NORETRY,
> >> +  __GFP_NORETRY | __GFP_NOMEMALLOC,
> >>SKB_FRAG_PAGE_ORDER);
> >>  if (likely(pfrag->page)) {
> >>  pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
> >> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> >> index 9a16305cf985..1f0b36dd81b5 100644
> >> --- a/mm/page_alloc.c
> >> +++ b/mm/page_alloc.c
> >> @@ -4693,8 +4693,8 @@ static struct page *__page_frag_cache_refill(struct 
> >> page_frag_cache *nc,
> >>  gfp_t gfp = gfp_mask;
> >>
> >>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> >> -gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
> >> -__GFP_NOMEMALLOC;
> >> +gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
> >> +   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
> >>  page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
> >>  PAGE_FRAG_CACHE_MAX_ORDER);
> >>  nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
> >> diff --git a/net/core/sock.c b/net/core/sock.c
> >> index 446e945f736b..d643332c3ee5 100644
> >> --- a/net/core/sock.c
> >> +++ b/net/core/sock.c
> >> @@ -2900,7 +2900,7 @@ bool skb_page_frag_refill(unsigned int sz, struct 
> >> page_frag *pfrag, gfp_t gfp)
> >>  /* Avoid direct reclaim but allow kswapd to wake */
> >>  pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
> >>__GFP_COMP | __GFP_NOWARN |
> >> -  __GFP_NORETRY,
> >> +  __GFP_NORETRY | __GFP_NOMEMALLOC,
> >>SKB_FRAG_PAGE_ORDER);
> >>  if (likely(pfrag->page)) {
> >>  pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
> >
> > Looks fine to me.
> >
> > One thing you may want to consider would be to place this all in an
> > inline function that could just consolidate all the code.
>
> Do you think it is possible to further unify the implementations of the
> 'struct page_frag_cache' and 'struct page_frag', so that adding an
> inline function for the above is unnecessary?

Actually skb_page_frag_refill seems to function more like what the
Intel drivers do in terms of handling fragments. It basically slices
off pieces until it either runs out of them and allocates a new page,
or reuses the page when its reference count drops to one, without
pre-allocating the references.

That said, many of the core bits are the same, so it might be possible
to look at unifying at least pieces of this. For example, page_frag has
the same first 3 members as page_frag_cache, so it might be possible to
refactor things further to unify more of the frag_refill logic.
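
If we do end up wanting the inline helper for the gfp unification
piece, a minimal sketch could look like the following (the name and
where it would live are hypothetical):

  /* hypothetical shared helper for the order-3 page frag allocation bits */
  static inline gfp_t page_frag_gfp_order3(gfp_t gfp)
  {
          /* avoid direct reclaim and the emergency reserves, but still
           * allow kswapd to be woken; don't retry or warn on failure
           */
          return (gfp & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN |
                 __GFP_NORETRY | __GFP_NOMEMALLOC;
  }

The three call sites would then just pass the result to alloc_pages()
or alloc_pages_node() with the same order as today.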



Re: [PATCH net-next 4/6] vhost/net: remove vhost_net_page_frag_refill()

2024-01-08 Thread Alexander Duyck
On Mon, Jan 8, 2024 at 1:06 AM Yunsheng Lin  wrote:
>
> On 2024/1/6 0:06, Alexander H Duyck wrote:
> >>
> >>  static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
> >> @@ -1353,8 +1318,7 @@ static int vhost_net_open(struct inode *inode, 
> >> struct file *f)
> >>  vqs[VHOST_NET_VQ_RX]);
> >>
> >>  f->private_data = n;
> >> -n->page_frag.page = NULL;
> >> -n->refcnt_bias = 0;
> >> +n->pf_cache.va = NULL;
> >>
> >>  return 0;
> >>  }
> >> @@ -1422,8 +1386,9 @@ static int vhost_net_release(struct inode *inode, 
> >> struct file *f)
> >>  kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
> >>  kfree(n->vqs[VHOST_NET_VQ_TX].xdp);
> >>  kfree(n->dev.vqs);
> >> -if (n->page_frag.page)
> >> -__page_frag_cache_drain(n->page_frag.page, n->refcnt_bias);
> >> +if (n->pf_cache.va)
> >> +__page_frag_cache_drain(virt_to_head_page(n->pf_cache.va),
> >> +n->pf_cache.pagecnt_bias);
> >>  kvfree(n);
> >>  return 0;
> >>  }
> >
> > I would recommend reordering this patch with patch 5. Then you could
> > remove the block that is setting "n->pf_cache.va = NULL" above and just
> > make use of page_frag_cache_drain in the lower block which would also
> > return the va to NULL.
>
> I am not sure if we can as there is no zeroing for 'struct vhost_net' in
> vhost_net_open().
>
> If we don't have "n->pf_cache.va = NULL", don't we use the uninitialized data
> when calling page_frag_alloc_align() for the first time?

I see. So kvmalloc is used instead of kvzalloc when allocating the
structure. That might be an opportunity to clean things up a bit by
making that change, to reduce the risk of some piece of memory
initialization being missed.

That said, I still think reordering the two patches might be useful,
as it would keep the change you make to vhost_net encapsulated in one
patch that fully enables the use of the new page pool API.
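
To make the reordering concrete, a rough sketch of what the end result
could look like, assuming the page_frag_cache_drain() helper from patch
5 takes the cache pointer and keeping the existing allocation flags:

  /* vhost_net_open(): kvzalloc() zeroes the whole structure, so the
   * explicit "n->pf_cache.va = NULL" becomes unnecessary
   */
  n = kvzalloc(sizeof(*n), GFP_KERNEL | __GFP_RETRY_MAYFAIL);

  /* vhost_net_release(): drain through the new helper instead of open
   * coding virt_to_head_page()/__page_frag_cache_drain()
   */
  page_frag_cache_drain(&n->pf_cache);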



Re: [igb] netconsole triggers warning in netpoll_poll_dev

2021-04-07 Thread Alexander Duyck
On Wed, Apr 7, 2021 at 11:07 AM Jakub Kicinski  wrote:
>
> On Wed, 7 Apr 2021 09:25:28 -0700 Alexander Duyck wrote:
> > On Wed, Apr 7, 2021 at 8:37 AM Jakub Kicinski  wrote:
> > >
> > > On Wed, 7 Apr 2021 08:00:53 +0200 Oleksandr Natalenko wrote:
> > > > Thanks for the effort, but reportedly [1] it made no difference,
> > > > unfortunately.
> > > >
> > > > [1] https://bugzilla.kernel.org/show_bug.cgi?id=212573#c8
> > >
> > > The only other option I see is that somehow the NAPI has no rings.
> > >
> > > diff --git a/drivers/net/ethernet/intel/igb/igb_main.c 
> > > b/drivers/net/ethernet/intel/igb/igb_main.c
> > > index a45cd2b416c8..24568adc2fb1 100644
> > > --- a/drivers/net/ethernet/intel/igb/igb_main.c
> > > +++ b/drivers/net/ethernet/intel/igb/igb_main.c
> > > @@ -7980,7 +7980,7 @@ static int igb_poll(struct napi_struct *napi, int 
> > > budget)
> > > struct igb_q_vector *q_vector = container_of(napi,
> > >  struct igb_q_vector,
> > >  napi);
> > > -   bool clean_complete = true;
> > > +   bool clean_complete = q_vector->tx.ring || q_vector->rx.ring;
> > > int work_done = 0;
> > >
> > >  #ifdef CONFIG_IGB_DCA
> >
> > It might make sense to just cast work_done as an unsigned int, and
> > then at the end of igb_poll use:
> >   return min_t(unsigned int, work_done, budget - 1);
>
> Sure, that's simplest. I wasn't sure something is supposed to prevent
> this condition or if it's okay to cover it up.

I'm pretty sure it is okay to cover it up. In this case the "budget -
1" is supposed to be the upper limit on what can be reported, and I
think it was assuming an unsigned value anyway.

An alternative would be to default clean_complete to !!budget; then if
budget is 0, clean_complete would always be false.
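
As a sketch, the two options boil down to fragments like these in
igb_poll() (the rest of the function stays as is):

  /* option 1: make work_done unsigned and clamp the return value; with
   * budget == 0 from netpoll, budget - 1 wraps to UINT_MAX, so min_t()
   * still returns 0 as expected
   */
  unsigned int work_done = 0;

  /* ... rx/tx cleanup unchanged ... */

  return min_t(unsigned int, work_done, budget - 1);

  /* option 2: derive the default from the budget, so a zero-budget poll
   * can never claim completion
   */
  bool clean_complete = !!budget;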



Re: [PATCH 4/4] mm/page_reporting: Fix possible user allocation failure

2021-04-02 Thread Alexander Duyck
On Fri, Mar 26, 2021 at 2:45 AM Xunlei Pang  wrote:
>
> We encountered user memory allocation failures (OOM) on our
> 512MiB tiny instances; they didn't happen after turning off
> page reporting.
>
> After some debugging, it turns out 32*4MB=128MB (order-10)
> of free pages were isolated during the reporting window,
> resulting in no free memory being available.
>
> Actually this might also happen on large instances when
> there is little free memory.
>
> This patch introduces a rule to limit the reporting capacity
> according to the current free memory, reducing it accordingly
> for higher orders that could break this rule.
>
> For example,
>  100MiB free, sgl capacity for different orders are:
>order-9 : 32
>order-10: 16
>
> Reported-by: Helin Guo 
> Tested-by: Helin Guo 
> Signed-off-by: Xunlei Pang 

I'm curious how much of this would be solved by just halving the
capacity each time we increase the order? So specifically, if we did
something such as:
  capacity = (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER) >> order;

We just have to make sure the capacity is greater than zero before
entering the processing loop.

An alternative that occurred to me while reviewing this is to look at
just adding a reserve. That would be something like:
  reserve = PAGE_REPORTING_CAPACITY - capacity;

Basically the reserve would take up some space at the start of the
list so that you wouldn't need to actually change the capacity
directly. It would just be a matter of making certain we deducted it
and updated the offsets of the scatterlist as necessary.


> ---
>  mm/page_reporting.c | 89 
> +++--
>  1 file changed, 72 insertions(+), 17 deletions(-)
>
> diff --git a/mm/page_reporting.c b/mm/page_reporting.c
> index 6ffedb8..2ec0ec0 100644
> --- a/mm/page_reporting.c
> +++ b/mm/page_reporting.c
> @@ -129,8 +129,8 @@ void __page_reporting_notify(void)
>   */
>  static int
>  page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone 
> *zone,
> -unsigned int order, unsigned int mt,
> -struct scatterlist *sgl, unsigned int *offset)
> +unsigned int order, unsigned int mt, struct scatterlist 
> *sgl,
> +const unsigned int capacity, unsigned int *offset)
>  {
> struct free_area *area = &zone->free_area[order];
> struct list_head *list = &area->free_list[mt];
> @@ -161,10 +161,10 @@ void __page_reporting_notify(void)
>  * list processed. This should result in us reporting all pages on
>  * an idle system in about 30 seconds.
>  *
> -* The division here should be cheap since PAGE_REPORTING_CAPACITY
> -* should always be a power of 2.
> +* The division here should be cheap since capacity should
> +* always be a power of 2.
>  */
> -   budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16);
> +   budget = DIV_ROUND_UP(area->nr_free, capacity * 16);

So the comment here is no longer valid now that capacity is a
variable. An alternative, if we assume the shift approach I mentioned
above, would be to scale the budget based on the reduced capacity.
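
As a rough sketch (illustrative only) of how that could look inside
page_reporting_cycle(), deriving the capacity locally instead of
passing it in:

  unsigned int capacity;

  /* halve the usable sg capacity for each order above the minimum */
  capacity = (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER) >> order;
  if (!capacity)
          return err;

  /* capacity is still a power of 2, so the division stays cheap */
  budget = DIV_ROUND_UP(area->nr_free, capacity * 16);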

> /* loop through free list adding unreported pages to sg list */
> list_for_each_entry_safe(page, next, list, lru) {
> @@ -196,7 +196,7 @@ void __page_reporting_notify(void)
> --(*offset);
> sg_set_page(&sgl[*offset], page, page_len, 0);
>
> -   nr_pages = (PAGE_REPORTING_CAPACITY - *offset) << 
> order;
> +   nr_pages = (capacity - *offset) << order;
> if (zone->reported_pages + nr_pages >= threshold) {
> err = 1;
> break;

Rather than adding a capacity value it might work better to add a
"reserve" value so that we are just padding the start of the
scatterlist rather than having to reset it every time we change the
total capacity of the scatterlist. The advantage to that is that you
could drop all the changes where you are having to reset the list and
change the capacity.

Instead you would just need to update the check to "*offset <=
reserve" and the call to report/drain so that they take into account
the reserve offset.

> @@ -217,10 +217,10 @@ void __page_reporting_notify(void)
> spin_unlock_irq(&zone->lock);
>
> /* begin processing pages in local list */
> -   err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);
> +   err = prdev->report(prdev, sgl, capacity);
>

Assuming the change to "reserve", this would then be "&sgl[*offset],
PAGE_REPORTING_CAPACITY - *offset", or you could look at copying the
approach taken in the "leftover" path in page_reporting_process_zone.

> /* reset offset since the full list was reported */
> -   *offset = PAGE_REPORTING_CAPACITY;
> + 

Re: [PATCH 2/4] mm/page_reporting: Introduce free page reporting factor

2021-04-02 Thread Alexander Duyck
On Fri, Mar 26, 2021 at 2:45 AM Xunlei Pang  wrote:
>
> Add new "/sys/kernel/mm/page_reporting/reporting_factor"
> within [0, 100], and stop page reporting when it reaches
> the configured threshold. Default is 100 which means no
> limitation is imposed. Percentile is adopted to reflect
> the fact that it reports on the per-zone basis.
>
> We can control the total number of reported pages via
> this knob to avoid EPT violations which may affect the
> performance of the business workload; imagine cases where
> a guest memory allocation burst or host long-tail memory
> reclaim really hurts.

I'm not a fan of the concept as I don't think it really does what it
was meant to do. The way page reporting was meant to work is that when
we have enough free pages we will cycle through memory a few pages at
a time reporting what is unused to the hypervisor. It was meant to be
a scan more than something that just would stop once it touched a
certain part of the memory.

If you are wanting to truly reserve some amount of memory so that it
is always left held by the guest then it might make more sense to make
the value a fixed amount of memory rather than trying to do it as a
percentage.

Also we may need to look at adding some sort of
linearization/defragmentation logic for the reported pages. One issue
is that there are several things that will add pages to the end of the
free page lists. One of the reasons why I was processing the entire
list when I was processing reported pages was because the page freeing
functions will normally cause pages to be interleaved with the
reported pages on the end of the list. So if you are wanting to
reserve some pages as being non-reported, we may need to add something
to sort the lists periodically.

> This knob can help make customized control policies according
> to VM priority, it is also useful for testing, gray-release, etc.

As far as the knob itself goes, it would make sense to combine this
with patch 3, since they are just different versions of the same
control.

> ---
>  mm/page_reporting.c | 60 
> -
>  1 file changed, 59 insertions(+), 1 deletion(-)
>
> diff --git a/mm/page_reporting.c b/mm/page_reporting.c
> index ba195ea..86c6479 100644
> --- a/mm/page_reporting.c
> +++ b/mm/page_reporting.c
> @@ -11,6 +11,8 @@
>  #include "page_reporting.h"
>  #include "internal.h"
>
> +static int reporting_factor = 100;
> +
>  #define PAGE_REPORTING_DELAY   (2 * HZ)
>  static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
>
> @@ -134,6 +136,7 @@ void __page_reporting_notify(void)
> struct list_head *list = &area->free_list[mt];
> unsigned int page_len = PAGE_SIZE << order;
> struct page *page, *next;
> +   unsigned long threshold;
> long budget;
> int err = 0;
>
> @@ -144,6 +147,7 @@ void __page_reporting_notify(void)
> if (list_empty(list))
> return err;
>
> +   threshold = atomic_long_read(&zone->managed_pages) * reporting_factor / 100;

So at 0 you are setting this threshold to 0; however, based on the
code below you are still pulling at least one page.

> spin_lock_irq(&zone->lock);
>
> /*
> @@ -181,6 +185,8 @@ void __page_reporting_notify(void)
>
> /* Attempt to pull page from list and place in scatterlist */
> if (*offset) {
> +   unsigned long nr_pages;
> +
> if (!__isolate_free_page(page, order)) {
> next = page;
> break;
> @@ -190,6 +196,12 @@ void __page_reporting_notify(void)
> --(*offset);
> sg_set_page(&sgl[*offset], page, page_len, 0);
>
> +   nr_pages = (PAGE_REPORTING_CAPACITY - *offset) << 
> order;
> +   if (zone->reported_pages + nr_pages >= threshold) {
> +   err = 1;
> +   break;
> +   }
> +

So here we are checking the threshold after we have already pulled the
page. Given that, it might make more sense to either allow the full
capacity of pages to be pulled and then check this after they have
been reported, or to move this check up to before you start processing
the pages. What you want to avoid is having to perform this check for
every individual page.
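
For example, something along these lines (sketch only, using the names
from the quoted patch):

  /* work out once how many pages may still be reported before the
   * threshold is hit, instead of re-testing it for every page
   */
  unsigned long headroom;
  unsigned int max_entries;

  if (zone->reported_pages >= threshold)
          return err;

  headroom = threshold - zone->reported_pages;
  max_entries = min_t(unsigned long, PAGE_REPORTING_CAPACITY,
                      headroom >> order);

The loop would then simply stop once it has filled max_entries
scatterlist entries.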

> continue;
> }
>
> @@ -244,9 +256,13 @@ void __page_reporting_notify(void)
> struct scatterlist *sgl, struct zone *zone)
>  {
> unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
> -   unsigned long watermark;
> +   unsigned long watermark, threshold;
> int err = 0;
>
> +   threshold = atomic_long_read(&zone->managed_pages) * reporting_factor / 100;
> +   if (zone->reported_pages >= threshold)
> +   return err;
> +

Rather than having to calculate the 

Re: [PATCH 1/4] mm/page_reporting: Introduce free page reported counters

2021-04-02 Thread Alexander Duyck
On Fri, Mar 26, 2021 at 2:45 AM Xunlei Pang  wrote:
>
> It's useful to know how much memory has actually been reported,
> so add a new zone::reported_pages to record that.
>
> Add "/sys/kernel/mm/page_reporting/reported_kbytes" for the
> actual amount of memory that has been reported.
>
> Add "/sys/kernel/mm/page_reporting/refault_kbytes" for the
> accumulated memory that has refaulted in after being reported out.
>
> Signed-off-by: Xunlei Pang 
> ---
>  include/linux/mmzone.h |   3 ++
>  mm/page_alloc.c|   4 +-
>  mm/page_reporting.c| 112 
> +++--
>  mm/page_reporting.h|   5 +++
>  4 files changed, 119 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 47946ce..ebd169f 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -530,6 +530,9 @@ struct zone {
> atomic_long_t   managed_pages;
> unsigned long   spanned_pages;
> unsigned long   present_pages;
> +#ifdef CONFIG_PAGE_REPORTING
> +   unsigned long   reported_pages;
> +#endif
>  #ifdef CONFIG_CMA
> unsigned long   cma_pages;
>  #endif
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 3e4b29ee..c2c5688 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -930,8 +930,10 @@ static inline void del_page_from_free_list(struct page 
> *page, struct zone *zone,
>unsigned int order)
>  {
> /* clear reported state and update reported page count */
> -   if (page_reported(page))
> +   if (page_reported(page)) {
> __ClearPageReported(page);
> +   page_reporting_update_refault(zone, 1 << order);
> +   }
>
> list_del(&page->lru);
> __ClearPageBuddy(page);
> diff --git a/mm/page_reporting.c b/mm/page_reporting.c
> index c50d93f..ba195ea 100644
> --- a/mm/page_reporting.c
> +++ b/mm/page_reporting.c
> @@ -1,4 +1,5 @@
>  // SPDX-License-Identifier: GPL-2.0
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -19,6 +20,22 @@ enum {
> PAGE_REPORTING_ACTIVE
>  };
>
> +#ifdef CONFIG_SYSFS
> +static struct percpu_counter refault_pages;
> +
> +void page_reporting_update_refault(struct zone *zone, unsigned int pages)
> +{
> +   zone->reported_pages -= pages;
> +   percpu_counter_add_batch(&refault_pages, pages, INT_MAX / 2);
> +}
> +#else
> +void page_reporting_update_refault(struct zone *zone, unsigned int pages)
> +{
> +   zone->reported_pages -= pages;
> +}
> +#endif
> +
> +

I don't see the value added by the refault_pages counter.
Essentially all it will tell you is how many reported pages were
allocated. If you are really wanting to track a value such as this it
might make more sense to just track the total number of reported pages
over the lifetime of the system. At least with that you would once
again be able to take advantage of batching so it isn't occurring as
often.
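
For reference, a minimal sketch of that alternative (the counter and
helper names here are hypothetical), with the accounting done once per
drained batch so it stays off the allocation hot path:

  /* hypothetical lifetime counter, bumped from page_reporting_drain()
   * with the total for the whole scatterlist batch
   */
  static atomic_long_t total_reported_pages;

  static void page_reporting_account(struct zone *zone, unsigned long nr_pages)
  {
          zone->reported_pages += nr_pages;
          atomic_long_add(nr_pages, &total_reported_pages);
  }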

>  /* request page reporting */
>  static void
>  __page_reporting_request(struct page_reporting_dev_info *prdev)
> @@ -66,7 +83,8 @@ void __page_reporting_notify(void)
>
>  static void
>  page_reporting_drain(struct page_reporting_dev_info *prdev,
> -struct scatterlist *sgl, unsigned int nents, bool 
> reported)
> +struct scatterlist *sgl, struct zone *zone,
> +unsigned int nents, bool reported)
>  {
> struct scatterlist *sg = sgl;
>
> @@ -92,8 +110,10 @@ void __page_reporting_notify(void)
>  * report on the new larger page when we make our way
>  * up to that higher order.
>  */
> -   if (PageBuddy(page) && buddy_order(page) == order)
> +   if (PageBuddy(page) && buddy_order(page) == order) {
> __SetPageReported(page);
> +   zone->reported_pages += (1 << order);
> +   }

The parentheses around "1 << order" are redundant.

> } while ((sg = sg_next(sg)));
>
> /* reinitialize scatterlist now that it is empty */
> @@ -197,7 +217,7 @@ void __page_reporting_notify(void)
> spin_lock_irq(&zone->lock);
>
> /* flush reported pages from the sg list */
> -   page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, 
> !err);
> +   page_reporting_drain(prdev, sgl, zone, 
> PAGE_REPORTING_CAPACITY, !err);
>
> /*
>  * Reset next to first entry, the old next isn't valid
> @@ -260,7 +280,7 @@ void __page_reporting_notify(void)
>
> /* flush any remaining pages out from the last report */
> spin_lock_irq(&zone->lock);
> -   page_reporting_drain(prdev, sgl, leftover, !err);
> +   page_reporting_drain(prdev, sgl, zone, leftover, !err);
> spin_unlock_irq(&zone->lock);
> }
>
> @@ -362,3 +382,87 @@ void page_reporting_unregister(struct 
> 

Re: [PATCH 0/4] mm/page_reporting: Some knobs and fixes

2021-04-02 Thread Alexander Duyck
On Thu, Apr 1, 2021 at 9:09 PM Xunlei Pang  wrote:
>
> On 3/26/21 5:44 PM, Xunlei Pang wrote:
> > Add the following knobs in PATCH 1~3:
> >  /sys/kernel/mm/page_reporting/reported_kbytes
> >  /sys/kernel/mm/page_reporting/refault_kbytes
> >  /sys/kernel/mm/page_reporting/reporting_factor
> >
> > Fix unexpected user OOM in PATCH 4.
> >
> > Xunlei Pang (4):
> >   mm/page_reporting: Introduce free page reported counters
> >   mm/page_reporting: Introduce free page reporting factor
> >   mm/page_reporting: Introduce "page_reporting_factor=" boot parameter
> >   mm/page_reporting: Fix possible user allocation failure
> >
> >  Documentation/admin-guide/kernel-parameters.txt |   3 +
> >  include/linux/mmzone.h  |   3 +
> >  mm/page_alloc.c |   6 +-
> >  mm/page_reporting.c | 268 
> > ++--
> >  4 files changed, 260 insertions(+), 20 deletions(-)
> >
>
> Hi guys,
>
> Looks "Alexander Duyck " was not
> available, so Cced more, any comment?
>
> Thanks!

Yes, my Intel account has been offline since October. If you need to
reach me, my gmail is the best way to go.

As far as the patch series itself goes, I am not exactly thrilled with
it. There seem to be a number of spots where things are being changed
such that the CPU overhead will be much more significant.

The cover page should actually say what the patch set is attempting to
accomplish. In the patch descriptions you have told us what you are
doing, but the why isn't completely clear. For example, I am not sure
whether the issue addressed in patch 4 was present before patches 1-3
were introduced.


Re: [net] 5478fcd0f4: BUG:sleeping_function_called_from_invalid_context_at_include/linux/sched/mm.h

2021-03-22 Thread Alexander Duyck
On Mon, Mar 22, 2021 at 2:26 AM Antoine Tenart  wrote:
>
> Quoting Matthew Wilcox (2021-03-22 10:05:36)
> > On Mon, Mar 22, 2021 at 09:55:50AM +0100, Antoine Tenart wrote:
> > > I only had a quick look at this, but I think the issue should be fixed
> > > with:
> > >
> > > diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> > > index e16d54aabd4c..3ae3c20eb64c 100644
> > > --- a/net/core/net-sysfs.c
> > > +++ b/net/core/net-sysfs.c
> > > @@ -1378,7 +1378,7 @@ static ssize_t xps_queue_show(struct net_device 
> > > *dev, unsigned int index,
> > > nr_ids = dev_maps ? dev_maps->nr_ids :
> > >  (type == XPS_CPUS ? nr_cpu_ids : dev->num_rx_queues);
> > >
> > > -   mask = bitmap_zalloc(nr_ids, GFP_KERNEL);
> > > +   mask = bitmap_zalloc(nr_ids, GFP_ATOMIC);
> > > if (!mask) {
> > > rcu_read_unlock();
> > > return -ENOMEM;
> >
> > sysfs isn't a good reason to use GFP_ATOMIC.
> >
> > try something like this:
> >
> > -   mask = bitmap_zalloc(nr_ids, GFP_KERNEL);
> > +   mask = bitmap_zalloc(nr_ids, GFP_NOWAIT);
> > if (!mask) {
> > +   int new_nr_ids;
> > +
> > rcu_read_unlock();
> > -   return -ENOMEM;
> > +   mask = bitmap_zalloc(nr_ids, GFP_KERNEL);
> > +   if (!mask)
> > +   return -ENOMEM;
> > +   rcu_read_lock();
> > +   dev_maps = rcu_dereference(dev->xps_maps[type]);
> > +   /* if nr_ids shrank while we slept, do not overrun array.
> > +* if it increased, we just won't show the new ones
> > +*/
> > +   new_nr_ids = dev_maps ? dev_maps->nr_ids :
> > +   (type == XPS_CPUS ? nr_cpu_ids : 
> > dev->num_rx_queues);
> > +   if (new_nr_ids < nr_ids)
> > +   nr_ids = new_nr_ids;
>
> Thanks for the suggestion, I'll look into that. We could also just
> return -ENOMEM if the first allocation fails; retrying adds a lot of
> complexity.
>
> Antoine

I agree that the retry logic is probably unneeded. In addition, we
probably don't need GFP_ATOMIC; GFP_NOWAIT should be good enough,
since the allocation is allowed to fail and just return -ENOMEM in
the case of low memory.
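
In other words, just a sketch along the lines of:

  /* inside the RCU read-side section, so no sleeping allocation;
   * let it fail and report -ENOMEM instead
   */
  mask = bitmap_zalloc(nr_ids, GFP_NOWAIT);
  if (!mask) {
          rcu_read_unlock();
          return -ENOMEM;
  }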

Thanks.

- Alex


Re: [PATCH net-next v4 0/6] net: qualcomm: rmnet: stop using C bit-fields

2021-03-15 Thread Alexander Duyck
On Mon, Mar 15, 2021 at 6:36 AM Alex Elder  wrote:
>
> The main reason for version 4 of this series is that a bug was
> introduced in version 3, and that is fixed.
>
> But a nice note from Vladimir Oltean got me thinking about the
> necessity of using accessors defined in <linux/bitfield.h>, and I
> concluded there was no need.  So this version simplifies things
> further, using bitwise AND and OR operators (rather than, e.g.,
> u8_get_bits()) to access all values encoded in bit fields.
>
> This version has been tested using IPv4 with checksum offload
> enabled and disabled.  Traffic over the link included ICMP (ping),
> UDP (iperf), and TCP (wget).
>
> Version 3 of this series used BIT() rather than GENMASK() to define
> single-bit masks, and bitwise AND operators to access them.
>
> Version 2 fixed bugs in the way the value written into the header
> was computed in version 1.
>
> The series was first posted here:
>   https://lore.kernel.org/netdev/20210304223431.15045-1-el...@linaro.org/
>
> -Alex
>
> Alex Elder (6):
>   net: qualcomm: rmnet: mark trailer field endianness
>   net: qualcomm: rmnet: simplify some byte order logic
>   net: qualcomm: rmnet: kill RMNET_MAP_GET_*() accessor macros
>   net: qualcomm: rmnet: use masks instead of C bit-fields
>   net: qualcomm: rmnet: don't use C bit-fields in rmnet checksum trailer
>   net: qualcomm: rmnet: don't use C bit-fields in rmnet checksum header
>
>  .../ethernet/qualcomm/rmnet/rmnet_handlers.c  | 10 +--
>  .../net/ethernet/qualcomm/rmnet/rmnet_map.h   | 12 
>  .../qualcomm/rmnet/rmnet_map_command.c| 11 +++-
>  .../ethernet/qualcomm/rmnet/rmnet_map_data.c  | 60 -
>  include/linux/if_rmnet.h  | 65 +--
>  5 files changed, 69 insertions(+), 89 deletions(-)
>

Other than the minor nit I pointed out in patch 2 the set looks good to me.

Reviewed-by: Alexander Duyck 


Re: [PATCH net-next v4 2/6] net: qualcomm: rmnet: simplify some byte order logic

2021-03-15 Thread Alexander Duyck
On Mon, Mar 15, 2021 at 6:36 AM Alex Elder  wrote:
>
> In rmnet_map_ipv4_ul_csum_header() and rmnet_map_ipv6_ul_csum_header()
> the offset within a packet at which checksumming should commence is
> calculated.  This calculation involves byte swapping and a forced type
> conversion that makes it hard to understand.
>
> Simplify this by computing the offset in host byte order, then
> converting the result when assigning it into the header field.
>
> Signed-off-by: Alex Elder 
> Reviewed-by: Bjorn Andersson 
> ---
>  .../ethernet/qualcomm/rmnet/rmnet_map_data.c  | 22 ++-
>  1 file changed, 12 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c 
> b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
> index 21d38167f9618..bd1aa11c9ce59 100644
> --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
> +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
> @@ -197,12 +197,13 @@ rmnet_map_ipv4_ul_csum_header(void *iphdr,
>   struct rmnet_map_ul_csum_header *ul_header,
>   struct sk_buff *skb)
>  {
> -   struct iphdr *ip4h = (struct iphdr *)iphdr;
> -   __be16 *hdr = (__be16 *)ul_header, offset;
> +   __be16 *hdr = (__be16 *)ul_header;
> +   struct iphdr *ip4h = iphdr;
> +   u16 offset;
> +
> +   offset = skb_transport_header(skb) - (unsigned char *)iphdr;
> +   ul_header->csum_start_offset = htons(offset);

Rather than using skb_transport_header the correct pointer to use is
probably skb_checksum_start. The two are essentially synonymous but
the checksumming code is supposed to use skb_checksum_start.

Alternatively you could look at possibly using skb_network_header_len
as that would be the same value assuming that both headers are the
outer headers. Then you could avoid the extra pointer overhead.
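
For illustration, either of these would do, with the htons()
conversion and the rest of the function kept as in the quoted patch:

  /* checksum offload code is expected to go through skb_checksum_start() */
  offset = skb_checksum_start(skb) - (unsigned char *)iphdr;

  /* or, assuming both headers are the outer headers, skip the pointer
   * math entirely
   */
  offset = skb_network_header_len(skb);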

>
> -   offset = htons((__force u16)(skb_transport_header(skb) -
> -(unsigned char *)iphdr));
> -   ul_header->csum_start_offset = offset;
> ul_header->csum_insert_offset = skb->csum_offset;
> ul_header->csum_enabled = 1;
> if (ip4h->protocol == IPPROTO_UDP)
> @@ -239,12 +240,13 @@ rmnet_map_ipv6_ul_csum_header(void *ip6hdr,
>   struct rmnet_map_ul_csum_header *ul_header,
>   struct sk_buff *skb)
>  {
> -   struct ipv6hdr *ip6h = (struct ipv6hdr *)ip6hdr;
> -   __be16 *hdr = (__be16 *)ul_header, offset;
> +   __be16 *hdr = (__be16 *)ul_header;
> +   struct ipv6hdr *ip6h = ip6hdr;
> +   u16 offset;
> +
> +   offset = skb_transport_header(skb) - (unsigned char *)ip6hdr;
> +   ul_header->csum_start_offset = htons(offset);

Same here.

>
> -   offset = htons((__force u16)(skb_transport_header(skb) -
> -(unsigned char *)ip6hdr));
> -   ul_header->csum_start_offset = offset;
> ul_header->csum_insert_offset = skb->csum_offset;
> ul_header->csum_enabled = 1;
>
> --
> 2.27.0
>


Re: [PATCH net-next] net: ipa: make ipa_table_hash_support() inline

2021-03-15 Thread Alexander Duyck
On Mon, Mar 15, 2021 at 8:01 AM Alex Elder  wrote:
>
> In review, Alexander Duyck suggested that ipa_table_hash_support()
> was trivial enough that it could be implemented as a static inline
> function in the header file.  But the patch had already been
> accepted.  Implement his suggestion.
>
> Signed-off-by: Alex Elder 

Looks good to me.

Reviewed-by: Alexander Duyck 

> ---
>  drivers/net/ipa/ipa_table.c | 5 -
>  drivers/net/ipa/ipa_table.h | 5 -
>  2 files changed, 4 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/net/ipa/ipa_table.c b/drivers/net/ipa/ipa_table.c
> index baaab3dd0e63c..7450e27068f19 100644
> --- a/drivers/net/ipa/ipa_table.c
> +++ b/drivers/net/ipa/ipa_table.c
> @@ -239,11 +239,6 @@ static void ipa_table_validate_build(void)
>
>  #endif /* !IPA_VALIDATE */
>
> -bool ipa_table_hash_support(struct ipa *ipa)
> -{
> -   return ipa->version != IPA_VERSION_4_2;
> -}
> -
>  /* Zero entry count means no table, so just return a 0 address */
>  static dma_addr_t ipa_table_addr(struct ipa *ipa, bool filter_mask, u16 
> count)
>  {
> diff --git a/drivers/net/ipa/ipa_table.h b/drivers/net/ipa/ipa_table.h
> index 1a68d20f19d6a..889c2e93b1223 100644
> --- a/drivers/net/ipa/ipa_table.h
> +++ b/drivers/net/ipa/ipa_table.h
> @@ -55,7 +55,10 @@ static inline bool ipa_filter_map_valid(struct ipa *ipa, 
> u32 filter_mask)
>   * ipa_table_hash_support() - Return true if hashed tables are supported
>   * @ipa:   IPA pointer
>   */
> -bool ipa_table_hash_support(struct ipa *ipa);
> +static inline bool ipa_table_hash_support(struct ipa *ipa)
> +{
> +   return ipa->version != IPA_VERSION_4_2;
> +}
>
>  /**
>   * ipa_table_reset() - Reset filter and route tables entries to "none"
> --
> 2.27.0
>


Re: [PATCH] SUNRPC: Refresh rq_pages using a bulk page allocator

2021-03-12 Thread Alexander Duyck
On Fri, Mar 12, 2021 at 1:57 PM Chuck Lever  wrote:
>
> Reduce the rate at which nfsd threads hammer on the page allocator.
> This improves throughput scalability by enabling the threads to run
> more independently of each other.
>
> Signed-off-by: Chuck Lever 
> ---
> Hi Mel-
>
> This patch replaces patch 5/7 in v4 of your alloc_pages_bulk()
> series. It implements code clean-ups suggested by Alexander Duyck.
> It builds and has seen some light testing.
>
>
>  net/sunrpc/svc_xprt.c |   39 +++
>  1 file changed, 27 insertions(+), 12 deletions(-)

The updated patch looks good to me. I am good with having my
Reviewed-by added for patches 1-6. I think the only one that still
needs work is patch 7.

Reviewed-by: Alexander Duyck 


Re: [PATCH 7/7] net: page_pool: use alloc_pages_bulk in refill code path

2021-03-12 Thread Alexander Duyck
On Fri, Mar 12, 2021 at 7:43 AM Mel Gorman  wrote:
>
> From: Jesper Dangaard Brouer 
>
> There are cases where the page_pool need to refill with pages from the
> page allocator. Some workloads cause the page_pool to release pages
> instead of recycling these pages.
>
> For these workload it can improve performance to bulk alloc pages from
> the page-allocator to refill the alloc cache.
>
> For XDP-redirect workload with 100G mlx5 driver (that use page_pool)
> redirecting xdp_frame packets into a veth, that does XDP_PASS to create
> an SKB from the xdp_frame, which then cannot return the page to the
> page_pool. In this case, we saw[1] an improvement of 18.8% from using
> the alloc_pages_bulk API (3,677,958 pps -> 4,368,926 pps).
>
> [1] 
> https://github.com/xdp-project/xdp-project/blob/master/areas/mem/page_pool06_alloc_pages_bulk.org
>
> Signed-off-by: Jesper Dangaard Brouer 
> Signed-off-by: Mel Gorman 
> Reviewed-by: Ilias Apalodimas 
> ---
>  net/core/page_pool.c | 62 
>  1 file changed, 39 insertions(+), 23 deletions(-)
>
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 40e1b2beaa6c..a5889f1b86aa 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -208,44 +208,60 @@ noinline
>  static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
>  gfp_t _gfp)
>  {
> +   const int bulk = PP_ALLOC_CACHE_REFILL;
> +   struct page *page, *next, *first_page;
> unsigned int pp_flags = pool->p.flags;
> -   struct page *page;
> +   unsigned int pp_order = pool->p.order;
> +   int pp_nid = pool->p.nid;
> +   LIST_HEAD(page_list);
> gfp_t gfp = _gfp;
>
> -   /* We could always set __GFP_COMP, and avoid this branch, as
> -* prep_new_page() can handle order-0 with __GFP_COMP.
> -*/
> -   if (pool->p.order)
> +   /* Don't support bulk alloc for high-order pages */
> +   if (unlikely(pp_order)) {
> gfp |= __GFP_COMP;
> +   first_page = alloc_pages_node(pp_nid, gfp, pp_order);
> +   if (unlikely(!first_page))
> +   return NULL;
> +   goto out;
> +   }
>
> -   /* FUTURE development:
> -*
> -* Current slow-path essentially falls back to single page
> -* allocations, which doesn't improve performance.  This code
> -* need bulk allocation support from the page allocator code.
> -*/
> -
> -   /* Cache was empty, do real allocation */
> -#ifdef CONFIG_NUMA
> -   page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
> -#else
> -   page = alloc_pages(gfp, pool->p.order);
> -#endif
> -   if (!page)
> +   if (unlikely(!__alloc_pages_bulk(gfp, pp_nid, NULL, bulk, &page_list)))
> return NULL;
>
> +   /* First page is extracted and returned to caller */
> +   first_page = list_first_entry(&page_list, struct page, lru);
> +   list_del(&first_page->lru);
> +

This seems kind of broken to me. If you pull the first page and then
cannot map it you end up returning NULL even if you placed a number of
pages in the cache.

It might make more sense to have the loop below record a pointer to
the last page you processed and handle things in two stages so that on
the first iteration you map one page.

So something along the lines of:
1. Initialize last_page to NULL

for each page in the list
  2. Map page
  3. If last_page is non-NULL, move to cache
  4. Assign page to last_page
  5. Return to step 2 for each page in list

6. return last_page
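
Roughly, as a sketch using the names from the patch (the hold-count
and cache-capacity bookkeeping is left out for brevity):

  struct page *last_page = NULL;

  list_for_each_entry_safe(page, next, &page_list, lru) {
          list_del(&page->lru);
          if ((pp_flags & PP_FLAG_DMA_MAP) &&
              unlikely(!page_pool_dma_map(pool, page))) {
                  put_page(page);
                  continue;
          }

          /* the previously mapped page goes into the cache; the current
           * one is held back so a mapped page is always returned if any
           * mapping succeeded
           */
          if (last_page)
                  pool->alloc.cache[pool->alloc.count++] = last_page;
          last_page = page;
  }

  return last_page;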

> +   /* Remaining pages store in alloc.cache */
> +   list_for_each_entry_safe(page, next, &page_list, lru) {
> +   list_del(&page->lru);
> +   if ((pp_flags & PP_FLAG_DMA_MAP) &&
> +   unlikely(!page_pool_dma_map(pool, page))) {
> +   put_page(page);
> +   continue;
> +   }

So if you added a last_page pointer what you could do is check for it
here and assign it to the alloc cache. If last_page is not set the
block would be skipped.

> +   if (likely(pool->alloc.count < PP_ALLOC_CACHE_SIZE)) {
> +   pool->alloc.cache[pool->alloc.count++] = page;
> +   pool->pages_state_hold_cnt++;
> +   trace_page_pool_state_hold(pool, page,
> +  
> pool->pages_state_hold_cnt);
> +   } else {
> +   put_page(page);

If you are just calling put_page here aren't you leaking DMA mappings?
Wouldn't you need to potentially unmap the page before you call
put_page on it?
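
i.e. something along these lines (sketch only; it just mirrors the
unmap that page_pool does when releasing a mapped page):

  } else {
          /* don't leak the DMA mapping set up above before dropping
           * the page
           */
          if (pp_flags & PP_FLAG_DMA_MAP)
                  dma_unmap_page_attrs(pool->p.dev,
                                       page_pool_get_dma_addr(page),
                                       PAGE_SIZE << pp_order,
                                       pool->p.dma_dir,
                                       DMA_ATTR_SKIP_CPU_SYNC);
          put_page(page);
  }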

> +   }
> +   }
> +out:
> if ((pp_flags & PP_FLAG_DMA_MAP) &&
> -   unlikely(!page_pool_dma_map(pool, page))) {
> -   put_page(page);
> +   unlikely(!page_pool_dma_map(pool, first_page))) {
> +   

Re: [PATCH 5/7] SUNRPC: Refresh rq_pages using a bulk page allocator

2021-03-12 Thread Alexander Duyck
On Fri, Mar 12, 2021 at 7:43 AM Mel Gorman  wrote:
>
> From: Chuck Lever 
>
> Reduce the rate at which nfsd threads hammer on the page allocator.
> This improves throughput scalability by enabling the threads to run
> more independently of each other.
>
> Signed-off-by: Chuck Lever 
> Signed-off-by: Mel Gorman 
> ---
>  net/sunrpc/svc_xprt.c | 43 +++
>  1 file changed, 31 insertions(+), 12 deletions(-)
>
> diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
> index cfa7e4776d0e..38a8d6283801 100644
> --- a/net/sunrpc/svc_xprt.c
> +++ b/net/sunrpc/svc_xprt.c
> @@ -642,11 +642,12 @@ static void svc_check_conn_limits(struct svc_serv *serv)
>  static int svc_alloc_arg(struct svc_rqst *rqstp)
>  {
> struct svc_serv *serv = rqstp->rq_server;
> +   unsigned long needed;
> struct xdr_buf *arg;
> +   struct page *page;
> int pages;
> int i;
>
> -   /* now allocate needed pages.  If we get a failure, sleep briefly */
> pages = (serv->sv_max_mesg + 2 * PAGE_SIZE) >> PAGE_SHIFT;
> if (pages > RPCSVC_MAXPAGES) {
> pr_warn_once("svc: warning: pages=%u > RPCSVC_MAXPAGES=%lu\n",
> @@ -654,19 +655,28 @@ static int svc_alloc_arg(struct svc_rqst *rqstp)
> /* use as many pages as possible */
> pages = RPCSVC_MAXPAGES;
> }
> -   for (i = 0; i < pages ; i++)
> -   while (rqstp->rq_pages[i] == NULL) {
> -   struct page *p = alloc_page(GFP_KERNEL);
> -   if (!p) {
> -   set_current_state(TASK_INTERRUPTIBLE);
> -   if (signalled() || kthread_should_stop()) {
> -   set_current_state(TASK_RUNNING);
> -   return -EINTR;
> -   }
> -   schedule_timeout(msecs_to_jiffies(500));
> +

> +   for (needed = 0, i = 0; i < pages ; i++)
> +   if (!rqstp->rq_pages[i])
> +   needed++;

I would use opening and closing braces for the for loop since
technically the if is a multiline statement. It will make this more
readable.

> +   if (needed) {
> +   LIST_HEAD(list);
> +
> +retry:

Rather than kind of open coding a while loop, why not just make this
"while (needed)"? Then all you have to do is break out of the for loop
and you will automatically come back here, instead of having to jump
to two different labels.

> +   alloc_pages_bulk(GFP_KERNEL, needed, &list);

Rather than ignoring the return value, would it make sense here to
subtract it from needed? Then you would know if any of the allocation
requests weren't fulfilled.

> +   for (i = 0; i < pages; i++) {

It is probably optimizing for the exception case, but I don't think
you want the "i = 0" here. If you are having to stop because the list
is empty it probably makes sense to resume where you left off. So you
should probably be initializing i to 0 before we check for needed.

> +   if (!rqstp->rq_pages[i]) {

It might be cleaner here to just do a "continue" if rq_pages[i] is populated.

> +   page = list_first_entry_or_null(&list,
> +   struct page,
> +   lru);
> +   if (unlikely(!page))
> +   goto empty_list;

I think I preferred the original code that wasn't jumping away from
the loop here. With the change I suggested above, switching the
if (needed) to while (needed), you could have it just break out of the
for loop to place itself back in the while loop.
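
Putting those pieces together, the refill logic might end up looking
something like this (sketch only, reusing the names from the quoted
patch and, like the original, not inspecting the bulk allocator's
return value):

  for (needed = 0, i = 0; i < pages; i++) {
          if (!rqstp->rq_pages[i])
                  needed++;
  }

  i = 0;
  while (needed) {
          LIST_HEAD(list);

          alloc_pages_bulk(GFP_KERNEL, needed, &list);
          for (; i < pages; i++) {        /* resumes where it left off */
                  if (rqstp->rq_pages[i])
                          continue;
                  page = list_first_entry_or_null(&list, struct page, lru);
                  if (unlikely(!page))
                          break;          /* list ran dry, back to while () */
                  list_del(&page->lru);
                  rqstp->rq_pages[i] = page;
                  needed--;
          }
          if (!needed)
                  break;

          /* sleep briefly as before, then retry the bulk allocation */
          set_current_state(TASK_INTERRUPTIBLE);
          if (signalled() || kthread_should_stop()) {
                  set_current_state(TASK_RUNNING);
                  return -EINTR;
          }
          schedule_timeout(msecs_to_jiffies(500));
  }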

> +   list_del(&page->lru);
> +   rqstp->rq_pages[i] = page;
> +   needed--;
> }
> -   rqstp->rq_pages[i] = p;
> }
> +   }
> rqstp->rq_page_end = &rqstp->rq_pages[pages];
> rqstp->rq_pages[pages] = NULL; /* this might be seen in 
> nfsd_splice_actor() */
>
> @@ -681,6 +691,15 @@ static int svc_alloc_arg(struct svc_rqst *rqstp)
> arg->len = (pages-1)*PAGE_SIZE;
> arg->tail[0].iov_len = 0;
> return 0;
> +
> +empty_list:
> +   set_current_state(TASK_INTERRUPTIBLE);
> +   if (signalled() || kthread_should_stop()) {
> +   set_current_state(TASK_RUNNING);
> +   return -EINTR;
> +   }
> +   schedule_timeout(msecs_to_jiffies(500));
> +   goto retry;
>  }
>
>  static bool
> --
> 2.26.2
>


Re: [PATCH 2/5] mm/page_alloc: Add a bulk page allocator

2021-03-11 Thread Alexander Duyck
On Thu, Mar 11, 2021 at 3:49 AM Mel Gorman  wrote:
>
> This patch adds a new page allocator interface via alloc_pages_bulk,
> and __alloc_pages_bulk_nodemask. A caller requests a number of pages
> to be allocated and added to a list. They can be freed in bulk using
> free_pages_bulk().
>
> The API is not guaranteed to return the requested number of pages and
> may fail if the preferred allocation zone has limited free memory, the
> cpuset changes during the allocation or page debugging decides to fail
> an allocation. It's up to the caller to request more pages in batch
> if necessary.
>
> Note that this implementation is not very efficient and could be improved
> but it would require refactoring. The intent is to make it available early
> to determine what semantics are required by different callers. Once the
> full semantics are nailed down, it can be refactored.
>
> Signed-off-by: Mel Gorman 
> ---
>  include/linux/gfp.h |  13 +
>  mm/page_alloc.c | 118 +++-
>  2 files changed, 129 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 8572a1474e16..4903d1cc48dc 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -515,6 +515,10 @@ static inline int arch_make_page_accessible(struct page 
> *page)
>  }
>  #endif
>
> +int __alloc_pages_bulk_nodemask(gfp_t gfp_mask, int preferred_nid,
> +   nodemask_t *nodemask, int nr_pages,
> +   struct list_head *list);
> +
>  struct page *
>  __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
> nodemask_t *nodemask);
> @@ -525,6 +529,14 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, int 
> preferred_nid)
> return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL);
>  }
>
> +/* Bulk allocate order-0 pages */
> +static inline unsigned long
> +alloc_pages_bulk(gfp_t gfp_mask, unsigned long nr_pages, struct list_head 
> *list)
> +{
> +   return __alloc_pages_bulk_nodemask(gfp_mask, numa_mem_id(), NULL,
> +   nr_pages, list);
> +}
> +
>  /*
>   * Allocate pages, preferring the node given as nid. The node must be valid 
> and
>   * online. For more general interface, see alloc_pages_node().
> @@ -594,6 +606,7 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t 
> size, gfp_t gfp_mask);
>
>  extern void __free_pages(struct page *page, unsigned int order);
>  extern void free_pages(unsigned long addr, unsigned int order);
> +extern void free_pages_bulk(struct list_head *list);
>
>  struct page_frag_cache;
>  extern void __page_frag_cache_drain(struct page *page, unsigned int count);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 3e4b29ee2b1e..415059324dc3 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -4436,6 +4436,21 @@ static void wake_all_kswapds(unsigned int order, gfp_t 
> gfp_mask,
> }
>  }
>
> +/* Drop reference counts and free order-0 pages from a list. */
> +void free_pages_bulk(struct list_head *list)
> +{
> +   struct page *page, *next;
> +
> +   list_for_each_entry_safe(page, next, list, lru) {
> +   trace_mm_page_free_batched(page);
> +   if (put_page_testzero(page)) {
> +   list_del(&page->lru);
> +   __free_pages_ok(page, 0, FPI_NONE);
> +   }
> +   }
> +}
> +EXPORT_SYMBOL_GPL(free_pages_bulk);
> +
>  static inline unsigned int
>  gfp_to_alloc_flags(gfp_t gfp_mask)
>  {
> @@ -4919,6 +4934,9 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, 
> unsigned int order,
> struct alloc_context *ac, gfp_t *alloc_mask,
> unsigned int *alloc_flags)
>  {
> +   gfp_mask &= gfp_allowed_mask;
> +   *alloc_mask = gfp_mask;
> +
> ac->highest_zoneidx = gfp_zone(gfp_mask);
> ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
> ac->nodemask = nodemask;

It might be better to pull this and the change at the bottom out into
a separate patch. I was reviewing this, and when I hit the bottom I
apparently had the same question other reviewers had, wondering
whether it was intentional. Splitting it out would make it easier to
review.

> @@ -4960,6 +4978,104 @@ static inline bool prepare_alloc_pages(gfp_t 
> gfp_mask, unsigned int order,
> return true;
>  }
>
> +/*
> + * This is a batched version of the page allocator that attempts to
> + * allocate nr_pages quickly from the preferred zone and add them to list.
> + *
> + * Returns the number of pages allocated.
> + */
> +int __alloc_pages_bulk_nodemask(gfp_t gfp_mask, int preferred_nid,
> +   nodemask_t *nodemask, int nr_pages,
> +   struct list_head *alloc_list)
> +{
> +   struct page *page;
> +   unsigned long flags;
> +   struct zone *zone;
> +   struct zoneref *z;
> + 

Re: [PATCH v17 1/9] mm: Adjust shuffle code to allow for future coalescing

2021-03-10 Thread Alexander Duyck
Hi Bala,

There was a similar effort several months ago that was trying to do
this in conjunction with pre-zeroing of pages. I suspect if you wanted
to you could probably pick up some of their patch set and work with
that. It can be found at:
https://www.spinics.net/lists/linux-mm/msg239735.html

Thanks.

- Alex

On Tue, Mar 9, 2021 at 12:13 AM Bodeddula, Balasubramaniam
 wrote:
>
> Hi Alexander,
>
>
>
> My team was evaluating FPR and observed that these patches don't report
> memory for deallocated hugepages directly; it has to cycle through the
> buddy allocator. For example, say we need to allocate a maximum of 12 * 1G
> hugepages (by setting nr_hugepages), use 8 * 1G hugepages, and then
> deallocate 4 * 1G hugepages. Unlike regular 4K pages, this 4G worth of memory
> will not be reported until we set nr_hugepages to 8 (wait some time(?) for FPR
> to do its work) and then set it back again to 12. While this works fine in
> theory, in practice setting nr_hugepages to 12 could fail due to fragmentation
> (this can depend on other processes' memory usage behavior).
>
>
>
> If FPR could report this free memory without cycling through the buddy
> allocator, it would make the solution more robust. I am looking for advice
> on how feasible this approach is and what the effort would be to build this
> functionality. In general, if there are other thoughts on how we can address
> this, please do let me know.
>
>
>
> Thanks,
>
> bala


Re: [PATCH 0/3] fix a couple of atm->phy_data related issues

2021-03-08 Thread Alexander Duyck
Hi Tong,

Is this direct-assigned hardware or is QEMU being used to emulate the
hardware here? Admittedly I don't know that much about ATM, so I am
not sure when/if those phys would have gone out of production. However
since the code dates back to 2005 I am guessing it is on the old side.

Ultimately the decision is up to Chas. However if there has been code
in place for this long that would trigger this kind of null pointer
dereference then it kind of points to the fact that those phys have
probably not been in use since at least back when Linus switched over
to git in 2005.

Thanks,

- Alex

On Mon, Mar 8, 2021 at 9:55 AM Tong Zhang  wrote:
>
> Hi Alex,
> attached is the kernel log for zatm (uPD98402). I also have
> idt77252's log, which is similar to this one.
> I think it makes sense to drop the code if no one is actually using it.
> - Tong
>
> [5.740774] BUG: KASAN: null-ptr-deref in uPD98402_start+0x5e/0x219
> [uPD98402]
> [5.741179] Write of size 4 at addr 002c by task modprobe/96
> [5.741548]
> [5.741637] CPU: 0 PID: 96 Comm: modprobe Not tainted 5.12.0-rc2-dirty #71
> [5.742017] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009),
> BIOS rel-1.13.0-48-gd9c812dda519-prebuilt.qemu.org 04/01/2014
> [5.742635] Call Trace:
> [5.742775]  dump_stack+0x8a/0xb5
> [5.742966]  kasan_report.cold+0x10f/0x111
> [5.743197]  ? uPD98402_start+0x5e/0x219 [uPD98402]
> [5.743473]  uPD98402_start+0x5e/0x219 [uPD98402]
> [5.743739]  zatm_init_one+0x10b5/0x1311 [zatm]
> [5.743998]  ? zatm_int.cold+0x30/0x30 [zatm]
> [5.744246]  ? _raw_write_lock_irqsave+0xd0/0xd0
> [5.744507]  ? __mutex_lock_slowpath+0x10/0x10
> [5.744757]  ? _raw_spin_unlock_irqrestore+0xd/0x20
> [5.745030]  ? zatm_int.cold+0x30/0x30 [zatm]
> [5.745278]  local_pci_probe+0x6f/0xb0
> [5.745492]  pci_device_probe+0x171/0x240
> [5.745718]  ? pci_device_remove+0xe0/0xe0
> [5.745949]  ? kernfs_create_link+0xb6/0x110
> [5.746190]  ? sysfs_do_create_link_sd.isra.0+0x76/0xe0
> [5.746482]  really_probe+0x161/0x420
> [5.746691]  driver_probe_device+0x6d/0xd0
> [5.746923]  device_driver_attach+0x82/0x90
> [5.747158]  ? device_driver_attach+0x90/0x90
> [5.747402]  __driver_attach+0x60/0x100
> [5.747621]  ? device_driver_attach+0x90/0x90
> [5.747864]  bus_for_each_dev+0xe1/0x140
> [5.748075]  ? subsys_dev_iter_exit+0x10/0x10
> [5.748320]  ? klist_node_init+0x61/0x80
> [5.748542]  bus_add_driver+0x254/0x2a0
> [5.748760]  driver_register+0xd3/0x150
> [5.748977]  ? 0xc003
> [5.749163]  do_one_initcall+0x84/0x250
> [5.749380]  ? trace_event_raw_event_initcall_finish+0x150/0x150
> [5.749714]  ? _raw_spin_unlock_irqrestore+0xd/0x20
> [5.749987]  ? create_object+0x395/0x510
> [5.750210]  ? kasan_unpoison+0x21/0x50
> [5.750427]  do_init_module+0xf8/0x350
> [5.750640]  load_module+0x40c5/0x4410
> [5.750854]  ? module_frob_arch_sections+0x20/0x20
> [5.751123]  ? kernel_read_file+0x1cd/0x3e0
> [5.751364]  ? __do_sys_finit_module+0x108/0x170
> [5.751628]  __do_sys_finit_module+0x108/0x170
> [5.751879]  ? __ia32_sys_init_module+0x40/0x40
> [5.752126]  ? file_open_root+0x200/0x200
> [5.752353]  ? do_sys_open+0x85/0xe0
> [5.752556]  ? filp_open+0x50/0x50
> [5.752750]  ? fpregs_assert_state_consistent+0x4d/0x60
> [5.753042]  ? exit_to_user_mode_prepare+0x2f/0x130
> [5.753316]  do_syscall_64+0x33/0x40
> [5.753519]  entry_SYSCALL_64_after_hwframe+0x44/0xae
> [5.753802] RIP: 0033:0x7ff64032dcf7
>  ff c3 48 c7 c6 01 00 00 00 e9 a1
> [5.755029] RSP: 002b:7ffd250ea358 EFLAGS: 0246 ORIG_RAX:
> 0139
> [5.755449] RAX: ffda RBX: 01093a70 RCX: 
> 7ff64032dcf7
> [5.755847] RDX:  RSI: 010929e0 RDI: 
> 0003
> [5.756242] RBP: 0003 R08:  R09: 
> 0001
> [5.756635] R10: 7ff640391300 R11: 0246 R12: 
> 010929e0
> [5.757029] R13:  R14: 01092dd0 R15: 
> 0001
>
> On Mon, Mar 8, 2021 at 12:47 PM Alexander Duyck
>  wrote:
> >
> > On Mon, Mar 8, 2021 at 12:39 AM Tong Zhang  wrote:
> > >
> > > there are two drivers (zatm and idt77252) using PRIV() (i.e. atm->phy_data)
> > > to store private data, but the drivers happen to populate the wrong
> > > pointer, atm->dev_data, which actually causes a null-ptr-dereference in
> > > the following PRIV(dev). This patch series attempts to fix those two issues
> > > along with a typo in the atm struct.
> > >
> > > Tong Zhan

Re: [PATCH 0/3] fix a couple of atm->phy_data related issues

2021-03-08 Thread Alexander Duyck
On Mon, Mar 8, 2021 at 12:39 AM Tong Zhang  wrote:
>
> there are two drivers (zatm and idt77252) using PRIV() (i.e. atm->phy_data)
> to store private data, but the drivers happen to populate the wrong
> pointer, atm->dev_data, which actually causes a null-ptr-dereference in
> the following PRIV(dev). This patch series attempts to fix those two issues
> along with a typo in the atm struct.
>
> Tong Zhang (3):
>   atm: fix a typo in the struct description
>   atm: uPD98402: fix incorrect allocation
>   atm: idt77252: fix null-ptr-dereference
>
>  drivers/atm/idt77105.c | 4 ++--
>  drivers/atm/uPD98402.c | 2 +-
>  include/linux/atmdev.h | 2 +-
>  3 files changed, 4 insertions(+), 4 deletions(-)

For the two phys, have you actually seen null pointer dereferences, or
are your changes based just on code review?

I ask because it seems like this code has been this way since 2005,
and in the case of uPD98402_start the code doesn't seem like it could
ever have functioned as written: PRIV() is phy_data, and the problem
is pretty obvious since the initialization happens immediately after
the allocation.

I'm just wondering if it might make more sense to drop the code, if it
hasn't been run in 15+ years, rather than updating it?


Re: [PATCH v6 net-next 00/11] skbuff: introduce skbuff_heads bulking and reusing

2021-02-13 Thread Alexander Duyck
On Sat, Feb 13, 2021 at 6:10 AM Alexander Lobakin  wrote:
>
> Currently, all sorts of skb allocation always do allocate
> skbuff_heads one by one via kmem_cache_alloc().
> On the other hand, we have percpu napi_alloc_cache to store
> skbuff_heads queued up for freeing and flush them by bulks.
>
> We can use this cache not only for bulk-wiping, but also to obtain
> heads for new skbs and avoid unconditional allocations, as well as
> for bulk-allocating (like XDP's cpumap code and veth driver already
> do).
>
> As this might affect latencies, cache pressure and lots of hardware
> and driver-dependent stuff, this new feature is mostly optional and
> can be issued via:
>  - a new napi_build_skb() function (as a replacement for build_skb());
>  - existing {,__}napi_alloc_skb() and napi_get_frags() functions;
>  - __alloc_skb() with passing SKB_ALLOC_NAPI in flags.
>
> iperf3 showed 35-70 Mbps bumps for both TCP and UDP while performing
> VLAN NAT on 1.2 GHz MIPS board. The boost is likely to be bigger
> on more powerful hosts and NICs with tens of Mpps.
>
> Note on skbuff_heads from distant slabs or pfmemalloc'ed slabs:
>  - kmalloc()/kmem_cache_alloc() itself allows by default allocating
>memory from the remote nodes to defragment their slabs. This is
>controlled by sysctl, but according to this, skbuff_head from a
>remote node is an OK case;
>  - The easiest way to check if the slab of skbuff_head is remote or
>pfmemalloc'ed is:
>
> if (!dev_page_is_reusable(virt_to_head_page(skb)))
> /* drop it */;
>
>...*but*, given that most slabs are built of compound pages,
>virt_to_head_page() will hit the unlikely branch on every single
>call. This check cost at least 20 Mbps in test scenarios, so it
>seems like it'd be better to _not_ do this.



> Alexander Lobakin (11):
>   skbuff: move __alloc_skb() next to the other skb allocation functions
>   skbuff: simplify kmalloc_reserve()
>   skbuff: make __build_skb_around() return void
>   skbuff: simplify __alloc_skb() a bit
>   skbuff: use __build_skb_around() in __alloc_skb()
>   skbuff: remove __kfree_skb_flush()
>   skbuff: move NAPI cache declarations upper in the file
>   skbuff: introduce {,__}napi_build_skb() which reuses NAPI cache heads
>   skbuff: allow to optionally use NAPI cache from __alloc_skb()
>   skbuff: allow to use NAPI cache from __napi_alloc_skb()
>   skbuff: queue NAPI_MERGED_FREE skbs into NAPI cache instead of freeing
>
>  include/linux/skbuff.h |   4 +-
>  net/core/dev.c |  16 +-
>  net/core/skbuff.c  | 428 +++--
>  3 files changed, 242 insertions(+), 206 deletions(-)
>

With the last few changes and testing to verify the need to drop the
cache clearing this patch set looks good to me.

Reviewed-by: Alexander Duyck 


Re: [PATCH v3 net-next 4/5] net: ipa: introduce ipa_table_hash_support()

2021-02-12 Thread Alexander Duyck
On Fri, Feb 12, 2021 at 6:40 AM Alex Elder  wrote:
>
> Introduce a new function to abstract the knowledge of whether hashed
> routing and filter tables are supported for a given IPA instance.
>
> IPA v4.2 is the only one that doesn't support hashed tables (now
> and for the foreseeable future), but the name of the helper function
> is better for explaining what's going on.
>
> Signed-off-by: Alex Elder 
> ---
> v2: - Update copyrights.
>
>  drivers/net/ipa/ipa_cmd.c   |  2 +-
>  drivers/net/ipa/ipa_table.c | 16 +---
>  drivers/net/ipa/ipa_table.h |  8 +++-
>  3 files changed, 17 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/net/ipa/ipa_cmd.c b/drivers/net/ipa/ipa_cmd.c
> index fd8bf6468d313..35e35852c25c5 100644
> --- a/drivers/net/ipa/ipa_cmd.c
> +++ b/drivers/net/ipa/ipa_cmd.c
> @@ -268,7 +268,7 @@ static bool ipa_cmd_register_write_valid(struct ipa *ipa)
> /* If hashed tables are supported, ensure the hash flush register
>  * offset will fit in a register write IPA immediate command.
>  */
> -   if (ipa->version != IPA_VERSION_4_2) {
> +   if (ipa_table_hash_support(ipa)) {
> offset = ipa_reg_filt_rout_hash_flush_offset(ipa->version);
> name = "filter/route hash flush";
> if (!ipa_cmd_register_write_offset_valid(ipa, name, offset))
> diff --git a/drivers/net/ipa/ipa_table.c b/drivers/net/ipa/ipa_table.c
> index 32e2d3e052d55..baaab3dd0e63c 100644
> --- a/drivers/net/ipa/ipa_table.c
> +++ b/drivers/net/ipa/ipa_table.c
> @@ -1,7 +1,7 @@
>  // SPDX-License-Identifier: GPL-2.0
>
>  /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
> - * Copyright (C) 2018-2020 Linaro Ltd.
> + * Copyright (C) 2018-2021 Linaro Ltd.
>   */
>
>  #include 
> @@ -239,6 +239,11 @@ static void ipa_table_validate_build(void)
>
>  #endif /* !IPA_VALIDATE */
>
> +bool ipa_table_hash_support(struct ipa *ipa)
> +{
> +   return ipa->version != IPA_VERSION_4_2;
> +}
> +

Since this is only a single comparison it might make more sense to
make this a static inline and place it in ipa.h. Otherwise you are
just bloating the code up to jump to such a small function.

>  /* Zero entry count means no table, so just return a 0 address */
>  static dma_addr_t ipa_table_addr(struct ipa *ipa, bool filter_mask, u16 
> count)
>  {
> @@ -412,8 +417,7 @@ int ipa_table_hash_flush(struct ipa *ipa)
> struct gsi_trans *trans;
> u32 val;
>
> -   /* IPA version 4.2 does not support hashed tables */
> -   if (ipa->version == IPA_VERSION_4_2)
> +   if (!ipa_table_hash_support(ipa))
> return 0;
>
> trans = ipa_cmd_trans_alloc(ipa, 1);
> @@ -531,8 +535,7 @@ static void ipa_filter_config(struct ipa *ipa, bool modem)
> enum gsi_ee_id ee_id = modem ? GSI_EE_MODEM : GSI_EE_AP;
> u32 ep_mask = ipa->filter_map;
>
> -   /* IPA version 4.2 has no hashed route tables */
> -   if (ipa->version == IPA_VERSION_4_2)
> +   if (!ipa_table_hash_support(ipa))
> return;
>
> while (ep_mask) {
> @@ -582,8 +585,7 @@ static void ipa_route_config(struct ipa *ipa, bool modem)
>  {
> u32 route_id;
>
> -   /* IPA version 4.2 has no hashed route tables */
> -   if (ipa->version == IPA_VERSION_4_2)
> +   if (!ipa_table_hash_support(ipa))
> return;
>
> for (route_id = 0; route_id < IPA_ROUTE_COUNT_MAX; route_id++)
> diff --git a/drivers/net/ipa/ipa_table.h b/drivers/net/ipa/ipa_table.h
> index 78038d14fcea9..1a68d20f19d6a 100644
> --- a/drivers/net/ipa/ipa_table.h
> +++ b/drivers/net/ipa/ipa_table.h
> @@ -1,7 +1,7 @@
>  /* SPDX-License-Identifier: GPL-2.0 */
>
>  /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
> - * Copyright (C) 2019-2020 Linaro Ltd.
> + * Copyright (C) 2019-2021 Linaro Ltd.
>   */
>  #ifndef _IPA_TABLE_H_
>  #define _IPA_TABLE_H_
> @@ -51,6 +51,12 @@ static inline bool ipa_filter_map_valid(struct ipa *ipa, 
> u32 filter_mask)
>
>  #endif /* !IPA_VALIDATE */
>
> +/**
> + * ipa_table_hash_support() - Return true if hashed tables are supported
> + * @ipa:   IPA pointer
> + */
> +bool ipa_table_hash_support(struct ipa *ipa);
> +
>  /**
>   * ipa_table_reset() - Reset filter and route tables entries to "none"
>   * @ipa:   IPA pointer

Just define the function here and make it a static inline.
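
To be clear, what I have in mind is roughly the following (untested sketch;
it assumes the full struct ipa definition is visible at the point of use,
which is why ipa.h is probably the better home for it):

	/* IPA v4.2 is the only version without hashed table support, so
	 * this collapses to a single inlined comparison at each call site.
	 */
	static inline bool ipa_table_hash_support(struct ipa *ipa)
	{
		return ipa->version != IPA_VERSION_4_2;
	}

That way callers get the comparison inlined instead of paying for a call.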


Re: [PATCH v5 net-next 06/11] skbuff: remove __kfree_skb_flush()

2021-02-11 Thread Alexander Duyck
On Thu, Feb 11, 2021 at 10:57 AM Alexander Lobakin  wrote:
>
> This function isn't much needed as NAPI skb queue gets bulk-freed
> anyway when there's no more room, and may even reduce the efficiency
> of bulk operations.
> It will be even less needed after reusing skb cache on allocation path,
> so remove it and this way lighten network softirqs a bit.
>
> Suggested-by: Eric Dumazet 
> Signed-off-by: Alexander Lobakin 

I'm wondering if you have any actual gains to show from this patch?

The reason why I ask is because the flushing was happening at the end
of the softirq before the system basically gave control back over to
something else. As such there is a good chance for the memory to be
dropped from the cache by the time we come back to it. So it may be
just as expensive if not more so than accessing memory that was just
freed elsewhere and placed in the slab cache.

> ---
>  include/linux/skbuff.h |  1 -
>  net/core/dev.c |  7 +--
>  net/core/skbuff.c  | 12 
>  3 files changed, 1 insertion(+), 19 deletions(-)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 0a4e91a2f873..0e0707296098 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -2919,7 +2919,6 @@ static inline struct sk_buff *napi_alloc_skb(struct 
> napi_struct *napi,
>  }
>  void napi_consume_skb(struct sk_buff *skb, int budget);
>
> -void __kfree_skb_flush(void);
>  void __kfree_skb_defer(struct sk_buff *skb);
>
>  /**
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 321d41a110e7..4154d4683bb9 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -4944,8 +4944,6 @@ static __latent_entropy void net_tx_action(struct 
> softirq_action *h)
> else
> __kfree_skb_defer(skb);
> }
> -
> -   __kfree_skb_flush();
> }
>
> if (sd->output_queue) {
> @@ -7012,7 +7010,6 @@ static int napi_threaded_poll(void *data)
> __napi_poll(napi, &repoll);
> netpoll_poll_unlock(have);
>
> -   __kfree_skb_flush();
> local_bh_enable();
>
> if (!repoll)

So it looks like this is the one exception to my comment above. Here
we should probably be adding a "if (!repoll)" before calling
__kfree_skb_flush().
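
In other words, something like this (untested, based on the context quoted
above):

		__napi_poll(napi, &repoll);
		netpoll_poll_unlock(have);

		/* Only flush the skb cache if we aren't going to poll again,
		 * i.e. right before we hand the CPU back.
		 */
		if (!repoll)
			__kfree_skb_flush();

		local_bh_enable();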

> @@ -7042,7 +7039,7 @@ static __latent_entropy void net_rx_action(struct 
> softirq_action *h)
>
> if (list_empty(&list)) {
> if (!sd_has_rps_ipi_waiting(sd) &&
> list_empty(&repoll))
> -   goto out;
> +   return;
> break;
> }
>
> @@ -7069,8 +7066,6 @@ static __latent_entropy void net_rx_action(struct 
> softirq_action *h)
> __raise_softirq_irqoff(NET_RX_SOFTIRQ);
>
> net_rps_action_and_irq_enable(sd);
> -out:
> -   __kfree_skb_flush();
>  }
>
>  struct netdev_adjacent {
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 1c6f6ef70339..4be2bb969535 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -838,18 +838,6 @@ void __consume_stateless_skb(struct sk_buff *skb)
> kfree_skbmem(skb);
>  }
>
> -void __kfree_skb_flush(void)
> -{
> -   struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
> -
> -   /* flush skb_cache if containing objects */
> -   if (nc->skb_count) {
> -   kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
> -nc->skb_cache);
> -   nc->skb_count = 0;
> -   }
> -}
> -
>  static inline void _kfree_skb_defer(struct sk_buff *skb)
>  {
> struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
> --
> 2.30.1
>
>


Re: [PATCH v5 net-next 09/11] skbuff: allow to optionally use NAPI cache from __alloc_skb()

2021-02-11 Thread Alexander Duyck
On Thu, Feb 11, 2021 at 11:00 AM Alexander Lobakin  wrote:
>
> Reuse the old and forgotten SKB_ALLOC_NAPI to add an option to get
> an skbuff_head from the NAPI cache instead of inplace allocation
> inside __alloc_skb().
> This implies that the function is called from softirq or BH-off
> context, not for allocating a clone or from a distant node.
>
> Signed-off-by: Alexander Lobakin 
> ---
>  net/core/skbuff.c | 13 +
>  1 file changed, 9 insertions(+), 4 deletions(-)
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 9e1a8ded4acc..a0b457ae87c2 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -397,15 +397,20 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t 
> gfp_mask,
> struct sk_buff *skb;
> u8 *data;
> bool pfmemalloc;
> +   bool clone;
>
> -   cache = (flags & SKB_ALLOC_FCLONE)
> -   ? skbuff_fclone_cache : skbuff_head_cache;
> +   clone = !!(flags & SKB_ALLOC_FCLONE);

The boolean conversion here is probably unnecessary. I would make
clone an int like flags and work with that. I suspect the compiler is
doing it already, but it is better to be explicit.

> +   cache = clone ? skbuff_fclone_cache : skbuff_head_cache;
>
> if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
> gfp_mask |= __GFP_MEMALLOC;
>
> /* Get the HEAD */
> -   skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
> +   if ((flags & SKB_ALLOC_NAPI) && !clone &&

Rather than having to do two checks you could just check for
SKB_ALLOC_NAPI and SKB_ALLOC_FCLONE in a single check. You could just
do something like:
if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI)

That way you can avoid the extra conditional jumps and can start
computing the flags value sooner.
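
So the head allocation would end up looking something like (untested):

	if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
	    likely(node == NUMA_NO_NODE || node == numa_mem_id()))
		skb = napi_skb_cache_get();
	else
		skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);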

> +   likely(node == NUMA_NO_NODE || node == numa_mem_id()))
> +   skb = napi_skb_cache_get();
> +   else
> +   skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
> if (unlikely(!skb))
> return NULL;
> prefetchw(skb);
> @@ -436,7 +441,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t 
> gfp_mask,
> __build_skb_around(skb, data, 0);
> skb->pfmemalloc = pfmemalloc;
>
> -   if (flags & SKB_ALLOC_FCLONE) {
> +   if (clone) {
> struct sk_buff_fclones *fclones;
>
> fclones = container_of(skb, struct sk_buff_fclones, skb1);
> --
> 2.30.1
>
>


Re: [PATCH net-next RESEND 3/5] net: stmmac: dwmac-sun8i: Use reset_control_reset

2021-02-08 Thread Alexander Duyck
On Sun, Feb 7, 2021 at 10:32 PM Samuel Holland  wrote:
>
> Use the appropriate function instead of reimplementing it,
> and update the error message to match the code.
>
> Reviewed-by: Chen-Yu Tsai 
> Signed-off-by: Samuel Holland 
> ---
>  drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 6 ++
>  1 file changed, 2 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c 
> b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
> index 3c3d0b99d3e8..0e8d88417251 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
> @@ -806,11 +806,9 @@ static int sun8i_dwmac_power_internal_phy(struct 
> stmmac_priv *priv)
> /* Make sure the EPHY is properly reseted, as U-Boot may leave
>  * it at deasserted state, and thus it may fail to reset EMAC.
>  */
> -   reset_control_assert(gmac->rst_ephy);
> -
> -   ret = reset_control_deassert(gmac->rst_ephy);
> +   ret = reset_control_reset(gmac->rst_ephy);
> if (ret) {
> -   dev_err(priv->device, "Cannot deassert internal phy\n");
> +   dev_err(priv->device, "Cannot reset internal PHY\n");
> clk_disable_unprepare(gmac->ephy_clk);
> return ret;
> }

I'm assuming you have exclusive access to the phy and this isn't a
shared line? Just wanting to confirm since the function call has the
following comment in the header for the documentation.

 * Consumers must not use reset_control_(de)assert on shared reset lines when
 * reset_control_reset has been used.
 *

If that is the case it might not hurt to add some documentation to
your call to reset_control_reset here explaining that it is safe to do
so since you have exclusive access.
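
Something as simple as this would do (wording is up to you, and it assumes
the line really is exclusive to this MAC):

	/* rst_ephy is an exclusive reset control for this MAC, so using
	 * reset_control_reset() here does not run afoul of the rule that
	 * shared reset lines must not mix it with reset_control_(de)assert().
	 */
	ret = reset_control_reset(gmac->rst_ephy);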


Re: [PATCH net-next] udp: allow forwarding of plain (non-fraglisted) UDP GRO packets

2021-01-12 Thread Alexander Duyck

On 1/12/21 1:16 PM, Alexander Lobakin wrote:

Commit 9fd1ff5d2ac7 ("udp: Support UDP fraglist GRO/GSO.") not only
added support for fraglisted UDP GRO, but also tweaked some logic so
that non-fraglisted UDP GRO started to work for forwarding too.
Tests showed that forwarding and NATing of plain UDP GRO packets are
currently performed fully correctly, regardless of whether the target
netdevice has hardware/driver UDP L4 GSO support or not.
Add the last missing piece and allow forming plain UDP GRO packets if
there is no socket, i.e. we are on the forwarding path.

Plain UDP GRO forwarding even shows better performance than fraglisted
UDP GRO in some cases due to not wasting one skbuff_head per every
segment.

Signed-off-by: Alexander Lobakin 
---
  net/ipv4/udp_offload.c | 5 +++--
  1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index ff39e94781bf..9d71df3d52ce 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -460,12 +460,13 @@ struct sk_buff *udp_gro_receive(struct list_head *head, 
struct sk_buff *skb,
if (skb->dev->features & NETIF_F_GRO_FRAGLIST)
NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled: 1;
  
-	if ((sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist) {

+   if (!sk || (sk && udp_sk(sk)->gro_enabled) ||
+   NAPI_GRO_CB(skb)->is_flist) {
pp = call_gro_receive(udp_gro_receive_segment, head, skb);
return pp;
}
  


The second check for sk in "(sk && udp_sk(sk)->gro_enabled)" is 
redundant and can be dropped. You already verified it is present when 
you checked for !sk before the logical OR.
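
So the check can simply be (untested):

	if (!sk || udp_sk(sk)->gro_enabled || NAPI_GRO_CB(skb)->is_flist) {
		pp = call_gro_receive(udp_gro_receive_segment, head, skb);
		return pp;
	}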



-   if (!sk || NAPI_GRO_CB(skb)->encap_mark ||
+   if (NAPI_GRO_CB(skb)->encap_mark ||
(skb->ip_summed != CHECKSUM_PARTIAL &&
 NAPI_GRO_CB(skb)->csum_cnt == 0 &&
 !NAPI_GRO_CB(skb)->csum_valid) ||





Re: [PATCH 4/6] hugetlb: avoid allocation failed when page reporting is on going

2021-01-07 Thread Alexander Duyck
On Wed, Jan 6, 2021 at 7:57 PM Liang Li  wrote:
>
> > > Page reporting isolates free pages temporarily when reporting
> > > free pages information. It will reduce the actual free pages
> > > and may cause application failures due to not enough available memory.
> > > This patch tries to solve this issue: when there is no free page
> > > and page reporting is ongoing, wait until it is done.
> > >
> > > Cc: Alexander Duyck 
> >
> > Please don't use this email address for me anymore. Either use
> > alexander.du...@gmail.com or alexanderdu...@fb.com. I am getting
> > bounces when I reply to this thread because of the old address.
>
> No problem.
>
> > > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > > index eb533995cb49..0fccd5f96954 100644
> > > --- a/mm/hugetlb.c
> > > +++ b/mm/hugetlb.c
> > > @@ -2320,6 +2320,12 @@ struct page *alloc_huge_page(struct vm_area_struct 
> > > *vma,
> > > goto out_uncharge_cgroup_reservation;
> > >
> > > spin_lock(&hugetlb_lock);
> > > +   while (h->free_huge_pages <= 1 && h->isolated_huge_pages) {
> > > +   spin_unlock(&hugetlb_lock);
> > > +   mutex_lock(&h->mtx_prezero);
> > > +   mutex_unlock(&h->mtx_prezero);
> > > +   spin_lock(&hugetlb_lock);
> > > +   }
> >
> > This seems like a bad idea. It kind of defeats the whole point of
> > doing the page zeroing outside of the hugetlb_lock. Also it is
> > operating on the assumption that the only way you might get a page is
> > from the page zeroing logic.
> >
> > With the page reporting code we wouldn't drop the count to zero. We
> > had checks that were going through and monitoring the watermarks and
> > if we started to hit the low watermark we would stop page reporting
> > and just assume there aren't enough pages to report. You might need to
> > look at doing something similar here so that you can avoid colliding
> > with the allocator.
>
> For hugetlb, things are a little different, Just like Mike points out:
>  "On some systems, hugetlb pages are a precious resource and
>   the sysadmin carefully configures the number needed by
>   applications.  Removing a hugetlb page (even for a very short
>   period of time) could cause serious application failure."
>
> Just keeping some pages in the freelist is not enough to prevent that from
> happening, because these pages may be allocated while zeroing out is
> ongoing, and the application may still run into a situation where no
> free pages are available.

I get what you are saying. However I don't know if it is acceptable
for the allocating thread to be put to sleep in this situation. There
are two scenarios where I can see this being problematic.

One is a setup where you put the allocating thread to sleep, and while it
is sleeping another thread frees a page; your thread cannot respond to
that newly freed page and stays stuck waiting on the zeroed page.

The second issue is that users may want a different option of just
breaking up the request into smaller pages rather than waiting on the
page zeroing, or to do something else while waiting on the page. So
instead of sitting on the request and waiting it might make more sense
to return an error pointer like EAGAIN or EBUSY to indicate that there
is a page there, but it is momentarily tied up.
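
As a purely hypothetical sketch of that second option (the real version
would need to unwind through the existing error labels in
alloc_huge_page() rather than returning directly):

	spin_lock(&hugetlb_lock);
	if (h->free_huge_pages <= 1 && h->isolated_huge_pages) {
		spin_unlock(&hugetlb_lock);
		/* Tell the caller the page is only momentarily tied up so
		 * it can retry, fall back to smaller pages, or do other
		 * work while the zeroing finishes.
		 */
		return ERR_PTR(-EBUSY);
	}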


Re: [PATCH 4/6] hugetlb: avoid allocation failed when page reporting is on going

2021-01-06 Thread Alexander Duyck
On Tue, Jan 5, 2021 at 7:50 PM Liang Li  wrote:
>
> Page reporting isolates free pages temporarily when reporting
> free pages information. It will reduce the actual free pages
> and may cause application failures due to not enough available memory.
> This patch tries to solve this issue: when there is no free page
> and page reporting is ongoing, wait until it is done.
>
> Cc: Alexander Duyck 

Please don't use this email address for me anymore. Either use
alexander.du...@gmail.com or alexanderdu...@fb.com. I am getting
bounces when I reply to this thread because of the old address.

> Cc: Mel Gorman 
> Cc: Andrea Arcangeli 
> Cc: Dan Williams 
> Cc: Dave Hansen 
> Cc: David Hildenbrand 
> Cc: Michal Hocko 
> Cc: Andrew Morton 
> Cc: Alex Williamson 
> Cc: Michael S. Tsirkin 
> Cc: Liang Li 
> Signed-off-by: Liang Li 
> ---
>  include/linux/hugetlb.h | 2 ++
>  mm/hugetlb.c| 9 +
>  mm/page_reporting.c | 6 +-
>  3 files changed, 16 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index d55e6a00b3dc..73b2934ba91c 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -490,6 +490,7 @@ struct hstate {
> unsigned long resv_huge_pages;
> unsigned long surplus_huge_pages;
> unsigned long nr_overcommit_huge_pages;
> +   unsigned long isolated_huge_pages;
> struct list_head hugepage_activelist;
> struct list_head hugepage_freelists[MAX_NUMNODES];
> unsigned int nr_huge_pages_node[MAX_NUMNODES];
> @@ -500,6 +501,7 @@ struct hstate {
> struct cftype cgroup_files_dfl[7];
> struct cftype cgroup_files_legacy[9];
>  #endif
> +   struct mutex mtx_prezero;
> char name[HSTATE_NAME_LEN];
>  };
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index eb533995cb49..0fccd5f96954 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -2320,6 +2320,12 @@ struct page *alloc_huge_page(struct vm_area_struct 
> *vma,
> goto out_uncharge_cgroup_reservation;
>
> spin_lock(&hugetlb_lock);
> +   while (h->free_huge_pages <= 1 && h->isolated_huge_pages) {
> +   spin_unlock(&hugetlb_lock);
> +   mutex_lock(&h->mtx_prezero);
> +   mutex_unlock(&h->mtx_prezero);
> +   spin_lock(&hugetlb_lock);
> +   }

This seems like a bad idea. It kind of defeats the whole point of
doing the page zeroing outside of the hugetlb_lock. Also it is
operating on the assumption that the only way you might get a page is
from the page zeroing logic.

With the page reporting code we wouldn't drop the count to zero. We
had checks that were going through and monitoring the watermarks and
if we started to hit the low watermark we would stop page reporting
and just assume there aren't enough pages to report. You might need to
look at doing something similar here so that you can avoid colliding
with the allocator.
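
Something along these lines is what I am describing, purely as a
hypothetical sketch (HUGEPAGE_REPORTING_MIN_FREE is a made-up threshold):

	/* Stop isolating when the pool is running low, similar to the low
	 * watermark check in page_reporting_process_zone().
	 */
	if (h->free_huge_pages <= h->resv_huge_pages + HUGEPAGE_REPORTING_MIN_FREE)
		return 0;	/* nothing safe to report this time around */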


> /*
>  * glb_chg is passed to indicate whether or not a page must be taken
>  * from the global free pool (global change).  gbl_chg == 0 indicates
> @@ -3208,6 +3214,7 @@ void __init hugetlb_add_hstate(unsigned int order)
> INIT_LIST_HEAD(>hugepage_activelist);
> h->next_nid_to_alloc = first_memory_node;
> h->next_nid_to_free = first_memory_node;
> +   mutex_init(&h->mtx_prezero);
> snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
> huge_page_size(h)/1024);
>
> @@ -5541,6 +5548,7 @@ void isolate_free_huge_page(struct page *page, struct 
> hstate *h, int nid)
>
> list_move(&page->lru, &h->hugepage_activelist);
> set_page_refcounted(page);
> +   h->isolated_huge_pages++;
>  }
>
>  void putback_isolate_huge_page(struct hstate *h, struct page *page)
> @@ -5548,6 +5556,7 @@ void putback_isolate_huge_page(struct hstate *h, struct 
> page *page)
> int nid = page_to_nid(page);
>
> list_move(&page->lru, &h->hugepage_freelists[nid]);
> +   h->isolated_huge_pages--;
>  }
>
>  bool isolate_huge_page(struct page *page, struct list_head *list)
> diff --git a/mm/page_reporting.c b/mm/page_reporting.c
> index cc31696225bb..99e1e688d7c1 100644
> --- a/mm/page_reporting.c
> +++ b/mm/page_reporting.c
> @@ -272,12 +272,15 @@ hugepage_reporting_process_hstate(struct 
> page_reporting_dev_info *prdev,
> int ret = 0, nid;
>
> offset = max_items;
> +   mutex_lock(&h->mtx_prezero);
> for (nid = 0; nid < MAX_NUMNODES; nid++) {
> ret = hugepage_reporting_cycle(prdev, h, nid, sgl, &offset,
>  

Re: [PATCH 2/6] mm: let user decide page reporting option

2021-01-06 Thread Alexander Duyck
On Tue, Jan 5, 2021 at 7:48 PM Liang Li  wrote:
>
> Some key parameters for page reporting are currently hard coded. Different
> users of the framework may have their own requirements, so make these
> parameters configurable and let the user decide them.
>
> Cc: Alexander Duyck 
> Cc: Mel Gorman 
> Cc: Andrea Arcangeli 
> Cc: Dan Williams 
> Cc: Dave Hansen 
> Cc: David Hildenbrand 
> Cc: Michal Hocko 
> Cc: Andrew Morton 
> Cc: Alex Williamson 
> Cc: Michael S. Tsirkin 
> Cc: Liang Li 
> Signed-off-by: Liang Li 
> ---
>  drivers/virtio/virtio_balloon.c |  3 +++
>  include/linux/page_reporting.h  |  3 +++
>  mm/page_reporting.c | 13 +
>  mm/page_reporting.h |  6 +++---
>  4 files changed, 18 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index 8985fc2cea86..684bcc39ef5a 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -993,6 +993,9 @@ static int virtballoon_probe(struct virtio_device *vdev)
> goto out_unregister_oom;
> }
>
> +   vb->pr_dev_info.mini_order = pageblock_order;
> +   vb->pr_dev_info.batch_size = 16 * 1024 * 1024; /* 16M */
> +   vb->pr_dev_info.delay_jiffies = 2 * HZ; /* 2 seconds */
> err = page_reporting_register(&vb->pr_dev_info);
> if (err)
> goto out_unregister_oom;
> diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h
> index 3b99e0ec24f2..63e1e9fbcaa2 100644
> --- a/include/linux/page_reporting.h
> +++ b/include/linux/page_reporting.h
> @@ -13,6 +13,9 @@ struct page_reporting_dev_info {
> int (*report)(struct page_reporting_dev_info *prdev,
>   struct scatterlist *sg, unsigned int nents);
>
> +   unsigned long batch_size;
> +   unsigned long delay_jiffies;
> +   int mini_order;
> /* work struct for processing reports */
> struct delayed_work work;
>
> diff --git a/mm/page_reporting.c b/mm/page_reporting.c
> index 694df981ddd2..39bc6a9d7b73 100644
> --- a/mm/page_reporting.c
> +++ b/mm/page_reporting.c
> @@ -13,6 +13,7 @@
>  #define PAGE_REPORTING_DELAY   (2 * HZ)
>  static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
>  unsigned long page_report_batch_size  __read_mostly = 16 * 1024 * 1024UL;
> +int page_report_mini_order = pageblock_order;
>
>  enum {
> PAGE_REPORTING_IDLE = 0,
> @@ -44,7 +45,7 @@ __page_reporting_request(struct page_reporting_dev_info 
> *prdev)
>  * now we are limiting this to running no more than once every
>  * couple of seconds.
>  */
> -   schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
> +   schedule_delayed_work(&prdev->work, prdev->delay_jiffies);
>  }
>

So this ends up being the reason why you needed to add the batch size
value. However I don't really see it working as expected since you
could essentially have 1 page freed 4M times that could trigger your
page zeroing logic. So for example if a NIC is processing frames and
ends up freeing and then reallocating some small batch of pages, this
code would be running often even though there aren't really all that
many pages that need zeroing.

>  /* notify prdev of free page reporting request */
> @@ -230,7 +231,7 @@ page_reporting_process_zone(struct 
> page_reporting_dev_info *prdev,
>
> /* Generate minimum watermark to be able to guarantee progress */
> watermark = low_wmark_pages(zone) +
> -   (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER);
> +   (PAGE_REPORTING_CAPACITY << prdev->mini_order);
>
> /*
>  * Cancel request if insufficient free memory or if we failed

With the page order being able to be greatly reduced this could have a
significant impact on whether this code really has any value. Previously we
were able to guarantee a pretty significant number of higher order
pages free. With this we might only be guaranteeing something like 32
4K pages which is pretty small compared to what can end up being
pulled out at the higher end.
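
Just to put rough numbers on it (assuming 4K base pages and
pageblock_order = 9):

	32 << pageblock_order = 32 << 9 = 16384 pages ~= 64MB of slack
	32 << 0               =           32 pages    ~= 128KB of slack

so the cushion above the low watermark shrinks by a factor of 512.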

> @@ -240,7 +241,7 @@ page_reporting_process_zone(struct 
> page_reporting_dev_info *prdev,
> return err;
>
> /* Process each free list starting from lowest order/mt */
> -   for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) {
> +   for (order = prdev->mini_order; order < MAX_ORDER; order++) {
> for (mt = 0; mt < MIGRATE_TYPES; mt++) {
> /* We do not pull pages from the isolate free

Re: [PATCH 1/6] mm: Add batch size for free page reporting

2021-01-06 Thread Alexander Duyck
On Tue, Jan 5, 2021 at 7:47 PM Liang Li  wrote:
>
> Using the page order as the only threshold for page reporting
> is not flexible and has some flaws. Because scanning a long free
> list is not cheap, it's better to wake up the page reporting
> worker when there are more pages; waking it up for a single page
> may not be worth it.
> This patch adds a batch size as another threshold to control the
> waking up of the reporting worker.
>
> Cc: Alexander Duyck 
> Cc: Mel Gorman 
> Cc: Andrea Arcangeli 
> Cc: Dan Williams 
> Cc: Dave Hansen 
> Cc: David Hildenbrand 
> Cc: Michal Hocko 
> Cc: Andrew Morton 
> Cc: Alex Williamson 
> Cc: Michael S. Tsirkin 
> Cc: Liang Li 
> Signed-off-by: Liang Li 

So you are going to need a lot more explanation for this. Page
reporting already had the concept of batching as you could only scan
once every 2 seconds as I recall. Thus the "PAGE_REPORTING_DELAY". The
change you are making doesn't make any sense without additional
context.

> ---
>  mm/page_reporting.c |  1 +
>  mm/page_reporting.h | 12 ++--
>  2 files changed, 11 insertions(+), 2 deletions(-)
>
> diff --git a/mm/page_reporting.c b/mm/page_reporting.c
> index cd8e13d41df4..694df981ddd2 100644
> --- a/mm/page_reporting.c
> +++ b/mm/page_reporting.c
> @@ -12,6 +12,7 @@
>
>  #define PAGE_REPORTING_DELAY   (2 * HZ)
>  static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
> +unsigned long page_report_batch_size  __read_mostly = 16 * 1024 * 1024UL;
>
>  enum {
> PAGE_REPORTING_IDLE = 0,
> diff --git a/mm/page_reporting.h b/mm/page_reporting.h
> index 2c385dd4ddbd..b8fb3bbb345f 100644
> --- a/mm/page_reporting.h
> +++ b/mm/page_reporting.h
> @@ -12,6 +12,8 @@
>
>  #define PAGE_REPORTING_MIN_ORDER   pageblock_order
>
> +extern unsigned long page_report_batch_size;
> +
>  #ifdef CONFIG_PAGE_REPORTING
>  DECLARE_STATIC_KEY_FALSE(page_reporting_enabled);
>  void __page_reporting_notify(void);
> @@ -33,6 +35,8 @@ static inline bool page_reported(struct page *page)
>   */
>  static inline void page_reporting_notify_free(unsigned int order)
>  {
> +   static long batch_size;
> +

I'm not sure it makes a ton of sense to place the value in an
inline function. It might make more sense to put this new code in
__page_reporting_notify so that all callers would be referring to the
same batch_size value and you don't have to bother with the export of
the page_report_batch_size value.

> /* Called from hot path in __free_one_page() */
> if (!static_branch_unlikely(&page_reporting_enabled))
> return;
> @@ -41,8 +45,12 @@ static inline void page_reporting_notify_free(unsigned int 
> order)
> if (order < PAGE_REPORTING_MIN_ORDER)
> return;
>
> -   /* This will add a few cycles, but should be called infrequently */
> -   __page_reporting_notify();
> +   batch_size += (1 << order) << PAGE_SHIFT;
> +   if (batch_size >= page_report_batch_size) {
> +   batch_size = 0;

I would probably run this in the opposite direction. Rather than
running batch_size to zero I would look at adding a "batch_remaining"
and then when it is < 0 you could then reset it back to
page_report_batch_size. Doing that you only have to read one variable
most of the time instead of doing a comparison against two.
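
Roughly like this (untested, and as mentioned above it would probably live
better inside __page_reporting_notify() itself):

	static long batch_remaining;	/* starts at 0 so the first free reports */

	batch_remaining -= (1 << order) << PAGE_SHIFT;
	if (batch_remaining < 0) {
		batch_remaining = page_report_batch_size;
		/* This will add a few cycles, but should be called infrequently */
		__page_reporting_notify();
	}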

> +   /* This add a few cycles, but should be called infrequently */
> +   __page_reporting_notify();
> +   }
>  }
>  #else /* CONFIG_PAGE_REPORTING */
>  #define page_reported(_page)   false
> --
> 2.18.2
>
>


Re: [RFC PATCH 1/3] mm: support hugetlb free page reporting

2020-12-23 Thread Alexander Duyck
On Tue, Dec 22, 2020 at 7:39 PM Liang Li  wrote:
>
> > > +hugepage_reporting_cycle(struct page_reporting_dev_info *prdev,
> > > +struct hstate *h, unsigned int nid,
> > > +struct scatterlist *sgl, unsigned int *offset)
> > > +{
> > > +   struct list_head *list = &h->hugepage_freelists[nid];
> > > +   unsigned int page_len = PAGE_SIZE << h->order;
> > > +   struct page *page, *next;
> > > +   long budget;
> > > +   int ret = 0, scan_cnt = 0;
> > > +
> > > +   /*
> > > +* Perform early check, if free area is empty there is
> > > +* nothing to process so we can skip this free_list.
> > > +*/
> > > +   if (list_empty(list))
> > > +   return ret;
> > > +
> > > +   spin_lock_irq(&hugetlb_lock);
> > > +
> > > +   if (huge_page_order(h) > MAX_ORDER)
> > > +   budget = HUGEPAGE_REPORTING_CAPACITY;
> > > +   else
> > > +   budget = HUGEPAGE_REPORTING_CAPACITY * 32;
> >
> > Wouldn't huge_page_order always be more than MAX_ORDER? Seems like we
> > don't even really need budget since this should probably be pulling
> > out no more than one hugepage at a time.
>
> I want to distinguish a 2M page and a 1GB page here. The order of a 1GB
> page is greater than MAX_ORDER while a 2M page's order is less than
> MAX_ORDER.

The budget here is broken. When I put the budget in page reporting it
was so that we wouldn't try to report all of the memory in a given
region. It is meant to hold us to no more than one pass through 1/16
of the free memory. So essentially we will be slowly processing all of
memory and it will take 16 calls (32 seconds) for us to process a
system that is sitting completely idle. It is meant to pace us so we
don't spend a ton of time doing work that will be undone, not to
prevent us from burying a CPU which is what seems to be implied here.

Using HUGEPAGE_REPORTING_CAPACITY makes no sense here. I was using it
in the original definition because it was how many pages we could
scoop out at a time and then I was aiming for a 16th of that. Here you
are arbitrarily squaring HUGEPAGE_REPORTING_CAPACITY in terms of the
amount of work you will do since you are using it as a multiplier
instead of a divisor.
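
For comparison, the buddy version computes its budget as roughly a
sixteenth of the free pages, measured in scatterlist-sized chunks; a
hugetlb analogue would look something like this hypothetical sketch
(assuming HUGEPAGE_REPORTING_CAPACITY stays the sg capacity):

	/* Pace ourselves: report no more than ~1/16 of the free hugepages
	 * on this node per pass, in units of one scatterlist's capacity.
	 */
	budget = DIV_ROUND_UP(h->free_huge_pages_node[nid],
			      HUGEPAGE_REPORTING_CAPACITY * 16);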

> >
> > > +   /* loop through free list adding unreported pages to sg list */
> > > +   list_for_each_entry_safe(page, next, list, lru) {
> > > +   /* We are going to skip over the reported pages. */
> > > +   if (PageReported(page)) {
> > > +   if (++scan_cnt >= MAX_SCAN_NUM) {
> > > +   ret = scan_cnt;
> > > +   break;
> > > +   }
> > > +   continue;
> > > +   }
> > > +
> >
> > It would probably have been better to place this set before your new
> > set. I don't see your new set necessarily being the best use for page
> > reporting.
>
> I haven't really latched on to what you mean, could you explain it again?

It would be better for you to spend time understanding how this patch
set works before you go about expanding it to do other things.
Mistakes like the budget one above kind of point out the fact that you
don't understand how this code was supposed to work and just kind of
shoehorned your page zeroing code onto it.

It would be better to look at trying to understand this code first
before you extend it to support your zeroing use case. So adding huge
pages first might make more sense than trying to zero and push the
order down. The fact is the page reporting extension should be minimal
for huge pages since they are just passed as a scatterlist so you
should only need to add a small bit to page_reporting.c to extend it
to support this use case.

> >
> > > +   /*
> > > +* If we fully consumed our budget then update our
> > > +* state to indicate that we are requesting additional
> > > +* processing and exit this list.
> > > +*/
> > > +   if (budget < 0) {
> > > +   atomic_set(&prdev->state, 
> > > PAGE_REPORTING_REQUESTED);
> > > +   next = page;
> > > +   break;
> > > +   }
> > > +
> >
> > If budget is only ever going to be 1 then we probably could just look
> > at making this the default case for any time we find a non-reported
> > page.
>
> and here again.

It comes down to the fact that the changes you made have a significant
impact on how this is supposed to function. Reducing the scatterlist
to a size of one makes the whole point of doing batching kind of
pointless. Basically the code should be rewritten with the assumption
that if you find a page you report it.

The old code would batch things up because there is significant
overhead to be addressed when going to the hypervisor to report said
memory. Your code doesn't seem to really take anything like that into
account.

Re: [RFC PATCH 1/3] mm: support hugetlb free page reporting

2020-12-22 Thread Alexander Duyck
On Mon, Dec 21, 2020 at 11:47 PM Liang Li  wrote:
>
> Free page reporting only supports buddy pages; it can't report the
> free pages reserved for the hugetlbfs case. On the other hand, hugetlbfs
> is a good choice for a system with a huge amount of RAM, because it
> can help to reduce the memory management overhead and improve system
> performance.
> This patch adds support for reporting hugepages in the free list
> of hugetlb. It can be used by the virtio_balloon driver for memory
> overcommit and for pre-zeroing free pages to speed up memory population.
>
> Cc: Alexander Duyck 
> Cc: Mel Gorman 
> Cc: Andrea Arcangeli 
> Cc: Dan Williams 
> Cc: Dave Hansen 
> Cc: David Hildenbrand 
> Cc: Michal Hocko 
> Cc: Andrew Morton 
> Cc: Alex Williamson 
> Cc: Michael S. Tsirkin 
> Cc: Jason Wang 
> Cc: Mike Kravetz 
> Cc: Liang Li 
> Signed-off-by: Liang Li 
> ---
>  include/linux/hugetlb.h|   3 +
>  include/linux/page_reporting.h |   5 +
>  mm/hugetlb.c   |  29 
>  mm/page_reporting.c| 287 +
>  mm/page_reporting.h|  34 
>  5 files changed, 358 insertions(+)
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index ebca2ef02212..a72ad25501d3 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -11,6 +11,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>
>  struct ctl_table;
>  struct user_struct;
> @@ -114,6 +115,8 @@ int hugetlb_treat_movable_handler(struct ctl_table *, 
> int, void *, size_t *,
>  int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, void *, size_t 
> *,
> loff_t *);
>
> +bool isolate_free_huge_page(struct page *page, struct hstate *h, int nid);
> +void putback_isolate_huge_page(struct hstate *h, struct page *page);
>  int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct 
> vm_area_struct *);
>  long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
>  struct page **, struct vm_area_struct **,
> diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h
> index 63e1e9fbcaa2..0da3d1a6f0cc 100644
> --- a/include/linux/page_reporting.h
> +++ b/include/linux/page_reporting.h
> @@ -7,6 +7,7 @@
>
>  /* This value should always be a power of 2, see page_reporting_cycle() */
>  #define PAGE_REPORTING_CAPACITY32
> +#define HUGEPAGE_REPORTING_CAPACITY1
>
>  struct page_reporting_dev_info {
> /* function that alters pages to make them "reported" */
> @@ -26,4 +27,8 @@ struct page_reporting_dev_info {
>  /* Tear-down and bring-up for page reporting devices */
>  void page_reporting_unregister(struct page_reporting_dev_info *prdev);
>  int page_reporting_register(struct page_reporting_dev_info *prdev);
> +
> +/* Tear-down and bring-up for hugepage reporting devices */
> +void hugepage_reporting_unregister(struct page_reporting_dev_info *prdev);
> +int hugepage_reporting_register(struct page_reporting_dev_info *prdev);
>  #endif /*_LINUX_PAGE_REPORTING_H */
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index cbf32d2824fd..de6ce147dfe2 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -41,6 +41,7 @@
>  #include 
>  #include 
>  #include 
> +#include "page_reporting.h"
>  #include "internal.h"
>
>  int hugetlb_max_hstate __read_mostly;
> @@ -1028,6 +1029,11 @@ static void enqueue_huge_page(struct hstate *h, struct 
> page *page)
> list_move(&page->lru, &h->hugepage_freelists[nid]);
> h->free_huge_pages++;
> h->free_huge_pages_node[nid]++;
> +   if (hugepage_reported(page)) {
> +   __ClearPageReported(page);
> +   pr_info("%s, free_huge_pages=%ld\n", __func__, 
> h->free_huge_pages);
> +   }
> +   hugepage_reporting_notify_free(h->order);
>  }
>
>  static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
> @@ -5531,6 +5537,29 @@ follow_huge_pgd(struct mm_struct *mm, unsigned long 
> address, pgd_t *pgd, int fla
> return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> 
> PAGE_SHIFT);
>  }
>
> +bool isolate_free_huge_page(struct page *page, struct hstate *h, int nid)
> +{
> +   bool ret = true;
> +
> +   VM_BUG_ON_PAGE(!PageHead(page), page);
> +
> +   list_move(&page->lru, &h->hugepage_activelist);
> +   set_page_refcounted(page);
> +   h->free_huge_pages--;
> +   h->free_huge_pages_node[nid]--;
> +
> +   return ret;
> +}
> +
> +void putback_isolate_huge_page(struct hstate *h, struct page *page)
> 

Re: [RFC v2 PATCH 0/4] speed up page allocation for __GFP_ZERO

2020-12-22 Thread Alexander Duyck
On Mon, Dec 21, 2020 at 8:25 AM Liang Li  wrote:
>
> The first version can be found at: https://lkml.org/lkml/2020/4/12/42
>
> Zeroing out page content usually happens when allocating pages with
> the __GFP_ZERO flag; this is a time consuming operation and it makes
> the population of a large vma area very slow. This patch introduces
> a new feature for zeroing out pages before page allocation, which can
> help to speed up page allocation with __GFP_ZERO.
>
> My original intention for adding this feature is to shorten VM
> creation time when an SR-IOV device is attached; it works well and the
> VM creation time is reduced by about 90%.
>
> Creating a VM [64G RAM, 32 CPUs] with GPU passthrough
> =
> QEMU use 4K pages, THP is off
>   round1  round2  round3
> w/o this patch:23.5s   24.7s   24.6s
> w/ this patch: 10.2s   10.3s   11.2s
>
> QEMU use 4K pages, THP is on
>   round1  round2  round3
> w/o this patch:17.9s   14.8s   14.9s
> w/ this patch: 1.9s1.8s1.9s
> =
>
> Obviously, it can do more than this. We can benefit from this feature
> in the following cases:

So I am not sure page reporting is the best thing to base this page
zeroing setup on. The idea with page reporting is to essentially act
as a leaky bucket and allow the guest to drop memory it isn't using
slowly so if it needs to reinflate it won't clash with the
applications that need memory. What you are doing here seems far more
aggressive in that you are going down to low order pages and sleeping
instead of rescheduling for the next time interval.

Also I am not sure your SR-IOV creation time test is a good
justification for this extra overhead. With your patches applied all
you are doing is making use of the free time before the test to do the
page zeroing instead of doing it during your test. As such your CPU
overhead prior to running the test would be higher and you haven't
captured that information.

One thing I would be interested in seeing is what load this is adding
when you are running simple memory allocation/free type tests on the
system. For example it might be useful to see what the will-it-scale
page_fault1 tests look like with this patch applied versus not applied.
I suspect it would add a considerable amount of overhead, as you have
to spend a ton of time scanning all the pages.


Re: [PATCH v4 0/4] Improve s0ix flows for systems i219LM

2020-12-14 Thread Alexander Duyck
On Mon, Dec 14, 2020 at 7:35 AM Mario Limonciello
 wrote:
>
> commit e086ba2fccda ("e1000e: disable s0ix entry and exit flows for ME 
> systems")
> disabled s0ix flows for systems that have various incarnations of the
> i219-LM ethernet controller.  This was done because of some regressions
> caused by an earlier
> commit 632fbd5eb5b0e ("e1000e: fix S0ix flows for cable connected case")
> with i219-LM controller.
>
> Per discussion with the Intel architecture team, this direction should be
> changed to allow S0ix flows to be used by default.  This patch series
> includes directional changes based on their conclusions in
> https://lkml.org/lkml/2020/12/13/15.
>
> Changes from v3 to v4:
>  - Drop patch 1 for proper s0i3.2 entry, it was separated and is now
>    merged in kernel
>  - Add patch to only run S0ix flows if shutdown succeeded which was
>    suggested in thread
>  - Adjust series for guidance from https://lkml.org/lkml/2020/12/13/15
>* Revert i219-LM disallow-list.
>* Drop all patches for systems tested by Dell in an allow list
>* Increase ULP timeout to 1000ms
> Changes from v2 to v3:
>  - Correct some grammar and spelling issues caught by Bjorn H.
>* s/s0ix/S0ix/ in all commit messages
>* Fix a typo in commit message
>* Fix capitalization of proper nouns
>  - Add more pre-release systems that pass
>  - Re-order the series to add systems only at the end of the series
>  - Add Fixes tag to a patch in series.
>
> Changes from v1 to v2:
>  - Directly incorporate Vitaly's dependency patch in the series
>  - Split out s0ix code into it's own file
>  - Adjust from DMI matching to PCI subsystem vendor ID/device matching
>  - Remove module parameter and sysfs, use ethtool flag instead.
>  - Export s0ix flag to ethtool private flags
>  - Include more people and lists directly in this submission chain.
>
> Mario Limonciello (4):
>   e1000e: Only run S0ix flows if shutdown succeeded
>   e1000e: bump up timeout to wait when ME un-configure ULP mode
>   Revert "e1000e: disable s0ix entry and exit flows for ME systems"
>   e1000e: Export S0ix flags to ethtool
>
>  drivers/net/ethernet/intel/e1000e/e1000.h   |  1 +
>  drivers/net/ethernet/intel/e1000e/ethtool.c | 40 ++
>  drivers/net/ethernet/intel/e1000e/ich8lan.c |  4 +-
>  drivers/net/ethernet/intel/e1000e/netdev.c  | 59 -
>  4 files changed, 53 insertions(+), 51 deletions(-)
>

The changes look good to me.

Reviewed-by: Alexander Duyck 


[net-next PATCH v3] tcp: Add logic to check for SYN w/ data in tcp_simple_retransmit

2020-12-12 Thread Alexander Duyck
From: Alexander Duyck 

There are cases where a fastopen SYN may trigger either a ICMP_TOOBIG
message in the case of IPv6 or a fragmentation request in the case of
IPv4. This results in the socket stalling for a second or more as it does
not respond to the message by retransmitting the SYN frame.

Normally a SYN frame should not be able to trigger an ICMP_TOOBIG or
ICMP_FRAG_NEEDED; however, in the case of fastopen we can have a frame that
makes use of the entire MSS. In the case of fastopen it does, and an
additional complication is that the retransmit queue doesn't contain the
original frames. As a result when tcp_simple_retransmit is called and
walks the list of frames in the queue it may not mark the frames as lost
because both the SYN and the data packet each individually are smaller than
the MSS size after the adjustment. This results in the socket being stalled
until the retransmit timer kicks in and forces the SYN frame out again
without the data attached.

In order to resolve this we can reduce the MSS the packets are compared
to in tcp_simple_retransmit to -1 for cases where we are still in the
TCP_SYN_SENT state for a fastopen socket. Doing this we will mark all of
the packets related to the fastopen SYN as lost.

Signed-off-by: Alexander Duyck 
---

v2: Changed logic to invalidate all retransmit queue frames if fastopen SYN
v3: Updated commit message to reflect actual solution in 3rd paragraph

 net/ipv4/tcp_input.c |   17 -
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9e8a6c1aa019..e44327a39a1f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2688,7 +2688,22 @@ void tcp_simple_retransmit(struct sock *sk)
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
-   unsigned int mss = tcp_current_mss(sk);
+   int mss;
+
+   /* A fastopen SYN request is stored as two separate packets within
+* the retransmit queue, this is done by tcp_send_syn_data().
+* As a result simply checking the MSS of the frames in the queue
+* will not work for the SYN packet.
+*
+* Us being here is an indication of a path MTU issue so we can
+* assume that the fastopen SYN was lost and just mark all the
+* frames in the retransmit queue as lost. We will use an MSS of
+* -1 to mark all frames as lost, otherwise compute the current MSS.
+*/
+   if (tp->syn_data && sk->sk_state == TCP_SYN_SENT)
+   mss = -1;
+   else
+   mss = tcp_current_mss(sk);
 
	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
if (tcp_skb_seglen(skb) > mss)




[net-next PATCH v2] tcp: Add logic to check for SYN w/ data in tcp_simple_retransmit

2020-12-12 Thread Alexander Duyck
From: Alexander Duyck 

There are cases where a fastopen SYN may trigger either a ICMP_TOOBIG
message in the case of IPv6 or a fragmentation request in the case of
IPv4. This results in the socket stalling for a second or more as it does
not respond to the message by retransmitting the SYN frame.

Normally a SYN frame should not be able to trigger an ICMP_TOOBIG or
ICMP_FRAG_NEEDED; however, in the case of fastopen we can have a frame that
makes use of the entire MSS. In the case of fastopen it does, and an
additional complication is that the retransmit queue doesn't contain the
original frames. As a result when tcp_simple_retransmit is called and
walks the list of frames in the queue it may not mark the frames as lost
because both the SYN and the data packet each individually are smaller than
the MSS size after the adjustment. This results in the socket being stalled
until the retransmit timer kicks in and forces the SYN frame out again
without the data attached.

In order to resolve this we can generate our best estimate for the original
packet size by detecting the fastopen SYN frame and then adding the
overhead for MAX_TCP_OPTION_SPACE and verifying if the SYN w/ data would
have exceeded the MSS. If so we can mark the frame as lost and retransmit
it.

Signed-off-by: Alexander Duyck 
---
 net/ipv4/tcp_input.c |   17 -
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9e8a6c1aa019..e44327a39a1f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2688,7 +2688,22 @@ void tcp_simple_retransmit(struct sock *sk)
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
-   unsigned int mss = tcp_current_mss(sk);
+   int mss;
+
+   /* A fastopen SYN request is stored as two separate packets within
+* the retransmit queue, this is done by tcp_send_syn_data().
+* As a result simply checking the MSS of the frames in the queue
+* will not work for the SYN packet.
+*
+* Us being here is an indication of a path MTU issue so we can
+* assume that the fastopen SYN was lost and just mark all the
+* frames in the retransmit queue as lost. We will use an MSS of
+* -1 to mark all frames as lost, otherwise compute the current MSS.
+*/
+   if (tp->syn_data && sk->sk_state == TCP_SYN_SENT)
+   mss = -1;
+   else
+   mss = tcp_current_mss(sk);
 
	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
if (tcp_skb_seglen(skb) > mss)




Re: [net-next PATCH] tcp: Add logic to check for SYN w/ data in tcp_simple_retransmit

2020-12-12 Thread Alexander Duyck
On Sat, Dec 12, 2020 at 11:07 AM Yuchung Cheng  wrote:
>
> On Sat, Dec 12, 2020 at 11:01 AM Alexander Duyck
>  wrote:
> >
> > On Sat, Dec 12, 2020 at 10:34 AM Yuchung Cheng  wrote:
> > >
> > > On Fri, Dec 11, 2020 at 5:28 PM Alexander Duyck
> > >  wrote:
> > > >
> > > > From: Alexander Duyck 
> > > >
> > > > There are cases where a fastopen SYN may trigger either a ICMP_TOOBIG
> > > > message in the case of IPv6 or a fragmentation request in the case of
> > > > IPv4. This results in the socket stalling for a second or more as it 
> > > > does
> > > > not respond to the message by retransmitting the SYN frame.
> > > >
> > > > Normally a SYN frame should not be able to trigger a ICMP_TOOBIG or
> > > > ICMP_FRAG_NEEDED however in the case of fastopen we can have a frame 
> > > > that
> > > > makes use of the entire MSS. In the case of fastopen it does, and an
> > > > additional complication is that the retransmit queue doesn't contain the
> > > > original frames. As a result when tcp_simple_retransmit is called and
> > > > walks the list of frames in the queue it may not mark the frames as lost
> > > > because both the SYN and the data packet each individually are smaller 
> > > > than
> > > > the MSS size after the adjustment. This results in the socket being 
> > > > stalled
> > > > until the retransmit timer kicks in and forces the SYN frame out again
> > > > without the data attached.
> > > >
> > > > In order to resolve this we can generate our best estimate for the 
> > > > original
> > > > packet size by detecting the fastopen SYN frame and then adding the
> > > > overhead for MAX_TCP_OPTION_SPACE and verifying if the SYN w/ data would
> > > > have exceeded the MSS. If so we can mark the frame as lost and 
> > > > retransmit
> > > > it.
> > > >
> > > > Signed-off-by: Alexander Duyck 
> > > > ---
> > > >  net/ipv4/tcp_input.c |   30 +++---
> > > >  1 file changed, 27 insertions(+), 3 deletions(-)
> > > >
> > > > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > > > index 9e8a6c1aa019..79375b58de84 100644
> > > > --- a/net/ipv4/tcp_input.c
> > > > +++ b/net/ipv4/tcp_input.c
> > > > @@ -2686,11 +2686,35 @@ static void tcp_mtup_probe_success(struct sock 
> > > > *sk)
> > > >  void tcp_simple_retransmit(struct sock *sk)
> > > >  {
> > > > const struct inet_connection_sock *icsk = inet_csk(sk);
> > > > +   struct sk_buff *skb = tcp_rtx_queue_head(sk);
> > > > struct tcp_sock *tp = tcp_sk(sk);
> > > > -   struct sk_buff *skb;
> > > > -   unsigned int mss = tcp_current_mss(sk);
> > > > +   unsigned int mss;
> > > > +
> > > > +   /* A fastopen SYN request is stored as two separate packets 
> > > > within
> > > > +* the retransmit queue, this is done by tcp_send_syn_data().
> > > > +* As a result simply checking the MSS of the frames in the 
> > > > queue
> > > > +* will not work for the SYN packet. So instead we must make a 
> > > > best
> > > > +* effort attempt by validating the data frame with the mss size
> > > > +* that would be computed now by tcp_send_syn_data and comparing
> > > > +* that against the data frame that would have been included 
> > > > with
> > > > +* the SYN.
> > > > +*/
> > > > +   if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN && tp->syn_data) {
> > > > +   struct sk_buff *syn_data = skb_rb_next(skb);
> > > > +
> > > > +   mss = tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) +
> > > > + tp->tcp_header_len - sizeof(struct tcphdr) -
> > > > + MAX_TCP_OPTION_SPACE;
> > > nice comment! The original syn_data mss needs to be inferred which is
> > > a hassle to get right. my sense is path-mtu issue is enough to warrant
> > > they are lost.
> > > I suggest simply mark syn & its data lost if tcp_simple_retransmit is
> > > called during TFO handshake, i.e.
> > >
> > > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > > index

Re: [net-next PATCH] tcp: Add logic to check for SYN w/ data in tcp_simple_retransmit

2020-12-12 Thread Alexander Duyck
On Sat, Dec 12, 2020 at 10:34 AM Yuchung Cheng  wrote:
>
> On Fri, Dec 11, 2020 at 5:28 PM Alexander Duyck
>  wrote:
> >
> > From: Alexander Duyck 
> >
> > There are cases where a fastopen SYN may trigger either a ICMP_TOOBIG
> > message in the case of IPv6 or a fragmentation request in the case of
> > IPv4. This results in the socket stalling for a second or more as it does
> > not respond to the message by retransmitting the SYN frame.
> >
> > Normally a SYN frame should not be able to trigger a ICMP_TOOBIG or
> > ICMP_FRAG_NEEDED however in the case of fastopen we can have a frame that
> > makes use of the entire MSS. In the case of fastopen it does, and an
> > additional complication is that the retransmit queue doesn't contain the
> > original frames. As a result when tcp_simple_retransmit is called and
> > walks the list of frames in the queue it may not mark the frames as lost
> > because both the SYN and the data packet each individually are smaller than
> > the MSS size after the adjustment. This results in the socket being stalled
> > until the retransmit timer kicks in and forces the SYN frame out again
> > without the data attached.
> >
> > In order to resolve this we can generate our best estimate for the original
> > packet size by detecting the fastopen SYN frame and then adding the
> > overhead for MAX_TCP_OPTION_SPACE and verifying if the SYN w/ data would
> > have exceeded the MSS. If so we can mark the frame as lost and retransmit
> > it.
> >
> > Signed-off-by: Alexander Duyck 
> > ---
> >  net/ipv4/tcp_input.c |   30 +++---
> >  1 file changed, 27 insertions(+), 3 deletions(-)
> >
> > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > index 9e8a6c1aa019..79375b58de84 100644
> > --- a/net/ipv4/tcp_input.c
> > +++ b/net/ipv4/tcp_input.c
> > @@ -2686,11 +2686,35 @@ static void tcp_mtup_probe_success(struct sock *sk)
> >  void tcp_simple_retransmit(struct sock *sk)
> >  {
> > const struct inet_connection_sock *icsk = inet_csk(sk);
> > +   struct sk_buff *skb = tcp_rtx_queue_head(sk);
> > struct tcp_sock *tp = tcp_sk(sk);
> > -   struct sk_buff *skb;
> > -   unsigned int mss = tcp_current_mss(sk);
> > +   unsigned int mss;
> > +
> > +   /* A fastopen SYN request is stored as two separate packets within
> > +* the retransmit queue, this is done by tcp_send_syn_data().
> > +* As a result simply checking the MSS of the frames in the queue
> > +* will not work for the SYN packet. So instead we must make a best
> > +* effort attempt by validating the data frame with the mss size
> > +* that would be computed now by tcp_send_syn_data and comparing
> > +* that against the data frame that would have been included with
> > +* the SYN.
> > +*/
> > +   if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN && tp->syn_data) {
> > +   struct sk_buff *syn_data = skb_rb_next(skb);
> > +
> > +   mss = tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) +
> > + tp->tcp_header_len - sizeof(struct tcphdr) -
> > + MAX_TCP_OPTION_SPACE;
> nice comment! The original syn_data mss needs to be inferred which is
> a hassle to get right. my sense is path-mtu issue is enough to warrant
> they are lost.
> I suggest simply mark syn & its data lost if tcp_simple_retransmit is
> called during TFO handshake, i.e.
>
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 62f7aabc7920..7f0c4f2947eb 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -2864,7 +2864,8 @@ void tcp_simple_retransmit(struct sock *sk)
> unsigned int mss = tcp_current_mss(sk);
>
> skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
> -   if (tcp_skb_seglen(skb) > mss)
> +   if (tcp_skb_seglen(skb) > mss ||
> +   (tp->syn_data && sk->sk_state == TCP_SYN_SENT))
> tcp_mark_skb_lost(sk, skb);
> }
>
> We have a TFO packetdrill test that verifies my suggested fix should
> trigger an immediate retransmit vs 1s wait.

Okay, I will go that route, although I will still probably make one
minor cleanup. Instead of testing for syn_data and the state per packet I
will probably keep the bit where I overwrite mss, since it is only used
in the loop. What I can do is switch it from unsigned int to int, since
tcp_skb_seglen already returns a signed int anyway. Then I can just set
mss to -1 in the syn_data && TCP_SYN_SENT case. That way all of the
frames in the queue should fail the check while only having to add one
initial check outside the loop.
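A minimal sketch of that cleanup, reusing the skb_rbtree_walk() loop from the diff above (illustrative only, not necessarily the exact patch that was merged):

void tcp_simple_retransmit(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
        /* signed on purpose: -1 makes every frame compare as "too big" */
        int mss = tcp_current_mss(sk);

        /* A fastopen SYN and its data are each smaller than the reduced
         * MSS, so force the per-frame check below to mark them all lost.
         */
        if (tp->syn_data && sk->sk_state == TCP_SYN_SENT)
                mss = -1;

        skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
                if (tcp_skb_seglen(skb) > mss)
                        tcp_mark_skb_lost(sk, skb);
        }

        /* ... remainder of tcp_simple_retransmit() unchanged ... */
}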


[net-next PATCH] tcp: Add logic to check for SYN w/ data in tcp_simple_retransmit

2020-12-11 Thread Alexander Duyck
From: Alexander Duyck 

There are cases where a fastopen SYN may trigger either an ICMP_TOOBIG
message in the case of IPv6 or a fragmentation request in the case of
IPv4. This results in the socket stalling for a second or more as it does
not respond to the message by retransmitting the SYN frame.

Normally a SYN frame should not be able to trigger an ICMP_TOOBIG or
ICMP_FRAG_NEEDED; however, in the case of fastopen we can have a frame that
makes use of the entire MSS. An additional complication is that the
retransmit queue doesn't contain the original frames. As a result, when
tcp_simple_retransmit is called and walks the list of frames in the queue
it may not mark the frames as lost, because the SYN and the data packet are
each individually smaller than the MSS size after the adjustment. This
results in the socket being stalled until the retransmit timer kicks in and
forces the SYN frame out again without the data attached.

In order to resolve this we can generate our best estimate for the original
packet size by detecting the fastopen SYN frame, adding the overhead for
MAX_TCP_OPTION_SPACE, and checking whether the SYN w/ data would have
exceeded the MSS. If so we can mark the frame as lost and retransmit it.

Signed-off-by: Alexander Duyck 
---
 net/ipv4/tcp_input.c |   30 +++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9e8a6c1aa019..79375b58de84 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2686,11 +2686,35 @@ static void tcp_mtup_probe_success(struct sock *sk)
 void tcp_simple_retransmit(struct sock *sk)
 {
const struct inet_connection_sock *icsk = inet_csk(sk);
+   struct sk_buff *skb = tcp_rtx_queue_head(sk);
struct tcp_sock *tp = tcp_sk(sk);
-   struct sk_buff *skb;
-   unsigned int mss = tcp_current_mss(sk);
+   unsigned int mss;
+
+   /* A fastopen SYN request is stored as two separate packets within
+* the retransmit queue, this is done by tcp_send_syn_data().
+* As a result simply checking the MSS of the frames in the queue
+* will not work for the SYN packet. So instead we must make a best
+* effort attempt by validating the data frame with the mss size
+* that would be computed now by tcp_send_syn_data and comparing
+* that against the data frame that would have been included with
+* the SYN.
+*/
+   if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN && tp->syn_data) {
+   struct sk_buff *syn_data = skb_rb_next(skb);
+
+   mss = tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) +
+ tp->tcp_header_len - sizeof(struct tcphdr) -
+ MAX_TCP_OPTION_SPACE;
 
-   skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
+   if (syn_data && syn_data->len > mss)
+   tcp_mark_skb_lost(sk, skb);
+
+   skb = syn_data;
+   } else {
+   mss = tcp_current_mss(sk);
+   }
+
+   skb_rbtree_walk_from(skb) {
if (tcp_skb_seglen(skb) > mss)
tcp_mark_skb_lost(sk, skb);
}




Re: [net PATCH] tcp: Mark fastopen SYN packet as lost when receiving ICMP_TOOBIG/ICMP_FRAG_NEEDED

2020-12-11 Thread Alexander Duyck
On Fri, Dec 11, 2020 at 11:18 AM Eric Dumazet  wrote:
>
> On Fri, Dec 11, 2020 at 6:15 PM Alexander Duyck
>  wrote:
> >
> > On Fri, Dec 11, 2020 at 8:22 AM Eric Dumazet  wrote:
> > >
> > > On Fri, Dec 11, 2020 at 5:03 PM Alexander Duyck
> > >  wrote:
> > >
> > > > That's fine. I can target this for net-next. I had just selected net
> > > > since I had considered it a fix, but I suppose it could be considered
> > > > a behavioral change.
> > >
> > > We are very late in the 5.10 cycle, and we never handled ICMP in this
> > > state, so net-next is definitely better.
> > >
> > > Note that RFC 7413 states in 4.1.3 :
> > >
> > >  The client MUST cache cookies from servers for later Fast Open
> > >connections.  For a multihomed client, the cookies are dependent on
> > >the client and server IP addresses.  Hence, the client should cache
> > >at most one (most recently received) cookie per client and server IP
> > >address pair.
> > >
> > >When caching cookies, we recommend that the client also cache the
> > >Maximum Segment Size (MSS) advertised by the server.  The client can
> > >cache the MSS advertised by the server in order to determine the
> > >maximum amount of data that the client can fit in the SYN packet in
> > >subsequent TFO connections.  Caching the server MSS is useful
> > >because, with Fast Open, a client sends data in the SYN packet before
> > >the server announces its MSS in the SYN-ACK packet.  If the client
> > >sends more data in the SYN packet than the server will accept, this
> > >will likely require the client to retransmit some or all of the data.
> > >Hence, caching the server MSS can enhance performance.
> > >
> > >Without a cached server MSS, the amount of data in the SYN packet is
> > >limited to the default MSS of 536 bytes for IPv4 [RFC1122] and 1220
> > >bytes for IPv6 [RFC2460].  Even if the client complies with this
> > >limit when sending the SYN, it is known that an IPv4 receiver
> > >advertising an MSS less than 536 bytes can receive a segment larger
> > >than it is expecting.
> > >
> > >If the cached MSS is larger than the typical size (1460 bytes for
> > >IPv4 or 1440 bytes for IPv6), then the excess data in the SYN packet
> > >may cause problems that offset the performance benefit of Fast Open.
> > >For example, the unusually large SYN may trigger IP fragmentation and
> > >may confuse firewalls or middleboxes, causing SYN retransmission and
> > >other side effects.  Therefore, the client MAY limit the cached MSS
> > >to 1460 bytes for IPv4 or 1440 for IPv6.
> > >
> > >
> > > Relying on ICMP is fragile, since they can be filtered in some way.
> >
> > In this case I am not relying on the ICMP, but thought that since I
> > have it I should make use of it. Without the ICMP we would still just
> > be waiting on the retransmit timer.
> >
> > The problem case has a v6-in-v6 tunnel between the client and the
> > endpoint so both ends assume an MTU 1500 and advertise a 1440 MSS
> > which works fine until they actually go to send a large packet between
> > the two. At that point the tunnel is triggering an ICMP_TOOBIG and the
> > endpoint is stalling since the MSS is dropped to 1400, but the SYN and
> > data payload were already smaller than that so no retransmits are
> > being triggered. This results in TFO being 1s slower than non-TFO
> > because of the failure to trigger the retransmit for the frame that
> > violated the PMTU. The patch is meant to get the two back into
> > comparable times.
>
> Okay... Have you studied why tcp_v4_mtu_reduced() (and IPv6 equivalent)
> code does not yet handle the retransmit in TCP_SYN_SENT state ?

The problem lies in tcp_simple_retransmit(). Specifically, the loop at
the start of the function checks the retransmit queue to see if
there are any packets larger than the MSS and finds none, since we don't
place the SYN w/ data in there and instead have a separate SYN and
data packet.

I'm debating whether I should take an alternative approach and modify the
loop at the start of tcp_simple_retransmit() to add a check for a SYN
packet with tp->syn_data set, and then compare the next frame length
plus MAX_TCP_OPTION_SPACE against the mss.


Re: [net PATCH] tcp: Mark fastopen SYN packet as lost when receiving ICMP_TOOBIG/ICMP_FRAG_NEEDED

2020-12-11 Thread Alexander Duyck
On Fri, Dec 11, 2020 at 8:22 AM Eric Dumazet  wrote:
>
> On Fri, Dec 11, 2020 at 5:03 PM Alexander Duyck
>  wrote:
>
> > That's fine. I can target this for net-next. I had just selected net
> > since I had considered it a fix, but I suppose it could be considered
> > a behavioral change.
>
> We are very late in the 5.10 cycle, and we never handled ICMP in this
> state, so net-next is definitely better.
>
> Note that RFC 7413 states in 4.1.3 :
>
>  The client MUST cache cookies from servers for later Fast Open
>connections.  For a multihomed client, the cookies are dependent on
>the client and server IP addresses.  Hence, the client should cache
>at most one (most recently received) cookie per client and server IP
>address pair.
>
>When caching cookies, we recommend that the client also cache the
>Maximum Segment Size (MSS) advertised by the server.  The client can
>cache the MSS advertised by the server in order to determine the
>maximum amount of data that the client can fit in the SYN packet in
>subsequent TFO connections.  Caching the server MSS is useful
>because, with Fast Open, a client sends data in the SYN packet before
>the server announces its MSS in the SYN-ACK packet.  If the client
>sends more data in the SYN packet than the server will accept, this
>will likely require the client to retransmit some or all of the data.
>Hence, caching the server MSS can enhance performance.
>
>Without a cached server MSS, the amount of data in the SYN packet is
>limited to the default MSS of 536 bytes for IPv4 [RFC1122] and 1220
>bytes for IPv6 [RFC2460].  Even if the client complies with this
>limit when sending the SYN, it is known that an IPv4 receiver
>advertising an MSS less than 536 bytes can receive a segment larger
>than it is expecting.
>
>If the cached MSS is larger than the typical size (1460 bytes for
>IPv4 or 1440 bytes for IPv6), then the excess data in the SYN packet
>may cause problems that offset the performance benefit of Fast Open.
>For example, the unusually large SYN may trigger IP fragmentation and
>may confuse firewalls or middleboxes, causing SYN retransmission and
>other side effects.  Therefore, the client MAY limit the cached MSS
>to 1460 bytes for IPv4 or 1440 for IPv6.
>
>
> Relying on ICMP is fragile, since they can be filtered in some way.

In this case I am not relying on the ICMP, but thought that since I
have it I should make use of it. Without the ICMP we would still just
be waiting on the retransmit timer.

The problem case has a v6-in-v6 tunnel between the client and the
endpoint so both ends assume an MTU 1500 and advertise a 1440 MSS
which works fine until they actually go to send a large packet between
the two. At that point the tunnel is triggering an ICMP_TOOBIG and the
endpoint is stalling since the MSS is dropped to 1400, but the SYN and
data payload were already smaller than that so no retransmits are
being triggered. This results in TFO being 1s slower than non-TFO
because of the failure to trigger the retransmit for the frame that
violated the PMTU. The patch is meant to get the two back into
comparable times.


Re: [net PATCH] tcp: Mark fastopen SYN packet as lost when receiving ICMP_TOOBIG/ICMP_FRAG_NEEDED

2020-12-11 Thread Alexander Duyck
On Thu, Dec 10, 2020 at 10:24 PM Eric Dumazet  wrote:
>
> On Fri, Dec 11, 2020 at 2:55 AM Alexander Duyck
>  wrote:
> >
> > From: Alexander Duyck 
> >
> > In the case of a fastopen SYN there are cases where it may trigger either an
> > ICMP_TOOBIG message in the case of IPv6 or a fragmentation request in the
> > case of IPv4. This results in the socket stalling for a second or more as
> > it does not respond to the message by retransmitting the SYN frame.
> >
> > Normally a SYN frame should not be able to trigger an ICMP_TOOBIG or
> > ICMP_FRAG_NEEDED; however, in the case of fastopen we can have a frame that
> > makes use of the entire MTU. An additional complication is that the
> > retransmit queue doesn't contain the original frames. As a result, when
> > tcp_simple_retransmit is called and walks the list of frames in the queue
> > it may not mark the frames as lost, because the SYN and the data packet are
> > each individually smaller than the MSS size after the adjustment. This
> > results in the socket being stalled until the retransmit timer kicks in and
> > forces the SYN frame out again without the data attached.
> >
> > In order to resolve this we need to mark the SYN frame as lost if it is the
> > first packet in the queue. Doing this allows the socket to recover much
> > more quickly without the retransmit timeout stall.
> >
> > Signed-off-by: Alexander Duyck 
>
>
> I do not think it is a net candidate, but net-next
>
> Yuchung might correct me, but I think the TCP Fastopen standard was very
> conservative about the payload length in the SYN packet
>
> So receiving an ICMP was never considered.

That's fine. I can target this for net-next. I had just selected net
since I had considered it a fix, but I suppose it could be considered
a behavioral change.

> > ---
> >  include/net/tcp.h|1 +
> >  net/ipv4/tcp_input.c |8 
> >  net/ipv4/tcp_ipv4.c  |6 ++
> >  net/ipv6/tcp_ipv6.c  |4 
> >  4 files changed, 19 insertions(+)
> >
> > diff --git a/include/net/tcp.h b/include/net/tcp.h
> > index d4ef5bf94168..6181ad98727a 100644
> > --- a/include/net/tcp.h
>
>
> > +++ b/net/ipv4/tcp_ipv4.c
> > @@ -546,6 +546,12 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
> > if (sk->sk_state == TCP_LISTEN)
> > goto out;
> >
> > +   /* fastopen SYN may have triggered the fragmentation
> > +* request. Mark the SYN or SYN/ACK as lost.
> > +*/
> > +   if (sk->sk_state == TCP_SYN_SENT)
> > +   tcp_mark_syn_lost(sk);
>
> This is going to crash in some cases, you do not know if you own the socket.
> (Look a few lines below)

Okay, I will look into moving this down into the block below, since I
assume that if the socket is owned by the user we cannot make these changes.
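A minimal sketch of that move for the IPv4 path, assuming the tcp_mark_syn_lost() helper from the patch further down in this thread (illustrative only):

        tp->mtu_info = info;
        if (!sock_owned_by_user(sk)) {
                /* We own the socket here, so it is safe to mark the
                 * fastopen SYN (or SYN/ACK) lost before handling the
                 * MTU reduction.
                 */
                if (sk->sk_state == TCP_SYN_SENT)
                        tcp_mark_syn_lost(sk);
                tcp_v4_mtu_reduced(sk);
        }
        /* ... the sock_owned_by_user() case defers the work as before ... */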

> > +
> > tp->mtu_info = info;
> > if (!sock_owned_by_user(sk)) {
> > tcp_v4_mtu_reduced(sk);
> > diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> > index 992cbf3eb9e3..d7b1346863e3 100644
> > --- a/net/ipv6/tcp_ipv6.c
> > +++ b/net/ipv6/tcp_ipv6.c
> > @@ -443,6 +443,10 @@ static int tcp_v6_err(struct sk_buff *skb, struct 
> > inet6_skb_parm *opt,
> > if (!ip6_sk_accept_pmtu(sk))
> > goto out;
> >
> > +   /* fastopen SYN may have triggered TOOBIG, mark it lost. */
> > +   if (sk->sk_state == TCP_SYN_SENT)
> > +   tcp_mark_syn_lost(sk);
>
>
> Same issue here.

I'll move this one too.

> > +
> > tp->mtu_info = ntohl(info);
> > if (!sock_owned_by_user(sk))
> > tcp_v6_mtu_reduced(sk);
> >
> >


[net PATCH] tcp: Mark fastopen SYN packet as lost when receiving ICMP_TOOBIG/ICMP_FRAG_NEEDED

2020-12-10 Thread Alexander Duyck
From: Alexander Duyck 

In the case of a fastopen SYN there are cases where it may trigger either an
ICMP_TOOBIG message in the case of IPv6 or a fragmentation request in the
case of IPv4. This results in the socket stalling for a second or more as
it does not respond to the message by retransmitting the SYN frame.

Normally a SYN frame should not be able to trigger an ICMP_TOOBIG or
ICMP_FRAG_NEEDED; however, in the case of fastopen we can have a frame that
makes use of the entire MTU. An additional complication is that the
retransmit queue doesn't contain the original frames. As a result, when
tcp_simple_retransmit is called and walks the list of frames in the queue
it may not mark the frames as lost, because the SYN and the data packet are
each individually smaller than the MSS size after the adjustment. This
results in the socket being stalled until the retransmit timer kicks in and
forces the SYN frame out again without the data attached.

In order to resolve this we need to mark the SYN frame as lost if it is the
first packet in the queue. Doing this allows the socket to recover much
more quickly without the retransmit timeout stall.

Signed-off-by: Alexander Duyck 
---
 include/net/tcp.h|1 +
 net/ipv4/tcp_input.c |8 
 net/ipv4/tcp_ipv4.c  |6 ++
 net/ipv6/tcp_ipv6.c  |4 
 4 files changed, 19 insertions(+)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index d4ef5bf94168..6181ad98727a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2062,6 +2062,7 @@ void tcp_init(void);
 
 /* tcp_recovery.c */
 void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb);
+void tcp_mark_syn_lost(struct sock *sk);
 void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced);
 extern s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb,
u32 reo_wnd);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 389d1b340248..d0c5248bc4e1 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1066,6 +1066,14 @@ void tcp_mark_skb_lost(struct sock *sk, struct sk_buff 
*skb)
}
 }
 
+void tcp_mark_syn_lost(struct sock *sk)
+{
+   struct sk_buff *skb = tcp_rtx_queue_head(sk);
+
+   if (skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+   tcp_mark_skb_lost(sk, skb);
+}
+
 /* Updates the delivered and delivered_ce counts */
 static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
bool ece_ack)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 8391aa29e7a4..ad62fe029646 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -546,6 +546,12 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
if (sk->sk_state == TCP_LISTEN)
goto out;
 
+   /* fastopen SYN may have triggered the fragmentation
+* request. Mark the SYN or SYN/ACK as lost.
+*/
+   if (sk->sk_state == TCP_SYN_SENT)
+   tcp_mark_syn_lost(sk);
+
tp->mtu_info = info;
if (!sock_owned_by_user(sk)) {
tcp_v4_mtu_reduced(sk);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 992cbf3eb9e3..d7b1346863e3 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -443,6 +443,10 @@ static int tcp_v6_err(struct sk_buff *skb, struct 
inet6_skb_parm *opt,
if (!ip6_sk_accept_pmtu(sk))
goto out;
 
+   /* fastopen SYN may have triggered TOOBIG, mark it lost. */
+   if (sk->sk_state == TCP_SYN_SENT)
+   tcp_mark_syn_lost(sk);
+
tp->mtu_info = ntohl(info);
if (!sock_owned_by_user(sk))
tcp_v6_mtu_reduced(sk);




Re: [PATCH v3 0/7] Improve s0ix flows for systems i219LM

2020-12-09 Thread Alexander Duyck
On Wed, Dec 9, 2020 at 6:44 AM Hans de Goede  wrote:
>
> Hi,
>
> On 12/8/20 5:14 PM, Alexander Duyck wrote:
> > On Tue, Dec 8, 2020 at 1:30 AM Hans de Goede  wrote:
> >>
> >> Hi,
> >>
> >> On 12/8/20 6:08 AM, Neftin, Sasha wrote:
> >>> On 12/7/2020 17:41, Limonciello, Mario wrote:
> >>>>> First of all thank you for working on this.
> >>>>>
> >>>>> I must say though that I don't like the approach taken here very
> >>>>> much.
> >>>>>
> >>>>> This is not so much a criticism of this series as it is a criticism
> >>>>> of the earlier decision to simply disable s0ix on all devices
> >>>>> with the i219-LM + and active ME.
> >>>>
> >>>> I was not happy with that decision either as it did cause regressions
> >>>> on all of the "named" Comet Lake laptops that were in the market at
> >>>> the time.  The "unnamed" ones are not yet released, and I don't feel
> >>>> it's fair to call it a regression on "unreleased" hardware.
> >>>>
> >>>>>
> >>>>> AFAIK there was a perfectly acceptable patch to workaround those
> >>>>> broken devices, which increased a timeout:
> >>>>> https://patchwork.ozlabs.org/project/intel-wired-
> >>>>> lan/patch/20200323191639.48826-1-aaron...@canonical.com/
> >>>>>
> >>>>> That patch was nacked because it increased the resume time
> >>>>> *on broken devices*.
> >>>>>
> >>> Officially CSME/ME is not POR for Linux and we have no interface to the ME.
> >>> Nobody can tell how long (and why) the ME will hold the PHY access semaphore,
> >>> and just increasing the resume time (ULP configure) won't solve the
> >>> problem. This is not a reliable approach.
> >>> I would agree users can add ME systems on their own responsibility.
> >>
> >> It is not clear to me what you are trying to say here.
> >
> > Based on the earlier thread you had referenced and his comment here it
> > sounds like while adding time will work for most cases, it doesn't
> > solve it for all cases.
>
> AFAIK there are 0 documented cases where the suspend/resume issue
> continues to be a problem after the timeout has been increased.
>
> If you know of actual documented cases (rather than this just being
> a theoretical problem), then please provide links to those cases.

If there are such notes I wouldn't have access to them. Do we know if
any sort of errata document has been posted for this issue by Intel?
That would be where an explanation of the problems and the reasoning
behind the workaround would be defined. Without that I am just
speculating based off of what has been said here and in the other
thread.

> > The problem is as a vendor you are usually
> > stuck looking for a solution that will work for all cases which can
> > lead to things like having to drop features because they can be
> > problematic for a few cases.
>
> I disagree, there will/might always be some broken corner case
> laptop-model / hw-design out there on which a feature breaks. Simply
> disabling all features which might cause problems in "a few cases"
> would mean that we pretty much have to disable over half the features
> in the kernel.
>
> Take for example SATA NCQ (command queuing): this is known to not work
> on some devices, to the point where with some buggy firmware
> it may cause full system hangs and/or data corruption. So this is
> a much bigger problem than the "system won't suspend" issue we
> are talking about here. Still the ATA subsys maintainers have enabled
> this by default because it is an important feature to have and they
> are using a deny-list to avoid enabling this on known broken hardware;
> and yes, every now and then we need to add a new model to the deny-list.
>
> And the same for SATA ALPM support (a power-management feature like s0ix)
> that is enabled by default too, combined with a deny-list.
> I'm very familiar with the ALPM case since I pushed for it being
> enabled by default and I've done most of the maintenance work
> of the deny-list since it was enabled by default.
>
> The kernel is full of this pattern: we don't disable an important
> feature (and power-management is important) just because it causes
> issues in "a few cases". And again, you say "a few cases"
> but I know of 0 documented cases where this issue is still a problem
> after bumping the timeout.

It al

Re: [PATCH v3 0/7] Improve s0ix flows for systems i219LM

2020-12-08 Thread Alexander Duyck
On Tue, Dec 8, 2020 at 1:30 AM Hans de Goede  wrote:
>
> Hi,
>
> On 12/8/20 6:08 AM, Neftin, Sasha wrote:
> > On 12/7/2020 17:41, Limonciello, Mario wrote:
> >>> First of all thank you for working on this.
> >>>
> >>> I must say though that I don't like the approach taken here very
> >>> much.
> >>>
> >>> This is not so much a criticism of this series as it is a criticism
> >>> of the earlier decision to simply disable s0ix on all devices
> >>> with the i219-LM + and active ME.
> >>
> >> I was not happy with that decision either as it did cause regressions
> >> on all of the "named" Comet Lake laptops that were in the market at
> >> the time.  The "unnamed" ones are not yet released, and I don't feel
> >> it's fair to call it a regression on "unreleased" hardware.
> >>
> >>>
> >>> AFAIK there was a perfectly acceptable patch to workaround those
> >>> broken devices, which increased a timeout:
> >>> https://patchwork.ozlabs.org/project/intel-wired-
> >>> lan/patch/20200323191639.48826-1-aaron...@canonical.com/
> >>>
> >>> That patch was nacked because it increased the resume time
> >>> *on broken devices*.
> >>>
> > Officially CSME/ME is not POR for Linux and we have no interface to the ME.
> > Nobody can tell how long (and why) the ME will hold the PHY access semaphore,
> > and just increasing the resume time (ULP configure) won't solve the
> > problem. This is not a reliable approach.
> > I would agree users can add ME systems on their own responsibility.
>
> It is not clear to me what you are trying to say here.

Based on the earlier thread you had referenced and his comment here it
sounds like while adding time will work for most cases, it doesn't
solve it for all cases. The problem is as a vendor you are usually
stuck looking for a solution that will work for all cases which can
lead to things like having to drop features because they can be
problematic for a few cases.

> Are you saying that you insist on keeping the e1000e_check_me check and
> thus needlessly penalizing 100s of laptop models with higher
> power-consumption unless these 100s of laptops are added manually
> to an allow list for this?
>
> I'm sorry but that is simply unacceptable, the maintenance burden
> of that is just way too high.

Think about this the other way though. If it is enabled and there are
cases where adding a delay doesn't resolve it, then it still doesn't
really solve the issue, does it?

> Testing on the models where the timeout issue was first hit has
> shown that increasing the timeout does actually fix it on those
> models. Sure in theory the ME on some buggy model could hold the
> semaphore even longer, but then the right thing would be to
> have a deny-list for s0ix where we can add those buggy models
> (none of which we have encountered so far), just like we have
> deny-lists for buggy hw in other places in the kernel.

This would actually have a higher maintenance burden than just
disabling the feature. Having to individually test for and deny-list
every one-off system with this bad configuration would be a pretty
significant burden. That also implies somebody would have access to
such systems and that is not normally the case. Even Intel doesn't
have all possible systems that would include this NIC.

> Maintaining an ever-growing allow-list for the *theoretical*
> case of encountering a model where things do not work with
> the increased timeout is not workable and thus not an
> acceptable solution.

I'm not a fan of the allow-list either, but it is preferable to a
deny-list where you have to first trigger the bug before you realize
it is there. Ideally there should be another solution in which the ME
could somehow set a flag somewhere in the hardware to indicate that it
is alive and the driver could read that in order to determine if the ME
is actually alive and can skip this workaround. Then this could all be
avoided and it can be safely assumed the system is working correctly.
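Purely to illustrate that idea (the register and bit below are hypothetical, not an existing e1000e interface):

/* Hypothetical: E1000_ME_STATUS / E1000_ME_STATUS_ALIVE stand in for
 * whatever status bit the ME would expose to signal that it is alive.
 */
static bool e1000e_me_alive(struct e1000_hw *hw)
{
        u32 status = er32(ME_STATUS);

        return !!(status & E1000_ME_STATUS_ALIVE);
}

If such a bit existed, the driver could skip the device-ID based check_me workaround whenever e1000e_me_alive() returns true.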

> The initial addition of the e1000e_check_me check instead
> of just going with the confirmed fix of bumping the timeout
> was already highly controversial and should IMHO never have
> been done.

How big was the sample size for the "confirmed" fix? How many
different vendors were there within the mix? The problem is that while it
may have worked for the case you encountered, you cannot say with
certainty that it worked in all cases unless you had samples of all
the different hardware out there.

> Combining this with an ever-growing allow-list on which every
> new laptop model needs to be added separately + a new
> "s0ix-enabled" ethtool flag, whose existence is basically
> an admission that the allow-list approach is flawed, goes
> from controversial to just plain not acceptable.

I don't view this as problematic; however, there is some overhead to it.
One thing I don't know is whether anyone has looked at whether the issue only
applies to a few specific system vendors. Currently the allow-list is
based on the subdevice ID. One thing we could look at doing 

Re: [PATCH v3 0/7] Improve s0ix flows for systems i219LM

2020-12-06 Thread Alexander Duyck
On Sat, Dec 5, 2020 at 3:49 PM Jakub Kicinski  wrote:
>
> On Fri, 4 Dec 2020 14:38:03 -0800 Alexander Duyck wrote:
> > > > The patches look good to me. Just need to address the minor issue that
> > > > seems to have been present prior to the introduction of this patch
> > > > set.
> > > >
> > > > Reviewed-by: Alexander Duyck 
> > >
> > > Thanks for your review.  Just some operational questions - since this 
> > > previously
> > > existed do you want me to re-spin the series to a v4 for this, or should 
> > > it be
> > > a follow up after the series?
> > >
> > > If I respin it, would you prefer that change to occur at the start or end
> > > of the series?
> >
> > I don't need a respin, but if you are going to fix it you should
> > probably put out the patch as something like a 8/7. If you respin it
> > should happen near the start of the series as it is a bug you are
> > addressing.
>
> Don't we need that patch to be before this series so it can be
> back ported easily? Or is it not really a bug?

You're right. For backports it would make it easier to have the patch
be before the changes. As far as being a bug, it is one, but it isn't
an urgent bug as it is basically some bad exception handling so the
likelihood of seeing it should be quite low.


Re: [PATCH v3 0/7] Improve s0ix flows for systems i219LM

2020-12-04 Thread Alexander Duyck
On Fri, Dec 4, 2020 at 2:28 PM Limonciello, Mario
 wrote:
>
> > -Original Message-
> > From: Alexander Duyck 
> > Sent: Friday, December 4, 2020 15:27
> > To: Limonciello, Mario
> > Cc: Jeff Kirsher; Tony Nguyen; intel-wired-lan; LKML; Linux PM; Netdev; 
> > Jakub
> > Kicinski; Sasha Netfin; Aaron Brown; Stefan Assmann; David Miller; David
> > Arcari; Shen, Yijun; Yuan, Perry; anthony.w...@canonical.com
> > Subject: Re: [PATCH v3 0/7] Improve s0ix flows for systems i219LM
> >
> >
> > [EXTERNAL EMAIL]
> >
> > On Fri, Dec 4, 2020 at 12:09 PM Mario Limonciello
> >  wrote:
> > >
> > > commit e086ba2fccda ("e1000e: disable s0ix entry and exit flows for ME
> > systems")
> > > disabled s0ix flows for systems that have various incarnations of the
> > > i219-LM ethernet controller.  This was done because of some regressions
> > > caused by an earlier
> > > commit 632fbd5eb5b0e ("e1000e: fix S0ix flows for cable connected case")
> > > with i219-LM controller.
> > >
> > > Performing suspend to idle with these ethernet controllers requires a
> > properly
> > > configured system.  To make enabling such systems easier, this patch
> > > series allows determining if enabled and turning on using ethtool.
> > >
> > > The flows have also been confirmed to be configured correctly on Dell's
> > Latitude
> > > and Precision CML systems containing the i219-LM controller, when the 
> > > kernel
> > also
> > > contains the fix for s0i3.2 entry previously submitted here and now part 
> > > of
> > this
> > > series.
> > > https://marc.info/?l=linux-netdev=160677194809564=2
> > >
> > > Patches 4 through 7 will turn the behavior on by default for some of 
> > > Dell's
> > > CML and TGL systems.
> >
> > The patches look good to me. Just need to address the minor issue that
> > seems to have been present prior to the introduction of this patch
> > set.
> >
> > Reviewed-by: Alexander Duyck 
>
> Thanks for your review.  Just some operational questions - since this 
> previously
> existed do you want me to re-spin the series to a v4 for this, or should it be
> a follow up after the series?
>
> If I respin it, would you prefer that change to occur at the start or end
> of the series?

I don't need a respin, but if you are going to fix it you should
probably put out the patch as something like a 8/7. If you respin it
should happen near the start of the series as it is a bug you are
addressing.


Re: [PATCH v3 0/7] Improve s0ix flows for systems i219LM

2020-12-04 Thread Alexander Duyck
On Fri, Dec 4, 2020 at 12:09 PM Mario Limonciello
 wrote:
>
> commit e086ba2fccda ("e1000e: disable s0ix entry and exit flows for ME 
> systems")
> disabled s0ix flows for systems that have various incarnations of the
> i219-LM ethernet controller.  This was done because of some regressions
> caused by an earlier
> commit 632fbd5eb5b0e ("e1000e: fix S0ix flows for cable connected case")
> with i219-LM controller.
>
> Performing suspend to idle with these ethernet controllers requires a properly
> configured system.  To make enabling such systems easier, this patch
> series allows determining if enabled and turning on using ethtool.
>
> The flows have also been confirmed to be configured correctly on Dell's 
> Latitude
> and Precision CML systems containing the i219-LM controller, when the kernel 
> also
> contains the fix for s0i3.2 entry previously submitted here and now part of 
> this
> series.
> https://marc.info/?l=linux-netdev=160677194809564=2
>
> Patches 4 through 7 will turn the behavior on by default for some of Dell's
> CML and TGL systems.

The patches look good to me. Just need to address the minor issue that
seems to have been present prior to the introduction of this patch
set.

Reviewed-by: Alexander Duyck 


Re: [PATCH v3 2/7] e1000e: Move all S0ix related code into its own source file

2020-12-04 Thread Alexander Duyck
On Fri, Dec 4, 2020 at 12:09 PM Mario Limonciello
 wrote:
>
> Introduce a flag to indicate the device should be using the S0ix
> flows and use this flag to run those functions.
>
> Splitting the code into its own file will make future heuristics
> more self-contained.
>
> Tested-by: Yijun Shen 
> Signed-off-by: Mario Limonciello 

One minor issue pointed out below.

> ---
>  drivers/net/ethernet/intel/e1000e/Makefile |   2 +-
>  drivers/net/ethernet/intel/e1000e/e1000.h  |   4 +
>  drivers/net/ethernet/intel/e1000e/netdev.c | 272 +---
>  drivers/net/ethernet/intel/e1000e/s0ix.c   | 280 +
>  4 files changed, 290 insertions(+), 268 deletions(-)
>  create mode 100644 drivers/net/ethernet/intel/e1000e/s0ix.c
>
> diff --git a/drivers/net/ethernet/intel/e1000e/Makefile 
> b/drivers/net/ethernet/intel/e1000e/Makefile
> index 44e58b6e7660..f2332c01f86c 100644
> --- a/drivers/net/ethernet/intel/e1000e/Makefile
> +++ b/drivers/net/ethernet/intel/e1000e/Makefile
> @@ -9,5 +9,5 @@ obj-$(CONFIG_E1000E) += e1000e.o
>
>  e1000e-objs := 82571.o ich8lan.o 80003es2lan.o \
>mac.o manage.o nvm.o phy.o \
> -  param.o ethtool.o netdev.o ptp.o
> +  param.o ethtool.o netdev.o s0ix.o ptp.o
>
> diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h 
> b/drivers/net/ethernet/intel/e1000e/e1000.h
> index ba7a0f8f6937..b13f956285ae 100644
> --- a/drivers/net/ethernet/intel/e1000e/e1000.h
> +++ b/drivers/net/ethernet/intel/e1000e/e1000.h
> @@ -436,6 +436,7 @@ s32 e1000e_get_base_timinca(struct e1000_adapter 
> *adapter, u32 *timinca);
>  #define FLAG2_DFLT_CRC_STRIPPING  BIT(12)
>  #define FLAG2_CHECK_RX_HWTSTAMP   BIT(13)
>  #define FLAG2_CHECK_SYSTIM_OVERFLOW   BIT(14)
> +#define FLAG2_ENABLE_S0IX_FLOWS   BIT(15)
>
>  #define E1000_RX_DESC_PS(R, i) \
> (&(((union e1000_rx_desc_packet_split *)((R).desc))[i]))
> @@ -462,6 +463,9 @@ enum latency_range {
>  extern char e1000e_driver_name[];
>
>  void e1000e_check_options(struct e1000_adapter *adapter);
> +void e1000e_s0ix_entry_flow(struct e1000_adapter *adapter);
> +void e1000e_s0ix_exit_flow(struct e1000_adapter *adapter);
> +void e1000e_maybe_enable_s0ix(struct e1000_adapter *adapter);
>  void e1000e_set_ethtool_ops(struct net_device *netdev);
>
>  int e1000e_open(struct net_device *netdev);
> diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c 
> b/drivers/net/ethernet/intel/e1000e/netdev.c
> index 128ab6898070..cd9839e86615 100644
> --- a/drivers/net/ethernet/intel/e1000e/netdev.c
> +++ b/drivers/net/ethernet/intel/e1000e/netdev.c



>  static int e1000e_pm_freeze(struct device *dev)
>  {
> struct net_device *netdev = dev_get_drvdata(dev);
> @@ -6962,7 +6701,6 @@ static __maybe_unused int e1000e_pm_suspend(struct 
> device *dev)
> struct net_device *netdev = pci_get_drvdata(to_pci_dev(dev));
> struct e1000_adapter *adapter = netdev_priv(netdev);
> struct pci_dev *pdev = to_pci_dev(dev);
> -   struct e1000_hw *hw = &adapter->hw;
> int rc;
>
> e1000e_flush_lpic(pdev);
> @@ -6974,8 +6712,7 @@ static __maybe_unused int e1000e_pm_suspend(struct 
> device *dev)
> e1000e_pm_thaw(dev);
>
> /* Introduce S0ix implementation */
> -   if (hw->mac.type >= e1000_pch_cnp &&
> -   !e1000e_check_me(hw->adapter->pdev->device))
> +   if (adapter->flags2 & FLAG2_ENABLE_S0IX_FLOWS)
> e1000e_s0ix_entry_flow(adapter);

So the placement of this code raises some issues. It isn't a problem
with your patch but a bug in the driver that needs to be addressed. I
am assuming you only need to perform this flow if you successfully
froze the part. However, this is doing it in all cases, even when the
freeze failed and e1000e_pm_thaw has already been called before this
code runs. This is something that should probably be an "else if"
rather than a separate if statement.
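A rough sketch of the "else if" arrangement, assuming the suspend path freezes the device, calls __e1000_shutdown(), and thaws on failure (illustrative only):

        e1000e_flush_lpic(pdev);

        e1000e_pm_freeze(dev);

        rc = __e1000_shutdown(pdev, false);
        if (rc) {
                e1000e_pm_thaw(dev);
        } else if (adapter->flags2 & FLAG2_ENABLE_S0IX_FLOWS) {
                /* Only run the S0ix entry flow if the shutdown succeeded. */
                e1000e_s0ix_entry_flow(adapter);
        }

        return rc;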

>
> return rc;
> @@ -6986,12 +6723,10 @@ static __maybe_unused int e1000e_pm_resume(struct 
> device *dev)
> struct net_device *netdev = pci_get_drvdata(to_pci_dev(dev));
> struct e1000_adapter *adapter = netdev_priv(netdev);
> struct pci_dev *pdev = to_pci_dev(dev);
> -   struct e1000_hw *hw = &adapter->hw;
> int rc;
>
> /* Introduce S0ix implementation */
> -   if (hw->mac.type >= e1000_pch_cnp &&
> -   !e1000e_check_me(hw->adapter->pdev->device))
> +   if (adapter->flags2 & FLAG2_ENABLE_S0IX_FLOWS)
> e1000e_s0ix_exit_flow(adapter);
>
> rc = __e1000_resume(pdev);
> @@ -7655,6 +7390,9 @@ static int e1000_probe(struct pci_dev *pdev, const 
> struct pci_device_id *ent)
> if (!(adapter->flags & FLAG_HAS_AMT))
> e1000e_get_hw_control(adapter);
>
> +   /* use heuristics to decide whether to enable s0ix flows */
> +   e1000e_maybe_enable_s0ix(adapter);
> +
> strlcpy(netdev->name, "eth%d", sizeof(netdev->name));
> err = 

Re: [PATCH v4 net-next 00/13] Add ethtool ntuple filters support

2020-11-14 Thread Alexander Duyck
On Sat, Nov 14, 2020 at 11:53 AM Naveen Mamindlapalli
 wrote:
>
> This patch series adds support for ethtool ntuple filters, unicast
> address filtering, VLAN offload and SR-IOV ndo handlers. All of the
> above features are based on the Admin Function(AF) driver support to
> install and delete the low level MCAM entries. Each MCAM entry is
> programmed with the packet fields to match and what actions to take
> if the match succeeds. The PF driver requests AF driver to allocate
> set of MCAM entries to be used to install the flows by that PF. The
> entries will be freed when the PF driver is unloaded.
>
> * The patches 1 to 4 adds AF driver infrastructure to install and
>   delete the low level MCAM flow entries.
> * Patch 5 adds ethtool ntuple filter support.
> * Patch 6 adds unicast MAC address filtering.
> * Patch 7 adds support for dumping the MCAM entries via debugfs.
> * Patches 8 to 10 adds support for VLAN offload.
> * Patch 10 to 11 adds support for SR-IOV ndo handlers.
> * Patch 12 adds support to read the MCAM entries.
>
> Misc:
> * Removed redundant mailbox NIX_RXVLAN_ALLOC.
>
> Change-log:
> v4:
> - Fixed review comments from Alexander Duyck on v3.
> - Added macros for KEX profile configuration values.
> - TCP/UDP SPORT+DPORT extracted using single entry.
> - Use eth_broadcast_addr() instead of memcpy to avoid one extra 
> variable.
> - Fix "ether type" to "Ethertype" & "meta data" to "metadata" in 
> comments.
> - Added more comments.
> v3:
> - Fixed Saeed's review comments on v2.
> - Fixed modifying the netdev->flags from driver.
> - Fixed modifying the netdev features and hw_features after 
> register_netdev.
> - Removed unwanted ndo_features_check callback.
> v2:
> - Fixed the sparse issues reported by Jakub.

All of the fixes look like they are in place.

Reviewed-by: Alexander Duyck 


Re: [PATCH v3 net-next 04/13] octeontx2-af: Add mbox messages to install and delete MCAM rules

2020-11-12 Thread Alexander Duyck
On Tue, Nov 10, 2020 at 11:22 PM Naveen Mamindlapalli
 wrote:
>
> From: Subbaraya Sundeep 
>
> Added new mailbox messages to install and delete MCAM rules.
> These mailbox messages will be used for adding/deleting ethtool
> n-tuple filters by NIX PF. The installed MCAM rules are stored
> in a list that will be traversed later to delete the MCAM entries
> when the interface is brought down or when PCIe FLR is received.
> The delete mailbox supports deleting a single MCAM entry or range
> of entries or all the MCAM entries owned by the pcifunc. Each MCAM
> entry can be associated with a HW match stat entry if the mailbox
> requester wants to check the hit count for debugging.
>
> Modified adding default unicast DMAC match rule using install
> flow API. The default unicast DMAC match entry installed by
> Administrative Function is saved and can be changed later by the
> mailbox user to fit additional fields, or the default MCAM entry
> rule action can be used for other flow rules installed later.
>
> Modified rvu_mbox_handler_nix_lf_free mailbox to add a flag to
> disable or delete the MCAM entries. The MCAM entries are disabled
> when the interface is brought down and deleted in FLR handler.
> The disabled MCAM entries will be re-enabled when the interface
> is brought up again.
>
> Signed-off-by: Subbaraya Sundeep 
> Signed-off-by: Sunil Goutham 
> Signed-off-by: Naveen Mamindlapalli 

A couple minor issues to address, called out in comments below.

> ---
>  drivers/net/ethernet/marvell/octeontx2/af/common.h |   2 +
>  drivers/net/ethernet/marvell/octeontx2/af/mbox.h   |  76 ++-
>  drivers/net/ethernet/marvell/octeontx2/af/npc.h|  57 +-
>  drivers/net/ethernet/marvell/octeontx2/af/rvu.h|  13 +
>  .../net/ethernet/marvell/octeontx2/af/rvu_nix.c|  19 +-
>  .../net/ethernet/marvell/octeontx2/af/rvu_npc.c| 217 ++-
>  .../net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c | 721 
> +
>  .../net/ethernet/marvell/octeontx2/nic/otx2_pf.c   |  12 +-
>  8 files changed, 1065 insertions(+), 52 deletions(-)
>



> diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c 
> b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
> index eb4eaa7ece3a..a7759ecfa586 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
> +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
> @@ -219,7 +219,7 @@ static int npc_get_nixlf_mcam_index(struct npc_mcam *mcam,
> return npc_get_ucast_mcam_index(mcam, pcifunc, nixlf);
>  }
>
> -static int npc_get_bank(struct npc_mcam *mcam, int index)
> +int npc_get_bank(struct npc_mcam *mcam, int index)
>  {
> int bank = index / mcam->banksize;
>
> @@ -241,8 +241,8 @@ static bool is_mcam_entry_enabled(struct rvu *rvu, struct 
> npc_mcam *mcam,
> return (cfg & 1);
>  }
>
> -static void npc_enable_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam,
> - int blkaddr, int index, bool enable)
> +void npc_enable_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam,
> +  int blkaddr, int index, bool enable)
>  {
> int bank = npc_get_bank(mcam, index);
> int actbank = bank;
> @@ -359,6 +359,41 @@ static void npc_get_keyword(struct mcam_entry *entry, 
> int idx,
> *cam0 = ~*cam1 & kw_mask;
>  }
>
> +static void npc_get_default_entry_action(struct rvu *rvu, struct npc_mcam 
> *mcam,
> +int blkaddr, int index,
> +struct mcam_entry *entry)
> +{
> +   u16 owner, target_func;
> +   struct rvu_pfvf *pfvf;
> +   int bank, nixlf;
> +   u64 rx_action;
> +
> +   owner = mcam->entry2pfvf_map[index];
> +   target_func = (entry->action >> 4) & 0xffff;
> +   /* return incase target is PF or LBK or rule owner is not PF */
> +   if (is_afvf(target_func) || (owner & RVU_PFVF_FUNC_MASK) ||
> +   !(target_func & RVU_PFVF_FUNC_MASK))
> +   return;
> +
> +   pfvf = rvu_get_pfvf(rvu, target_func);
> +   mcam->entry2target_pffunc[index] = target_func;
> +   /* return if nixlf is not attached or initialized */
> +   if (!is_nixlf_attached(rvu, target_func) || !pfvf->def_ucast_rule)
> +   return;
> +
> +   /* get VF ucast entry rule */
> +   nix_get_nixlf(rvu, target_func, &nixlf, NULL);
> +   index = npc_get_nixlf_mcam_index(mcam, target_func,
> +nixlf, NIXLF_UCAST_ENTRY);
> +   bank = npc_get_bank(mcam, index);
> +   index &= (mcam->banksize - 1);
> +
> +   rx_action = rvu_read64(rvu, blkaddr,
> +  NPC_AF_MCAMEX_BANKX_ACTION(index, bank));
> +   if (rx_action)
> +   entry->action = rx_action;
> +}
> +
>  static void npc_config_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam,
>   int blkaddr, int index, u8 intf,
>   struct mcam_entry *entry, bool 

Re: [PATCH v3 net-next 03/13] octeontx2-af: Generate key field bit mask from KEX profile

2020-11-12 Thread Alexander Duyck
On Tue, Nov 10, 2020 at 11:24 PM Naveen Mamindlapalli
 wrote:
>
> From: Subbaraya Sundeep 
>
> Key Extraction(KEX) profile decides how the packet metadata such as
> layer information and selected packet data bytes at each layer are
> placed in MCAM search key. This patch reads the configured KEX profile
> parameters to find out the bit position and bit mask for each field.
> The information is used when programming the MCAM match data by SW
> to match a packet flow and take appropriate action on the flow. This
> patch also verifies the mandatory fields such as channel and DMAC
> are not overwritten by the KEX configuration of other fields.
>
> Signed-off-by: Subbaraya Sundeep 
> Signed-off-by: Sunil Goutham 
> Signed-off-by: Naveen Mamindlapalli 

A few minor spelling issues, otherwise it looks fine.

Reviewed-by: Alexander Duyck 

> ---
>  drivers/net/ethernet/marvell/octeontx2/af/Makefile |   2 +-
>  drivers/net/ethernet/marvell/octeontx2/af/npc.h|  48 ++
>  drivers/net/ethernet/marvell/octeontx2/af/rvu.h|  38 ++
>  .../net/ethernet/marvell/octeontx2/af/rvu_npc.c|  11 +-
>  .../net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c | 562 
> +
>  5 files changed, 658 insertions(+), 3 deletions(-)
>  create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c
>
> diff --git a/drivers/net/ethernet/marvell/octeontx2/af/Makefile 
> b/drivers/net/ethernet/marvell/octeontx2/af/Makefile
> index 2f7a861d0c7b..ffc681b67f1c 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/af/Makefile
> +++ b/drivers/net/ethernet/marvell/octeontx2/af/Makefile
> @@ -9,4 +9,4 @@ obj-$(CONFIG_OCTEONTX2_AF) += octeontx2_af.o
>
>  octeontx2_mbox-y := mbox.o rvu_trace.o
>  octeontx2_af-y := cgx.o rvu.o rvu_cgx.o rvu_npa.o rvu_nix.o \
> - rvu_reg.o rvu_npc.o rvu_debugfs.o ptp.o
> + rvu_reg.o rvu_npc.o rvu_debugfs.o ptp.o rvu_npc_fs.o
> diff --git a/drivers/net/ethernet/marvell/octeontx2/af/npc.h 
> b/drivers/net/ethernet/marvell/octeontx2/af/npc.h
> index 91a9d00e4fb5..0fe47216f771 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/af/npc.h
> +++ b/drivers/net/ethernet/marvell/octeontx2/af/npc.h
> @@ -140,6 +140,54 @@ enum npc_kpu_lh_ltype {
> NPC_LT_LH_CUSTOM1 = 0xF,
>  };
>
> +/* list of known and supported fields in packet header and
> + * fields present in key structure.
> + */
> +enum key_fields {
> +   NPC_DMAC,
> +   NPC_SMAC,
> +   NPC_ETYPE,
> +   NPC_OUTER_VID,
> +   NPC_TOS,
> +   NPC_SIP_IPV4,
> +   NPC_DIP_IPV4,
> +   NPC_SIP_IPV6,
> +   NPC_DIP_IPV6,
> +   NPC_SPORT_TCP,
> +   NPC_DPORT_TCP,
> +   NPC_SPORT_UDP,
> +   NPC_DPORT_UDP,
> +   NPC_SPORT_SCTP,
> +   NPC_DPORT_SCTP,
> +   NPC_HEADER_FIELDS_MAX,
> +   NPC_CHAN = NPC_HEADER_FIELDS_MAX, /* Valid when Rx */
> +   NPC_PF_FUNC, /* Valid when Tx */
> +   NPC_ERRLEV,
> +   NPC_ERRCODE,
> +   NPC_LXMB,
> +   NPC_LA,
> +   NPC_LB,
> +   NPC_LC,
> +   NPC_LD,
> +   NPC_LE,
> +   NPC_LF,
> +   NPC_LG,
> +   NPC_LH,
> +   /* ether type for untagged frame */
> +   NPC_ETYPE_ETHER,
> +   /* ether type for single tagged frame */
> +   NPC_ETYPE_TAG1,
> +   /* ether type for double tagged frame */
> +   NPC_ETYPE_TAG2,
> +   /* outer vlan tci for single tagged frame */
> +   NPC_VLAN_TAG1,
> +   /* outer vlan tci for double tagged frame */
> +   NPC_VLAN_TAG2,
> +   /* other header fields programmed to extract but not of our interest 
> */
> +   NPC_UNKNOWN,
> +   NPC_KEY_FIELDS_MAX,
> +};
> +
>  struct npc_kpu_profile_cam {
> u8 state;
> u8 state_mask;
> diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h 
> b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
> index 1724dbd18847..7e556c7b6ccf 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
> +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
> @@ -15,6 +15,7 @@
>  #include "rvu_struct.h"
>  #include "common.h"
>  #include "mbox.h"
> +#include "npc.h"
>
>  /* PCI device IDs */
>  #definePCI_DEVID_OCTEONTX2_RVU_AF  0xA065
> @@ -105,6 +106,36 @@ struct nix_mce_list {
> int max;
>  };
>
> +/* layer meta data to uniquely identify a packet header field */

s/meta data/metadata/

> +struct npc_layer_mdata {
> +   u8 lid;
> +   u8 ltype;
> +   u8 hdr;
> +   u8 key;
> +   u8 len;
> +};
> +



> +   /* Handle header fields which can come from multiple layers like
> + 

Re: [PATCH v3 net-next 02/13] octeontx2-af: Verify MCAM entry channel and PF_FUNC

2020-11-12 Thread Alexander Duyck
On Tue, Nov 10, 2020 at 11:18 PM Naveen Mamindlapalli
 wrote:
>
> From: Subbaraya Sundeep 
>
> This patch adds support to verify the channel number sent by
> mailbox requester before writing MCAM entry for Ingress packets.
> Similarly for Egress packets, verifying the PF_FUNC sent by the
> mailbox user.
>
> Signed-off-by: Subbaraya Sundeep 
> Signed-off-by: Kiran Kumar K 
> Signed-off-by: Sunil Goutham 
> Signed-off-by: Naveen Mamindlapalli 

One minor nit below. Otherwise looks good to me.

Reviewed-by: Alexander Duyck 

> ---
>  drivers/net/ethernet/marvell/octeontx2/af/rvu.c|  4 +-
>  drivers/net/ethernet/marvell/octeontx2/af/rvu.h|  2 +
>  .../net/ethernet/marvell/octeontx2/af/rvu_npc.c| 78 
> ++
>  3 files changed, 82 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c 
> b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
> index a28a518c0eae..e8b5aaf73201 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
> +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
> @@ -2642,7 +2642,7 @@ static void rvu_enable_afvf_intr(struct rvu *rvu)
>
>  #define PCI_DEVID_OCTEONTX2_LBK 0xA061
>
> -static int lbk_get_num_chans(void)
> +int rvu_get_num_lbk_chans(void)
>  {
> struct pci_dev *pdev;
> void __iomem *base;
> @@ -2677,7 +2677,7 @@ static int rvu_enable_sriov(struct rvu *rvu)
> return 0;
> }
>
> -   chans = lbk_get_num_chans();
> +   chans = rvu_get_num_lbk_chans();
> if (chans < 0)
> return chans;
>
> diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h 
> b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
> index 5ac9bb12415f..1724dbd18847 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
> +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
> @@ -445,6 +445,7 @@ int rvu_get_lf(struct rvu *rvu, struct rvu_block *block, 
> u16 pcifunc, u16 slot);
>  int rvu_lf_reset(struct rvu *rvu, struct rvu_block *block, int lf);
>  int rvu_get_blkaddr(struct rvu *rvu, int blktype, u16 pcifunc);
>  int rvu_poll_reg(struct rvu *rvu, u64 block, u64 offset, u64 mask, bool 
> zero);
> +int rvu_get_num_lbk_chans(void);
>
>  /* RVU HW reg validation */
>  enum regmap_block {
> @@ -535,6 +536,7 @@ bool is_npc_intf_tx(u8 intf);
>  bool is_npc_intf_rx(u8 intf);
>  bool is_npc_interface_valid(struct rvu *rvu, u8 intf);
>  int rvu_npc_get_tx_nibble_cfg(struct rvu *rvu, u64 nibble_ena);
> +int npc_mcam_verify_channel(struct rvu *rvu, u16 pcifunc, u8 intf, u16 
> channel);
>
>  #ifdef CONFIG_DEBUG_FS
>  void rvu_dbg_init(struct rvu *rvu);
> diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c 
> b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
> index 989533a3d2ce..3666159bb6b6 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
> +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
> @@ -28,6 +28,8 @@
>
>  #define NPC_PARSE_RESULT_DMAC_OFFSET   8
>  #define NPC_HW_TSTAMP_OFFSET   8
> +#define NPC_KEX_CHAN_MASK  0xFFFULL
> +#define NPC_KEX_PF_FUNC_MASK   0xFFFFULL
>
>  static const char def_pfl_name[] = "default";
>
> @@ -63,6 +65,54 @@ int rvu_npc_get_tx_nibble_cfg(struct rvu *rvu, u64 
> nibble_ena)
> return 0;
>  }
>
> +static int npc_mcam_verify_pf_func(struct rvu *rvu,
> +  struct mcam_entry *entry_data, u8 intf,
> +  u16 pcifunc)
> +{
> +   u16 pf_func, pf_func_mask;
> +
> +   if (is_npc_intf_rx(intf))
> +   return 0;
> +
> +   pf_func_mask = (entry_data->kw_mask[0] >> 32) &
> +   NPC_KEX_PF_FUNC_MASK;
> +   pf_func = (entry_data->kw[0] >> 32) & NPC_KEX_PF_FUNC_MASK;
> +
> +   pf_func = be16_to_cpu((__force __be16)pf_func);
> +   if (pf_func_mask != NPC_KEX_PF_FUNC_MASK ||
> +   ((pf_func & ~RVU_PFVF_FUNC_MASK) !=
> +(pcifunc & ~RVU_PFVF_FUNC_MASK)))
> +   return -EINVAL;
> +
> +   return 0;
> +}
> +
> +int npc_mcam_verify_channel(struct rvu *rvu, u16 pcifunc, u8 intf, u16 
> channel)
> +{
> +   int pf = rvu_get_pf(pcifunc);
> +   u8 cgx_id, lmac_id;
> +   int base = 0, end;
> +
> +   if (is_npc_intf_tx(intf))
> +   return 0;
> +
> +   if (is_afvf(pcifunc)) {
> +   end = rvu_get_num_lbk_chans();
> +   if (end < 0)
> +   return -EINVAL;
> +   } else {
> +   rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id,
> &lmac_id);
> + 

Re: [PATCH v3 net-next 01/13] octeontx2-af: Modify default KEX profile to extract TX packet fields

2020-11-12 Thread Alexander Duyck
On Tue, Nov 10, 2020 at 11:22 PM Naveen Mamindlapalli
 wrote:
>
> From: Stanislaw Kardach 
>
> The current default Key Extraction(KEX) profile can only use RX
> packet fields while generating the MCAM search key. The profile
> can't be used for matching TX packet fields. This patch modifies
> the default KEX profile to add support for extracting TX packet
> fields into MCAM search key. Enabled Tx KPU packet parsing by
> configuring TX PKIND in tx_parse_cfg.
>
> Also modified the default KEX profile to extract VLAN TCI from
> the LB_PTR and exact byte offset of VLAN header. The NPC KPU
> parser was modified to point LB_PTR to the starting byte offset
> of VLAN header which points to the tpid field.
>
> Signed-off-by: Stanislaw Kardach 
> Signed-off-by: Sunil Goutham 
> Signed-off-by: Naveen Mamindlapalli 

A bit more documentation would be useful. However other than that the
code itself appears to make sense.

Reviewed-by: Alexander Duyck 

> ---
>  .../ethernet/marvell/octeontx2/af/npc_profile.h| 71 
> --
>  .../net/ethernet/marvell/octeontx2/af/rvu_nix.c|  6 ++
>  2 files changed, 72 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/net/ethernet/marvell/octeontx2/af/npc_profile.h 
> b/drivers/net/ethernet/marvell/octeontx2/af/npc_profile.h
> index 199448610e3e..c5b13385c81d 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/af/npc_profile.h
> +++ b/drivers/net/ethernet/marvell/octeontx2/af/npc_profile.h
> @@ -13386,8 +13386,8 @@ static struct npc_mcam_kex npc_mkex_default = {
> .kpu_version = NPC_KPU_PROFILE_VER,
> .keyx_cfg = {
> /* nibble: LA..LE (ltype only) + Channel */
> -   [NIX_INTF_RX] = ((u64)NPC_MCAM_KEY_X2 << 32) | 0x49247,
> -   [NIX_INTF_TX] = ((u64)NPC_MCAM_KEY_X2 << 32) | ((1ULL << 19) 
> - 1),
> +   [NIX_INTF_RX] = ((u64)NPC_MCAM_KEY_X2 << 32) | 0x249207,
> +   [NIX_INTF_TX] = ((u64)NPC_MCAM_KEY_X2 << 32) | 0x249200,
> },
> .intf_lid_lt_ld = {
> /* Default RX MCAM KEX profile */
Any sort of explanation for what some of these magic numbers mean
might be useful. I'm left wondering if the lower 32b is a bitfield or
a fixed value. I am guessing it is a bitfield based on the fact that it was
originally being set using ((1ULL << X) - 1); however, if there were
macros defined for each bit explaining what each bit was, that would be
useful.
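To make the suggestion concrete, something along these lines is what was meant; the macro names and bit positions below are made up purely for illustration and would need to come from the KEX documentation:

/* Hypothetical names for the parse-nibble enable bits. */
#define NPC_PARSE_NIBBLE_CHAN           GENMASK_ULL(2, 0)
#define NPC_PARSE_NIBBLE_LA_LTYPE       BIT_ULL(3)
#define NPC_PARSE_NIBBLE_LB_LTYPE       BIT_ULL(6)
/* ... one define per extracted nibble ... */

#define NPC_PARSE_NIBBLE_INTF_RX        (NPC_PARSE_NIBBLE_CHAN | \
                                         NPC_PARSE_NIBBLE_LA_LTYPE | \
                                         NPC_PARSE_NIBBLE_LB_LTYPE)

        [NIX_INTF_RX] = ((u64)NPC_MCAM_KEY_X2 << 32) | NPC_PARSE_NIBBLE_INTF_RX,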

> @@ -13405,12 +13405,14 @@ static struct npc_mcam_kex npc_mkex_default = {
> /* Layer B: Single VLAN (CTAG) */
> /* CTAG VLAN[2..3] + Ethertype, 4 bytes, KW0[63:32] */
> [NPC_LT_LB_CTAG] = {
> -   KEX_LD_CFG(0x03, 0x0, 0x1, 0x0, 0x4),
> +   KEX_LD_CFG(0x03, 0x2, 0x1, 0x0, 0x4),
> },

Similarly, some explanation of KEX_LD_CFG would be useful here. From
what I can tell it seems like this may be some sort of fix, as you are
adjusting the "hdr_ofs" field from 0 to 2.
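For readers of this thread, a commented reading of that entry, assuming the KEX_LD_CFG arguments are (bytesm1, hdr_ofs, ena, flags_ena, key_ofs):

        /* Extract (bytesm1 + 1) = 4 bytes starting at header offset 2,
         * i.e. the VLAN TCI plus the following Ethertype after the TPID,
         * into key byte offset 4, which lands in KW0[63:32].
         */
        [NPC_LT_LB_CTAG] = {
                KEX_LD_CFG(0x03, 0x2, 0x1, 0x0, 0x4),
        },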

> /* Layer B: Stacked VLAN (STAG|QinQ) */
> [NPC_LT_LB_STAG_QINQ] = {
> -   /* CTAG VLAN[2..3] + Ethertype, 4 bytes, 
> KW0[63:32] */
> -   KEX_LD_CFG(0x03, 0x4, 0x1, 0x0, 0x4),
> +   /* Outer VLAN: 2 bytes, KW0[63:48] */
> +   KEX_LD_CFG(0x01, 0x2, 0x1, 0x0, 0x6),
> +   /* Ethertype: 2 bytes, KW0[47:32] */
> +   KEX_LD_CFG(0x01, 0x8, 0x1, 0x0, 0x4),

Just to confirm, are you matching up the outer VLAN with the inner
Ethertype here? It seems like an odd combination. I assume you need
the inner ethertype in order to identify the L3 traffic?

> },
> [NPC_LT_LB_FDSA] = {
> /* SWITCH PORT: 1 byte, KW0[63:48] */
> @@ -13450,6 +13452,65 @@ static struct npc_mcam_kex npc_mkex_default = {
> },
> },
> },
> +
> +   /* Default TX MCAM KEX profile */
> +   [NIX_INTF_TX] = {
> +   [NPC_LID_LA] = {
> +   /* Layer A: Ethernet: */
> +   [NPC_LT_LA_IH_NIX_ETHER] = {
> +   /* PF_FUNC: 2B , KW0 [47:32] */
> +   KEX_LD_CFG(0x01, 0x0, 0x1, 0x0, 0x4),

I'm assuming you have an 8B internal header that is being parsed? A
comment explaining that this is parsing a preamble that is at the
start of things might be useful.

> +   /* DMAC: 6 bytes, KW1[63:16] */
> +   KEX_LD_CFG

Re: [PATCH V7 5/5] platform/x86: Intel PMT Crashlog capability driver

2020-10-01 Thread Alexander Duyck
On Thu, Oct 1, 2020 at 11:47 AM Andy Shevchenko
 wrote:
>
> On Thu, Oct 1, 2020 at 9:33 PM Alexander Duyck
>  wrote:
> > On Thu, Oct 1, 2020 at 9:37 AM Andy Shevchenko
> >  wrote:
> > > On Thu, Oct 1, 2020 at 4:43 AM David E. Box  
> > > wrote:
>
> ...
>
> > Arguably not much. I'll drop the comment.
> >
> > > > +   control &= ~(CRASHLOG_FLAG_MASK | CRASHLOG_FLAG_DISABLE);
> > >
> > > How does the second constant play any role here?
> >
> > The "control" flags are bits 28-31, while the disable flag is bit 27
> > if I recall.
>
> Okay, then it adds more confusion to the same comment here and there.
> Good you are about to drop the comment.
>
> > Specifically bit 31 is read only, bit 28 will clear bit 31, bit 29
> > will cause the crashlog to be generated and set bit 31, and bit 30 is
> > just reserved 0.
>
> Can this be added as a comment somewhere in the code?

I'll do that with the definitions themselves.
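
Roughly like this, as a sketch of the layout I described (CLEAR and
COMPLETE are just my working names for the bits that don't have a
define in the patch yet):

    /* control register layout, bits 28-31 are the control/status flags */
    #define CRASHLOG_FLAG_DISABLE     BIT(27) /* disable the crashlog capability */
    #define CRASHLOG_FLAG_CLEAR       BIT(28) /* write 1 to clear the complete bit */
    #define CRASHLOG_FLAG_EXECUTE     BIT(29) /* write 1 to trigger, sets complete */
    /* bit 30 is reserved and must be written as 0 */
    #define CRASHLOG_FLAG_COMPLETE    BIT(31) /* read-only, crashlog has been recorded */

    #define CRASHLOG_FLAG_MASK        GENMASK(31, 28)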

> ...
>
> > > > +   ret = intel_pmt_dev_create(entry, _crashlog_ns, parent);
> > > > +   if (!ret)
> > > > +   return 0;
>
> (2)
>
> > > > +
> > > > +   dev_err(parent, "Failed to add crashlog controls\n");
> > > > +   intel_pmt_dev_destroy(entry, _crashlog_ns);
> > > > +
> > > > +   return ret;
> > >
> > > Can we use traditional patterns?
> > > if (ret) {
> > >   ...
> > > }
> > > return ret;
> >
> > I can switch it if that is preferred.
>
> Yes, please. The (2) is really hard to parse (easy to miss ! part and
> be confused by return 0 one).
>
> ...
>
> > > Are you going to duplicate this in each driver? Consider to refactor
> > > to avoid duplication of a lot of code.
> >
> > So the issue lies in the complexity of pmt_telem_add_entry versus
> > pmt_crashlog_add_entry. Specifically I end up needing disc_res and the
> > discovery table when I go to create the controls for the crashlog
> > device. Similarly we have a third device that we plan to add called a
> > watcher which will require us to keep things split up like this so we
> > thought it best to split it up this way.
>
> Could you revisit and think how this can be deduplicated. I see at
> least one variant with a hooks (callbacks) which you supply depending
> on the driver, but the for-loop is kept in one place.

I'll see what I can do.
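
One rough shape that comes to mind (only a sketch, not necessarily
what will land) is to push the per-driver differences behind the
existing namespace struct and keep the resource for-loop in the class
driver:

    /*
     * Sketch: extend intel_pmt_namespace with a per-driver hook so the
     * probe loop over the platform device resources can live in the
     * class driver. The callback name and signature here are made up.
     */
    struct intel_pmt_namespace {
            const char *name;
            struct xarray *xa;
            const struct attribute_group *attr_grp;
            int (*pmt_header_decode)(struct intel_pmt_entry *entry,
                                     struct device *dev);
    };

The telemetry and crashlog drivers would then only supply the header
decode and their sysfs attributes.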

> ...
>
> > > > +   .name   = DRV_NAME,
> > >
> > > > +MODULE_ALIAS("platform:" DRV_NAME);
> > >
> > > I'm not sure I have interpreted this:
> > > - Use 'raw' string instead of defines for device names
> > > correctly. Can you elaborate?
> >
> > Again I am not sure what this is in reference to. If you can point me
> > to some documentation somewhere I can take a look.
>
> Reference to your own changelog of this series!

So the issue is that we have two authors, so it is a matter of keeping
track of who is working on what.

So apparently that was in reference to the MFD driver, which was
instantiating the devices using defines that were only used in one
spot. The reason I was confused is that the commit message had nothing
to do with this patch, and I haven't really done any work on the MFD
driver myself. The link to the 'raw'
discussion can be found here:
https://lore.kernel.org/lkml/20200728075859.GH1850026@dell/


Re: [PATCH V7 3/5] platform/x86: Intel PMT class driver

2020-10-01 Thread Alexander Duyck
On Thu, Oct 1, 2020 at 11:06 AM Andy Shevchenko
 wrote:
>
> On Thu, Oct 1, 2020 at 8:44 PM Alexander Duyck
>  wrote:
> > On Thu, Oct 1, 2020 at 9:26 AM Andy Shevchenko
> >  wrote:
> > > On Thu, Oct 1, 2020 at 4:43 AM David E. Box  
> > > wrote:
>
> ...
>
> > > > Intel Platform Monitoring Technology is meant to provide a common way to
> > > > access telemetry and system metrics.
> > > >
> > > > Register mappings are not provided by the driver. Instead, a GUID is 
> > > > read
> > > > from a header for each endpoint. The GUID identifies the device and is 
> > > > to
> > > > be used with an XML, provided by the vendor, to discover the available 
> > > > set
> > > > of metrics and their register mapping.  This allows firmware updates to
> > > > modify the register space without needing to update the driver every 
> > > > time
> > > > with new mappings. Firmware writes a new GUID in this case to specify 
> > > > the
> > > > new mapping.  Software tools with access to the associated XML file can
> > > > then interpret the changes.
> > >
> > > Where one may find a database of these reserved GUIDs / XMLs?
> > > How do you prevent a chaos which happens with other registries?
> >
> > The database will be posted on intel.com eventually. Although I don't
> > believe the URL is public yet.
>
> How can we be sure that this won't be forgotten? How can we be sure it
> will be public at the end? Please, elaborate this in the commit
> message.

Okay, I will work with David on that.

> ...
>
> > > > +static const struct pci_device_id pmt_telem_early_client_pci_ids[] = {
> > > > +   { PCI_VDEVICE(INTEL, 0x9a0d) }, /* TGL */
> > > > +   { }
> > > > +};
> > > > +bool intel_pmt_is_early_client_hw(struct device *dev)
> > > > +{
> > > > +   struct pci_dev *parent = to_pci_dev(dev->parent);
> > > > +
> > > > +   return !!pci_match_id(pmt_telem_early_client_pci_ids, parent);
> > > > +}
> > > > +EXPORT_SYMBOL_GPL(intel_pmt_is_early_client_hw);
> > >
> > > What is this and why is it in the class driver?
> >
> > I chose to use the class driver as a central place to store code
> > common to all of the instances of the class. In this case we have
> > quirks that are specific to Tiger Lake and so I chose to store the
> > function to test for the device here.
>
> Can it be done in another file module at least (let's say intel_pmt_quirks.c)?

I suppose, but then it is adding a file for essentially 13 lines of
code. Maybe I will just move this back to intel_pmt_telemetry.c and we
can revisit where this should go if/when we add the watcher driver.

> ...
>
> > > > +   if (off < 0)
> > > > +   return -EINVAL;
> > > Is this real or theoretical?
> >
> > Not sure. I am not that familiar with the interface. It was something
> > I copied from read_bmof, which I based this code on following an
> > earlier suggestion.
>
> I'm not a fan of cargo cult when there is no understanding why certain
> code appears in the driver.

Well, with something like this I usually question whether it provides
any value. The problem is I don't know enough about binary sysfs
attributes to say. If you know that the offset provided can never be
negative I can drop the check; I just haven't seen anything that
states it one way or the other.

> ...
>
> > > > +   if (count)
> > >
> > > Useless.
> >
> > I'm assuming that is because memcpy_fromio is assumed to handle this case?
>
> Right.
>
> > > > +   memcpy_fromio(buf, entry->base + off, count);
>
> ...
>
> > > > +   psize = (PFN_UP(entry->base_addr + entry->size) - pfn) * 
> > > > PAGE_SIZE;
> > >
> > > PFN_PHYS(PFN_UP(...)) ?
> >
> > I'm not sure how that would work. Basically what we are doing here is
> > determining the size of the mapping based on the number of pages that
> > will be needed. So we take the pfn of the start of the region,
> > subtract that from the pfn for the end of the region and multiply by
> > the size of a page.
>
> PFN_PHYS() is a replacement for multiplication. You may check its
> implementation.

Ah, okay so you meant PFN_PHYS(PFN_UP(...)-pfn)
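
So for the record the line would just become:

    /* before: open-coded page-size multiply */
    psize = (PFN_UP(entry->base_addr + entry->size) - pfn) * PAGE_SIZE;
    /* after: PFN_PHYS() does the shift for us */
    psize = PFN_PHYS(PFN_UP(entry->base_addr + entry->size) - pfn);

Same math, just expressed with the helper.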

> ...
>
> > > > +   /* if size is 0 assume no data buffer, so no file needed */
> > > > +   if (!entry->size)
> > 

Re: [PATCH V7 5/5] platform/x86: Intel PMT Crashlog capability driver

2020-10-01 Thread Alexander Duyck
On Thu, Oct 1, 2020 at 9:37 AM Andy Shevchenko
 wrote:
>
> On Thu, Oct 1, 2020 at 4:43 AM David E. Box  
> wrote:
> > Add support for the Intel Platform Monitoring Technology crashlog
> > interface. This interface provides a few sysfs values to allow for
> > controlling the crashlog telemetry interface as well as a character driver
> > to allow for mapping the crashlog memory region so that it can be accessed
> > after a crashlog has been recorded.
> >
> > This driver is meant to only support the server version of the crashlog
> > which is identified as crash_type 1 with a version of zero. Currently no
> > other types are supported.
>
> ...
>
> > +   The crashlog directory contains files for configuring an
> > +   instance of a PMT crashlog device that can perform crash 
> > data
> > +   recoring. Each crashlog device has an associated crashlog
>
> recording
>
> > +   file. This file can be opened and mapped or read to access 
> > the
> > +   resulting crashlog buffer. The register layout for the 
> > buffer
> > +   can be determined from an XML file of specified guid for the
> > +   parent device.
>
> ...
>
> > +   (RO) The guid for this crashlog device. The guid identifies 
> > the
>
> guid -> GUID
>
> Please, spell check all ABI files in this series.

I'll run through it again. I suspect I must have deleted the "d" in
"recording" by fat-fingering something while editing elsewhere.

> ...
>
> > +config INTEL_PMT_CRASHLOG
> > +   tristate "Intel Platform Monitoring Technology (PMT) Crashlog 
> > driver"
> > +   select INTEL_PMT_CLASS
> > +   help
> > + The Intel Platform Monitoring Technology (PMT) crashlog driver 
> > provides
> > + access to hardware crashlog capabilities on devices that support 
> > the
> > + feature.
>
> Name of the module?

I will add the verbiage:
  To compile this driver as a module, choose M here: the module
  will be called intel_pmt_crashlog.


> ...
>
> > +   /*
>
> > +* Currenty we only recognize OOBMSM version 0 devices.
>
> Currently. Please spell check all comments in the code.

I'll make another pass.

> > +* We can ignore all other crashlog devices in the system.
> > +*/
>
> ...
>
> > +   /* clear control bits */
>
> What new information readers get from this comment?

Arguably not much. I'll drop the comment.

> > +   control &= ~(CRASHLOG_FLAG_MASK | CRASHLOG_FLAG_DISABLE);
>
> How does the second constant play any role here?

The "control" flags are bits 28-31, while the disable flag is bit 27
if I recall.

Specifically bit 31 is read only, bit 28 will clear bit 31, bit 29
will cause the crashlog to be generated and set bit 31, and bit 30 is
just reserved 0.

> ...
>
> > +   /* clear control bits */
>
> Ditto. And moreover it's ambiguous due to joined two lines below.

I'll just drop the comments.

> > +   control &= ~CRASHLOG_FLAG_MASK;
> > +   control |= CRASHLOG_FLAG_EXECUTE;
>
> ...
>
> > +   return strnlen(buf, count);
>
> How is this different to count?

I guess they should be equivalent so I can probably just switch to count.
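
Just to sketch what I mean for the store path (the actual register
update is elided here):

    static ssize_t enable_store(struct device *dev, struct device_attribute *attr,
                                const char *buf, size_t count)
    {
            struct crashlog_entry *entry = dev_get_drvdata(dev);
            bool enable;
            int ret;

            ret = kstrtobool(buf, &enable);
            if (ret)
                    return ret;

            /* ... update the control register based on 'enable' ... */

            return count;   /* rather than strnlen(buf, count) */
    }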

> ...
>
> > +   struct crashlog_entry *entry;
> > +   bool trigger;
> > +   int result;
> > +
>
> > +   entry = dev_get_drvdata(dev);
>
> You may reduce LOCs by direct assigning in the definition block above.

Okay. I can move it if you prefer.

> ...
>
> > +   result = strnlen(buf, count);
>
> How is it different from count?

I'll switch it.

> ...
>
> > +static DEFINE_XARRAY_ALLOC(crashlog_array);
> > +static struct intel_pmt_namespace pmt_crashlog_ns = {
> > +   .name = "crashlog",
> > +   .xa = _array,
> > +   .attr_grp = _crashlog_group
>
> Leave the comma here.

Already fixed based on similar comments you had for the telemetry driver.. :-)

> > +};
>
> ...
>
> > +   ret = intel_pmt_dev_create(entry, _crashlog_ns, parent);
> > +   if (!ret)
> > +   return 0;
> > +
> > +   dev_err(parent, "Failed to add crashlog controls\n");
> > +   intel_pmt_dev_destroy(entry, _crashlog_ns);
> > +
> > +   return ret;
>
> Can we use traditional patterns?
> if (ret) {
>   ...
> }
> return ret;

I can switch it if that is preferred.
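
For reference, the tail of pmt_crashlog_add_entry() would then read
something like:

    ret = intel_pmt_dev_create(entry, &pmt_crashlog_ns, parent);
    if (ret) {
            dev_err(parent, "Failed to add crashlog controls\n");
            intel_pmt_dev_destroy(entry, &pmt_crashlog_ns);
    }

    return ret;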

> ...
>
> > +   size = offsetof(struct pmt_crashlog_priv, 
> > entry[pdev->num_resources]);
> > +   priv = devm_kzalloc(>dev, size, GFP_KERNEL);
> > +   if (!priv)
> > +   return -ENOMEM;
>
> struct_size()
>
> ...
>
> > +   /* initialize control mutex */
> > +   mutex_init(>entry[i].control_mutex);
> > +
> > +   disc_res = platform_get_resource(pdev, IORESOURCE_MEM, i);
> > +   if (!disc_res)
> > +   goto abort_probe;
> > +
> > +   ret = intel_pmt_ioremap_discovery_table(entry, pdev, i);
> > +   if (ret)
> > +   goto 

Re: [PATCH V7 3/5] platform/x86: Intel PMT class driver

2020-10-01 Thread Alexander Duyck
On Thu, Oct 1, 2020 at 9:26 AM Andy Shevchenko
 wrote:
>
> On Thu, Oct 1, 2020 at 4:43 AM David E. Box  
> wrote:
> >
> > From: Alexander Duyck 
> >
> > Intel Platform Monitoring Technology is meant to provide a common way to
> > access telemetry and system metrics.
> >
> > Register mappings are not provided by the driver. Instead, a GUID is read
> > from a header for each endpoint. The GUID identifies the device and is to
> > be used with an XML, provided by the vendor, to discover the available set
> > of metrics and their register mapping.  This allows firmware updates to
> > modify the register space without needing to update the driver every time
> > with new mappings. Firmware writes a new GUID in this case to specify the
> > new mapping.  Software tools with access to the associated XML file can
> > then interpret the changes.
>
> Where one may find a database of these reserved GUIDs / XMLs?
> How do you prevent a chaos which happens with other registries?

The database will be posted on intel.com eventually, although I don't
believe the URL is public yet.

> > The module manages access to all Intel PMT endpoints on a system,
> > independent of the device exporting them. It creates an intel_pmt class to
> > manage the devices. For each telemetry endpoint, sysfs files provide GUID
> > and size information as well as a pointer to the parent device the
> > telemetry came from. Software may discover the association between
> > endpoints and devices by iterating through the list in sysfs, or by looking
> > for the existence of the class folder under the device of interest.  A
> > binary sysfs attribute of the same name allows software to then read or map
> > the telemetry space for direct access.
>
> What are the security implications by direct access?

In this case minimal, as direct access would really be no different
from the read path. The registers in the memory regions themselves are
read-only with no read side effects.

> ...
>
> > +static const struct pci_device_id pmt_telem_early_client_pci_ids[] = {
> > +   { PCI_VDEVICE(INTEL, 0x9a0d) }, /* TGL */
> > +   { }
> > +};
> > +bool intel_pmt_is_early_client_hw(struct device *dev)
> > +{
> > +   struct pci_dev *parent = to_pci_dev(dev->parent);
> > +
> > +   return !!pci_match_id(pmt_telem_early_client_pci_ids, parent);
> > +}
> > +EXPORT_SYMBOL_GPL(intel_pmt_is_early_client_hw);
>
> What is this and why is it in the class driver?

I chose to use the class driver as a central place to store code
common to all of the instances of the class. In this case we have
quirks that are specific to Tiger Lake and so I chose to store the
function to test for the device here.

> > +static ssize_t
> > +intel_pmt_read(struct file *filp, struct kobject *kobj,
> > +  struct bin_attribute *attr, char *buf, loff_t off,
> > +  size_t count)
> > +{
> > +   struct intel_pmt_entry *entry = container_of(attr,
> > +struct intel_pmt_entry,
> > +pmt_bin_attr);
>
> > +   if (off < 0)
> > +   return -EINVAL;
> Is this real or theoretical?

Not sure. I am not that familiar with the interface. It was something
I copied from read_bmof, which I based this code on following an
earlier suggestion.

> > +   if (count)
>
> Useless.

I'm assuming that is because memcpy_fromio is assumed to handle this case?

> > +   memcpy_fromio(buf, entry->base + off, count);
> > +
> > +   return count;
> > +}
>
> ...
>
> > +   psize = (PFN_UP(entry->base_addr + entry->size) - pfn) * PAGE_SIZE;
>
> PFN_PHYS(PFN_UP(...)) ?

I'm not sure how that would work. Basically what we are doing here is
determining the size of the mapping based on the number of pages that
will be needed. So we take the pfn of the start of the region,
subtract that from the pfn for the end of the region and multiply by
the size of a page.

> > +static struct attribute *intel_pmt_attrs[] = {
> > +   _attr_guid.attr,
> > +   _attr_size.attr,
> > +   _attr_offset.attr,
> > +   NULL
> > +};
>
> > +
>
> Unneeded blank line.
>
> > +ATTRIBUTE_GROUPS(intel_pmt);
>
> ...
>
> > +   /* if size is 0 assume no data buffer, so no file needed */
> > +   if (!entry->size)
> > +   return 0;
>
> Hmm... But presence of the file is also an information that might be
> useful for user, no?

I'm not sure what you mean? If the size of the region is zero it means
there is 

Re: [PATCH V7 4/5] platform/x86: Intel PMT Telemetry capability driver

2020-10-01 Thread Alexander Duyck
On Thu, Oct 1, 2020 at 9:03 AM Andy Shevchenko
 wrote:
>
> On Thu, Oct 1, 2020 at 4:43 AM David E. Box  
> wrote:
> >
> > From: Alexander Duyck 
> >
> > PMT Telemetry is a capability of the Intel Platform Monitoring Technology.
> > The Telemetry capability provides access to device telemetry metrics that
> > provide hardware performance data to users from read-only register spaces.
> >
> > With this driver present the intel_pmt directory can be populated with
> > telem devices. These devices will contain the standard intel_pmt sysfs
> > data and a "telem" binary sysfs attribute which can be used to access the
> > telemetry data.
>
> ...
>
> > +static DEFINE_XARRAY_ALLOC(telem_array);
> > +static struct intel_pmt_namespace pmt_telem_ns = {
> > +   .name = "telem",
> > +   .xa = _array
>
> Leave comma at the end.
>
> > +};
> > +
> > +/*
> > + * driver initialization
> > + */
>
> This is a useless comment.
>
> > +   size = offsetof(struct pmt_telem_priv, entry[pdev->num_resources]);
> > +   priv = devm_kzalloc(>dev, size, GFP_KERNEL);
> > +   if (!priv)
> > +   return -ENOMEM;
>
> Please, use struct_size() from overflow.h instead of custom approach.
>
> ...

So all of the above make sense and can be fixed shortly and pushed as
a v8 for both the telemetry and crashlog drivers.
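
For the allocation sizing specifically, the change is just (shown here
for the telemetry probe; the crashlog one is the same shape):

    priv = devm_kzalloc(&pdev->dev,
                        struct_size(priv, entry, pdev->num_resources),
                        GFP_KERNEL);
    if (!priv)
            return -ENOMEM;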

> > +static struct platform_driver pmt_telem_driver = {
> > +   .driver = {
> > +   .name   = TELEM_DEV_NAME,
>
> I'm not sure I have interpreted this:
> - Use 'raw' string instead of defines for device names
> correctly. Can you elaborate?

Can you point me to a reference of that? I'm not sure what you are referring to.

> > +   },
> > +   .remove = pmt_telem_remove,
> > +   .probe  = pmt_telem_probe,
> > +};
>
> ...
>
> > +MODULE_ALIAS("platform:" TELEM_DEV_NAME);
>
> Ditto.

This doesn't make sense to me. Are you saying we are expected to use
"pmt_telemetry" everywhere instead of the define? It seems like that
would be much more error prone. It seems like common practice to use
DRV_NAME throughout a driver for this sort of thing, so if you want us
to rename it to that I am fine with that, but I am not sure getting
rid of the define makes sense.


Re: [PATCH V6 3/5] platform/x86: Intel PMT class driver

2020-09-29 Thread Alexander Duyck
On Mon, Sep 28, 2020 at 6:24 PM Randy Dunlap  wrote:
>
> On 9/28/20 5:53 PM, David E. Box wrote:
> > diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
> > index 40219bba6801..093c43b63f48 100644
> > --- a/drivers/platform/x86/Kconfig
> > +++ b/drivers/platform/x86/Kconfig
> > @@ -1360,6 +1360,15 @@ config INTEL_PMC_CORE
> >   - LTR Ignore
> >   - MPHY/PLL gating status (Sunrisepoint PCH only)
> >
> > +config INTEL_PMT_CLASS
> > + tristate "Intel Platform Monitoring Technology (PMT) Class driver"
> > + help
> > +   The Intel Platform Monitoring Technology (PMT) class driver provides
> > +   the basic sysfs interface and file heirarchy uses by PMT devices.
>
>  hierarchy
> No "heir" involved.

Knowing me I probably had class inheritance on the mind at the time
when I was writing it up and it was just a thinko.. :-)

Thanks for the review feedback. I'll work with David to make sure we
address the formatting/spelling issues in this patch and the crashlog
patch.

- Alex


Re: [patch 10/35] net: intel: Remove in_interrupt() warnings

2020-09-28 Thread Alexander Duyck
On Sun, Sep 27, 2020 at 1:00 PM Thomas Gleixner  wrote:
>
> From: Sebastian Andrzej Siewior 
>
> in_interrupt() is ill defined and does not provide what the name
> suggests. The usage especially in driver code is deprecated and a tree wide
> effort to clean up and consolidate the (ab)usage of in_interrupt() and
> related checks is happening.
>
> In this case the checks cover only parts of the contexts in which these
> functions cannot be called. They fail to detect preemption or interrupt
> disabled invocations.
>
> As the functions which are invoked from the various places contain already
> a broad variety of checks (always enabled or debug option dependent) cover
> all invalid conditions already, there is no point in having inconsistent
> warnings in those drivers.
>
> Just remove them.
>
> Signed-off-by: Sebastian Andrzej Siewior 
> Signed-off-by: Thomas Gleixner 
> Cc: Jeff Kirsher 
> Cc: "David S. Miller" 
> Cc: Jakub Kicinski 
> Cc: net...@vger.kernel.org

The patch looks good to me.

Reviewed-by: Alexander Duyck 

> ---
>  drivers/net/ethernet/intel/e1000/e1000_main.c |1 -
>  drivers/net/ethernet/intel/fm10k/fm10k_pci.c  |2 --
>  drivers/net/ethernet/intel/i40e/i40e_main.c   |4 
>  drivers/net/ethernet/intel/ice/ice_main.c |1 -
>  drivers/net/ethernet/intel/igb/igb_main.c |1 -
>  drivers/net/ethernet/intel/igc/igc_main.c |1 -
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |1 -
>  drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |2 --
>  8 files changed, 13 deletions(-)
>
> --- a/drivers/net/ethernet/intel/e1000/e1000_main.c
> +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
> @@ -534,7 +534,6 @@ void e1000_down(struct e1000_adapter *ad
>
>  void e1000_reinit_locked(struct e1000_adapter *adapter)
>  {
> -   WARN_ON(in_interrupt());
> while (test_and_set_bit(__E1000_RESETTING, >flags))
> msleep(1);
>
> --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
> +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
> @@ -221,8 +221,6 @@ static bool fm10k_prepare_for_reset(stru
>  {
> struct net_device *netdev = interface->netdev;
>
> -   WARN_ON(in_interrupt());
> -
> /* put off any impending NetWatchDogTimeout */
> netif_trans_update(netdev);
>
> --- a/drivers/net/ethernet/intel/i40e/i40e_main.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
> @@ -6689,7 +6689,6 @@ static void i40e_vsi_reinit_locked(struc
>  {
> struct i40e_pf *pf = vsi->back;
>
> -   WARN_ON(in_interrupt());
> while (test_and_set_bit(__I40E_CONFIG_BUSY, pf->state))
> usleep_range(1000, 2000);
> i40e_down(vsi);
> @@ -8462,9 +8461,6 @@ void i40e_do_reset(struct i40e_pf *pf, u
>  {
> u32 val;
>
> -   WARN_ON(in_interrupt());
> -
> -
> /* do the biggest reset indicated */
> if (reset_flags & BIT_ULL(__I40E_GLOBAL_RESET_REQUESTED)) {
>
> --- a/drivers/net/ethernet/intel/ice/ice_main.c
> +++ b/drivers/net/ethernet/intel/ice/ice_main.c
> @@ -486,7 +486,6 @@ static void ice_do_reset(struct ice_pf *
> struct ice_hw *hw = >hw;
>
> dev_dbg(dev, "reset_type 0x%x requested\n", reset_type);
> -   WARN_ON(in_interrupt());
>
> ice_prepare_for_reset(pf);
>
> --- a/drivers/net/ethernet/intel/igb/igb_main.c
> +++ b/drivers/net/ethernet/intel/igb/igb_main.c
> @@ -2220,7 +2220,6 @@ void igb_down(struct igb_adapter *adapte
>
>  void igb_reinit_locked(struct igb_adapter *adapter)
>  {
> -   WARN_ON(in_interrupt());
> while (test_and_set_bit(__IGB_RESETTING, >state))
> usleep_range(1000, 2000);
> igb_down(adapter);
> --- a/drivers/net/ethernet/intel/igc/igc_main.c
> +++ b/drivers/net/ethernet/intel/igc/igc_main.c
> @@ -3831,7 +3831,6 @@ void igc_down(struct igc_adapter *adapte
>
>  void igc_reinit_locked(struct igc_adapter *adapter)
>  {
> -   WARN_ON(in_interrupt());
> while (test_and_set_bit(__IGC_RESETTING, >state))
> usleep_range(1000, 2000);
> igc_down(adapter);
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> @@ -5677,7 +5677,6 @@ static void ixgbe_up_complete(struct ixg
>
>  void ixgbe_reinit_locked(struct ixgbe_adapter *adapter)
>  {
> -   WARN_ON(in_interrupt());
> /* put off any impending NetWatchDogTimeout */
> netif_trans_update(adapter->netdev);
>
> --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> @@ -2526,8 +2526,6 @@ void ixgbevf_down(struct ixgbevf_adapter
>
>  void ixgbevf_reinit_locked(struct ixgbevf_adapter *adapter)
>  {
> -   WARN_ON(in_interrupt());
> -
> while (test_and_set_bit(__IXGBEVF_RESETTING, >state))
> msleep(1);
>
>


Re: [PATCH 3/3] platform/x86: Intel PMT Crashlog capability driver

2020-09-21 Thread Alexander Duyck
On Mon, Sep 21, 2020 at 9:07 AM Alexey Budankov
 wrote:
>
>
> On 21.09.2020 16:36, Alexander Duyck wrote:
> > On Sat, Sep 19, 2020 at 1:01 AM Alexey Budankov
> >  wrote:
> >>
> >> Hi,
> >>
> >> Thanks for the patches.
> >>
> >> On 11.09.2020 22:45, David E. Box wrote:
> >>> From: Alexander Duyck 
> >>>
> >>> Add support for the Intel Platform Monitoring Technology crashlog
> >>> interface.  This interface provides a few sysfs values to allow for
> >>> controlling the crashlog telemetry interface as well as a character driver
> >>> to allow for mapping the crashlog memory region so that it can be accessed
> >>> after a crashlog has been recorded.
> >>>
> >>> This driver is meant to only support the server version of the crashlog
> >>> which is identified as crash_type 1 with a version of zero. Currently no
> >>> other types are supported.
> >>>
> >>> Signed-off-by: Alexander Duyck 
> >>> Signed-off-by: David E. Box 
> >>> ---
> >>>  .../ABI/testing/sysfs-class-pmt_crashlog  |  66 ++
> >>>  drivers/platform/x86/Kconfig  |  10 +
> >>>  drivers/platform/x86/Makefile |   1 +
> >>>  drivers/platform/x86/intel_pmt_crashlog.c | 588 ++
> >>>  4 files changed, 665 insertions(+)
> >>>  create mode 100644 Documentation/ABI/testing/sysfs-class-pmt_crashlog
> >>>  create mode 100644 drivers/platform/x86/intel_pmt_crashlog.c
> >>
> >> 
> >>
> >>> +
> >>> +/*
> >>> + * devfs
> >>> + */
> >>> +static int pmt_crashlog_open(struct inode *inode, struct file *filp)
> >>> +{
> >>> + struct crashlog_entry *entry;
> >>> + struct pci_driver *pci_drv;
> >>> + struct pmt_crashlog_priv *priv;
> >>> +
> >>> + if (!capable(CAP_SYS_ADMIN))
> >>> + return -EPERM;
> >>
> >> Will not this above still block access to /dev/crashlogX for admin_group 
> >> users
> >> in case root configured access e.g. similar to this:
> >>
> >> ls -alh /dev/
> >> crw-rw.  1 root admin_group  1,   9 Sep 15 18:28 crashlogX
> >>
> >> If yes then that capable() check is probably superfluous and
> >> should be avoided in order not to block access to PMT data.
> >>
> >> Could you please clarify or comment?
> >>
> >> Thanks,
> >> Alexei
> >
> > Actually this should probably be updated to "if (!perfmon_capable())"
> > instead. The telemetry driver code originally had the CAP_SYS_ADMIN
> > check and it probably makes more sense to limit this user-wise to the
> > same users who have access to perfmon.
>
> Indeed, it is currently perfmon_capable() for performance part but it is 
> unclear
> if it should be the same for crashlog since it's more like a debugging thing.
> It appears it all depends on usage models implemented in a user space tools 
> e.g. Perf.
>
> However there is an important use case that is not covered
> neither by perfmon_capable() nor by capable(CAP_SYS_ADMIN).
>
> It is access and usage of PMT features in cluster or cloud environments by
> unprivileged users that don't have root credentials. The users however can run
> software tools (Perf, VTune etc.) once installed and configured by root.
>
> Even though Perf tool can be configured to use use CAP_PERFMON [1] the tool 
> binary
> should still reside on a file system supporting xattr to convey capabilities
> into processes implementing monitoring.
>
> Unfortunately NFSv3 which is quite popular to be used for storing and sharing
> software tooling in large production systems doesn't support capabilities yet.
>
> Thus, capabilities approach still has limitation in HPC clusters and cloud 
> environments
> and for PMT support this limitation has a chance to be lifted if
> suitable access control mechanism would be designed from the very beggining.
>
> Actually I tried to change group ownership of /dev and /sys directories and 
> files, being root,
> and it appeared that for dev file it is possible:
> ls -alh /dev/
> crw-rw.  1 root admin_group  1,   9 Sep 15 18:28 telem
>
> So if e.g. perf tool having CAP_PERFMON and configured like:
>
> -rwxr-x---.  1 root admin_group  24M Mar  5  2020 perf.cap
>
> would mmap /dev/telem to provide uncore performance insights
> to admin_group users only access control based on user/group/others ownership
> would suffice without capabilities requirement.
>
> Still haven't had chance to verify it for memory mapped PMT dev files and
> that is why I am asking you guys here.
>
> Alexei
>
> [1] 
> https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html#privileged-perf-users-groups

We will have to see. There is a high likelihood this code will go away
if we switch over to binary sysfs attributes for the data. I'm still
working on the rewrite and hope to have something we can review as an
RFC in the next few days.

Thanks.

- Alex


Re: [PATCH 3/3] platform/x86: Intel PMT Crashlog capability driver

2020-09-21 Thread Alexander Duyck
On Mon, Sep 21, 2020 at 6:16 AM Hans de Goede  wrote:
>
> Hi,
>
> On 9/17/20 11:35 PM, Alexander Duyck wrote:
> > On Thu, Sep 17, 2020 at 5:12 AM Hans de Goede  wrote:
> >>
> >> Hi,
> >>
> >> On 9/15/20 12:35 AM, Alexander Duyck wrote:
> >>> On Mon, Sep 14, 2020 at 11:07 AM Alexander Duyck
> >>>  wrote:
> >>>>
> >>>> On Mon, Sep 14, 2020 at 6:42 AM Hans de Goede  
> >>>> wrote:
> >>>>>
> >>>>> Hi,
> >>>>>
> >>>>> On 9/11/20 9:45 PM, David E. Box wrote:
> >>>>>> From: Alexander Duyck 
> >>>>>>
> >>>>>> Add support for the Intel Platform Monitoring Technology crashlog
> >>>>>> interface.  This interface provides a few sysfs values to allow for
> >>>>>> controlling the crashlog telemetry interface as well as a character 
> >>>>>> driver
> >>>>>> to allow for mapping the crashlog memory region so that it can be 
> >>>>>> accessed
> >>>>>> after a crashlog has been recorded.
> >>>>>>
> >>>>>> This driver is meant to only support the server version of the crashlog
> >>>>>> which is identified as crash_type 1 with a version of zero. Currently 
> >>>>>> no
> >>>>>> other types are supported.
> >>>>>>
> >>>>>> Signed-off-by: Alexander Duyck 
> >>>>>> Signed-off-by: David E. Box 
> >>>>>> ---
> >>>>>> .../ABI/testing/sysfs-class-pmt_crashlog  |  66 ++
> >>>>>> drivers/platform/x86/Kconfig  |  10 +
> >>>>>> drivers/platform/x86/Makefile |   1 +
> >>>>>> drivers/platform/x86/intel_pmt_crashlog.c | 588 
> >>>>>> ++
> >>>>>> 4 files changed, 665 insertions(+)
> >>>>>> create mode 100644 
> >>>>>> Documentation/ABI/testing/sysfs-class-pmt_crashlog
> >>>>>> create mode 100644 drivers/platform/x86/intel_pmt_crashlog.c
> >>>>>>
> >>>>>> diff --git a/Documentation/ABI/testing/sysfs-class-pmt_crashlog 
> >>>>>> b/Documentation/ABI/testing/sysfs-class-pmt_crashlog
> >>>>>> new file mode 100644
> >>>>>> index ..40fb4ff437a6
> >>>>>> --- /dev/null
> >>>>>> +++ b/Documentation/ABI/testing/sysfs-class-pmt_crashlog
> >>>>>> @@ -0,0 +1,66 @@
> >>>>>> +What:/sys/class/pmt_crashlog/
> >>>>>> +Date:September 2020
> >>>>>> +KernelVersion:   5.10
> >>>>>> +Contact: Alexander Duyck 
> >>>>>> +Description:
> >>>>>> + The pmt_crashlog/ class directory contains information
> >>>>>> + for devices that expose crashlog capabilities using the 
> >>>>>> Intel
> >>>>>> + Platform Monitoring Technology (PTM).
> >>>>>> +
> >>>>>> +What:/sys/class/pmt_crashlog/crashlogX
> >>>>>> +Date:September 2020
> >>>>>> +KernelVersion:   5.10
> >>>>>> +Contact: Alexander Duyck 
> >>>>>> +Description:
> >>>>>> + The crashlogX directory contains files for configuring an
> >>>>>> + instance of a PMT crashlog device that can perform crash 
> >>>>>> data
> >>>>>> + recoring. Each crashlogX device has an associated
> >>>>>> + /dev/crashlogX device node. This node can be opened and 
> >>>>>> mapped
> >>>>>> + to access the resulting crashlog data. The register 
> >>>>>> layout for
> >>>>>> + the log can be determined from an XML file of specified 
> >>>>>> guid
> >>>>>> + for the parent device.
> >>>>>> +
> >>>>>> +What:/sys/class/pmt_crashlog/crashlogX/guid
> >>>>>> +Date:September 2020
> >>>>>> +KernelVe

Re: [PATCH 3/3] platform/x86: Intel PMT Crashlog capability driver

2020-09-21 Thread Alexander Duyck
On Sat, Sep 19, 2020 at 1:01 AM Alexey Budankov
 wrote:
>
> Hi,
>
> Thanks for the patches.
>
> On 11.09.2020 22:45, David E. Box wrote:
> > From: Alexander Duyck 
> >
> > Add support for the Intel Platform Monitoring Technology crashlog
> > interface.  This interface provides a few sysfs values to allow for
> > controlling the crashlog telemetry interface as well as a character driver
> > to allow for mapping the crashlog memory region so that it can be accessed
> > after a crashlog has been recorded.
> >
> > This driver is meant to only support the server version of the crashlog
> > which is identified as crash_type 1 with a version of zero. Currently no
> > other types are supported.
> >
> > Signed-off-by: Alexander Duyck 
> > Signed-off-by: David E. Box 
> > ---
> >  .../ABI/testing/sysfs-class-pmt_crashlog  |  66 ++
> >  drivers/platform/x86/Kconfig  |  10 +
> >  drivers/platform/x86/Makefile |   1 +
> >  drivers/platform/x86/intel_pmt_crashlog.c | 588 ++
> >  4 files changed, 665 insertions(+)
> >  create mode 100644 Documentation/ABI/testing/sysfs-class-pmt_crashlog
> >  create mode 100644 drivers/platform/x86/intel_pmt_crashlog.c
>
> 
>
> > +
> > +/*
> > + * devfs
> > + */
> > +static int pmt_crashlog_open(struct inode *inode, struct file *filp)
> > +{
> > + struct crashlog_entry *entry;
> > + struct pci_driver *pci_drv;
> > + struct pmt_crashlog_priv *priv;
> > +
> > + if (!capable(CAP_SYS_ADMIN))
> > + return -EPERM;
>
> Will not this above still block access to /dev/crashlogX for admin_group users
> in case root configured access e.g. similar to this:
>
> ls -alh /dev/
> crw-rw.  1 root admin_group  1,   9 Sep 15 18:28 crashlogX
>
> If yes then that capable() check is probably superfluous and
> should be avoided in order not to block access to PMT data.
>
> Could you please clarify or comment?
>
> Thanks,
> Alexei

Actually this should probably be updated to "if (!perfmon_capable())"
instead. The telemetry driver code originally had the CAP_SYS_ADMIN
check, and it probably makes more sense to limit this to the same
users who have access to perfmon.
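
In code terms the open() check would just become:

    /* gate the character device on CAP_PERFMON rather than CAP_SYS_ADMIN */
    if (!perfmon_capable())
            return -EPERM;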

Thanks.

- Alex


Re: [PATCH 3/3] platform/x86: Intel PMT Crashlog capability driver

2020-09-17 Thread Alexander Duyck
On Thu, Sep 17, 2020 at 5:12 AM Hans de Goede  wrote:
>
> Hi,
>
> On 9/15/20 12:35 AM, Alexander Duyck wrote:
> > On Mon, Sep 14, 2020 at 11:07 AM Alexander Duyck
> >  wrote:
> >>
> >> On Mon, Sep 14, 2020 at 6:42 AM Hans de Goede  wrote:
> >>>
> >>> Hi,
> >>>
> >>> On 9/11/20 9:45 PM, David E. Box wrote:
> >>>> From: Alexander Duyck 
> >>>>
> >>>> Add support for the Intel Platform Monitoring Technology crashlog
> >>>> interface.  This interface provides a few sysfs values to allow for
> >>>> controlling the crashlog telemetry interface as well as a character 
> >>>> driver
> >>>> to allow for mapping the crashlog memory region so that it can be 
> >>>> accessed
> >>>> after a crashlog has been recorded.
> >>>>
> >>>> This driver is meant to only support the server version of the crashlog
> >>>> which is identified as crash_type 1 with a version of zero. Currently no
> >>>> other types are supported.
> >>>>
> >>>> Signed-off-by: Alexander Duyck 
> >>>> Signed-off-by: David E. Box 
> >>>> ---
> >>>>.../ABI/testing/sysfs-class-pmt_crashlog  |  66 ++
> >>>>drivers/platform/x86/Kconfig  |  10 +
> >>>>drivers/platform/x86/Makefile |   1 +
> >>>>drivers/platform/x86/intel_pmt_crashlog.c | 588 ++
> >>>>4 files changed, 665 insertions(+)
> >>>>create mode 100644 Documentation/ABI/testing/sysfs-class-pmt_crashlog
> >>>>create mode 100644 drivers/platform/x86/intel_pmt_crashlog.c
> >>>>
> >>>> diff --git a/Documentation/ABI/testing/sysfs-class-pmt_crashlog 
> >>>> b/Documentation/ABI/testing/sysfs-class-pmt_crashlog
> >>>> new file mode 100644
> >>>> index ..40fb4ff437a6
> >>>> --- /dev/null
> >>>> +++ b/Documentation/ABI/testing/sysfs-class-pmt_crashlog
> >>>> @@ -0,0 +1,66 @@
> >>>> +What:    /sys/class/pmt_crashlog/
> >>>> +Date:September 2020
> >>>> +KernelVersion:   5.10
> >>>> +Contact: Alexander Duyck 
> >>>> +Description:
> >>>> + The pmt_crashlog/ class directory contains information
> >>>> + for devices that expose crashlog capabilities using the 
> >>>> Intel
> >>>> + Platform Monitoring Technology (PTM).
> >>>> +
> >>>> +What:/sys/class/pmt_crashlog/crashlogX
> >>>> +Date:September 2020
> >>>> +KernelVersion:   5.10
> >>>> +Contact: Alexander Duyck 
> >>>> +Description:
> >>>> + The crashlogX directory contains files for configuring an
> >>>> + instance of a PMT crashlog device that can perform crash 
> >>>> data
> >>>> + recoring. Each crashlogX device has an associated
> >>>> + /dev/crashlogX device node. This node can be opened and 
> >>>> mapped
> >>>> + to access the resulting crashlog data. The register layout 
> >>>> for
> >>>> + the log can be determined from an XML file of specified 
> >>>> guid
> >>>> + for the parent device.
> >>>> +
> >>>> +What:    /sys/class/pmt_crashlog/crashlogX/guid
> >>>> +Date:September 2020
> >>>> +KernelVersion:   5.10
> >>>> +Contact: Alexander Duyck 
> >>>> +Description:
> >>>> + (RO) The guid for this crashlog device. The guid 
> >>>> identifies the
> >>>> + version of the XML file for the parent device that should 
> >>>> be
> >>>> + used to determine the register layout.
> >>>> +
> >>>> +What:/sys/class/pmt_crashlog/crashlogX/size
> >>>> +Date:September 2020
> >>>> +KernelVersion:   5.10
> >>>> +Contact: Alexander Duyck 
> >>>> +Description:
> >>>> + (RO) The length of the result buffer in bytes that 
> >>>> corresponds

Re: [PATCH -next] PCI/IOV: use module_pci_driver to simplify the code

2020-09-17 Thread Alexander Duyck
On Thu, Sep 17, 2020 at 9:56 AM Bjorn Helgaas  wrote:
>
> [+cc Alexander]
>
> On Thu, Sep 17, 2020 at 03:10:42PM +0800, Liu Shixin wrote:
> > Use the module_pci_driver() macro to make the code simpler
> > by eliminating module_init and module_exit calls.
> >
> > Signed-off-by: Liu Shixin 
>
> Applied to pci/misc for v5.10, thanks!

The code below seems pretty straightforward.

Acked-by: Alexander Duyck 

> > ---
> >  drivers/pci/pci-pf-stub.c | 14 +-
> >  1 file changed, 1 insertion(+), 13 deletions(-)
> >
> > diff --git a/drivers/pci/pci-pf-stub.c b/drivers/pci/pci-pf-stub.c
> > index a0b2bd6c918a..45855a5e9fca 100644
> > --- a/drivers/pci/pci-pf-stub.c
> > +++ b/drivers/pci/pci-pf-stub.c
> > @@ -37,18 +37,6 @@ static struct pci_driver pf_stub_driver = {
> >   .probe  = pci_pf_stub_probe,
> >   .sriov_configure= pci_sriov_configure_simple,
> >  };
> > -
> > -static int __init pci_pf_stub_init(void)
> > -{
> > - return pci_register_driver(_stub_driver);
> > -}
> > -
> > -static void __exit pci_pf_stub_exit(void)
> > -{
> > - pci_unregister_driver(_stub_driver);
> > -}
> > -
> > -module_init(pci_pf_stub_init);
> > -module_exit(pci_pf_stub_exit);
> > +module_pci_driver(pf_stub_driver);
> >
> >  MODULE_LICENSE("GPL");
> > --
> > 2.25.1
> >


Re: [PATCH v18 00/32] per memcg lru_lock: reviews

2020-09-17 Thread Alexander Duyck
On Thu, Sep 17, 2020 at 7:26 AM Daniel Jordan
 wrote:
>
> On Thu, Sep 17, 2020 at 10:37:45AM +0800, Alex Shi wrote:
> > 在 2020/9/16 上午12:58, Daniel Jordan 写道:
> > > On Tue, Sep 15, 2020 at 01:21:56AM -0700, Hugh Dickins wrote:
> > >> On Sun, 13 Sep 2020, Alex Shi wrote:
> > >>> Uh, I updated the testing with some new results here:
> > >>> https://lkml.org/lkml/2020/8/26/212
> > >> Right, I missed that, that's better, thanks.  Any other test results?
> > > Alex, you were doing some will-it-scale runs earlier.  Are you planning 
> > > to do
> > > more of those?  Otherwise I can add them in.
> >
> > Hi Daniel,
> >
> > Does compaction perf scalable, like thpscale, I except they could get some 
> > benefit.
>
> Yep, I plan to stress compaction.  Reclaim as well.
>
> I should have said which Alex I meant.  I was asking Alex Duyck since he'd 
> done
> some will-it-scale runs.

I probably won't be able to do any will-it-scale runs any time soon.
If I recall I ran them for this latest v18 patch set and didn't see
any regressions like I did with the previous set. However the system I
was using is tied up for other purposes and it may be a while before I
can free it up to look into this again.

Thanks.

- Alex


Re: [PATCH RFC 1/4] mm/page_alloc: convert "report" flag of __free_one_page() to a proper flag

2020-09-16 Thread Alexander Duyck
On Wed, Sep 16, 2020 at 11:34 AM David Hildenbrand  wrote:
>
> Let's prepare for additional flags and avoid long parameter lists of bools.
> Follow-up patches will also make use of the flags in __free_pages_ok(),
> however, I wasn't able to come up with a better name for the type - should
> be good enough for internal purposes.
>
> Cc: Andrew Morton 
> Cc: Alexander Duyck 
> Cc: Mel Gorman 
> Cc: Michal Hocko 
> Cc: Dave Hansen 
> Cc: Vlastimil Babka 
> Cc: Wei Yang 
> Cc: Oscar Salvador 
> Cc: Mike Rapoport 
> Signed-off-by: David Hildenbrand 
> ---
>  mm/page_alloc.c | 28 
>  1 file changed, 20 insertions(+), 8 deletions(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 6b699d273d6e..91cefb8157dd 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -77,6 +77,18 @@
>  #include "shuffle.h"
>  #include "page_reporting.h"
>
> +/* Free One Page flags: for internal, non-pcp variants of free_pages(). */
> +typedef int __bitwise fop_t;
> +
> +/* No special request */
> +#define FOP_NONE   ((__force fop_t)0)
> +
> +/*
> + * Skip free page reporting notification after buddy merging (will *not* mark
> + * the page reported, only skip the notification).
> + */
> +#define FOP_SKIP_REPORT_NOTIFY ((__force fop_t)BIT(0))
> +
>  /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
>  static DEFINE_MUTEX(pcp_batch_high_lock);
>  #define MIN_PERCPU_PAGELIST_FRACTION   (8)
> @@ -948,10 +960,9 @@ buddy_merge_likely(unsigned long pfn, unsigned long 
> buddy_pfn,
>   * -- nyc
>   */
>
> -static inline void __free_one_page(struct page *page,
> -   unsigned long pfn,
> -   struct zone *zone, unsigned int order,
> -   int migratetype, bool report)
> +static inline void __free_one_page(struct page *page, unsigned long pfn,
> +  struct zone *zone, unsigned int order,
> +  int migratetype, fop_t fop_flags)
>  {
> struct capture_control *capc = task_capc(zone);
> unsigned long buddy_pfn;
> @@ -1038,7 +1049,7 @@ static inline void __free_one_page(struct page *page,
> add_to_free_list(page, zone, order, migratetype);
>
> /* Notify page reporting subsystem of freed page */
> -   if (report)
> +   if (!(fop_flags & FOP_SKIP_REPORT_NOTIFY))
> page_reporting_notify_free(order);
>  }
>
> @@ -1368,7 +1379,7 @@ static void free_pcppages_bulk(struct zone *zone, int 
> count,
> if (unlikely(isolated_pageblocks))
> mt = get_pageblock_migratetype(page);
>
> -   __free_one_page(page, page_to_pfn(page), zone, 0, mt, true);
> +   __free_one_page(page, page_to_pfn(page), zone, 0, mt, 
> FOP_NONE);
> trace_mm_page_pcpu_drain(page, 0, mt);
> }
> spin_unlock(>lock);
> @@ -1384,7 +1395,7 @@ static void free_one_page(struct zone *zone,
> is_migrate_isolate(migratetype))) {
> migratetype = get_pfnblock_migratetype(page, pfn);
> }
> -   __free_one_page(page, pfn, zone, order, migratetype, true);
> +   __free_one_page(page, pfn, zone, order, migratetype, FOP_NONE);
> spin_unlock(>lock);
>  }
>
> @@ -3277,7 +3288,8 @@ void __putback_isolated_page(struct page *page, 
> unsigned int order, int mt)
> lockdep_assert_held(>lock);
>
> /* Return isolated page to tail of freelist. */
> -   __free_one_page(page, page_to_pfn(page), zone, order, mt, false);
> +   __free_one_page(page, page_to_pfn(page), zone, order, mt,
> +   FOP_SKIP_REPORT_NOTIFY);
>  }
>
>  /*

Seems pretty straightforward. So we are basically flipping the logic,
replacing !report with FOP_SKIP_REPORT_NOTIFY.

Reviewed-by: Alexander Duyck 


Re: [PATCH RFC 2/4] mm/page_alloc: place pages to tail in __putback_isolated_page()

2020-09-16 Thread Alexander Duyck
On Wed, Sep 16, 2020 at 11:34 AM David Hildenbrand  wrote:
>
> __putback_isolated_page() already documents that pages will be placed to
> the tail of the freelist - this is, however, not the case for
> "order >= MAX_ORDER - 2" (see buddy_merge_likely()) - which should be
> the case for all existing users.
>
> This change affects two users:
> - free page reporting
> - page isolation, when undoing the isolation.
>
> This behavior is desireable for pages that haven't really been touched

I think "desirable" is misspelled here.

> lately, so exactly the two users that don't actually read/write page
> content, but rather move untouched pages.

So in reality we were already dealing with this for page reporting,
but not in the most direct way. If I recall, we were adding the pages
to the head of the list, and then when we went back to pull more pages
we were doing list rotation in the report function, so while they were
technically added to the head they usually ended up back on the tail
anyway. If anything the benefit for page reporting is that this is
more direct: we will only have to rescan the pages once we have
consumed all of the reported ones on the list.

> The new behavior is especially desirable for memory onlining, where we
> allow allocation of newly onlined pages via undo_isolate_page_range()
> in online_pages(). Right now, we always place them to the head of the
> free list, resulting in undesireable behavior: Assume we add
> individual memory chunks via add_memory() and online them right away to
> the NORMAL zone. We create a dependency chain of unmovable allocations
> e.g., via the memmap. The memmap of the next chunk will be placed onto
> previous chunks - if the last block cannot get offlined+removed, all
> dependent ones cannot get offlined+removed. While this can already be
> observed with individual DIMMs, it's more of an issue for virtio-mem
> (and I suspect also ppc DLPAR).
>
> Note: If we observe a degradation due to the changed page isolation
> behavior (which I doubt), we can always make this configurable by the
> instance triggering undo of isolation (e.g., alloc_contig_range(),
> memory onlining, memory offlining).
>
> Cc: Andrew Morton 
> Cc: Alexander Duyck 
> Cc: Mel Gorman 
> Cc: Michal Hocko 
> Cc: Dave Hansen 
> Cc: Vlastimil Babka 
> Cc: Wei Yang 
> Cc: Oscar Salvador 
> Cc: Mike Rapoport 
> Cc: Scott Cheloha 
> Cc: Michael Ellerman 
> Signed-off-by: David Hildenbrand 
> ---
>  mm/page_alloc.c | 10 +-
>  1 file changed, 9 insertions(+), 1 deletion(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 91cefb8157dd..bba9a0f60c70 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -89,6 +89,12 @@ typedef int __bitwise fop_t;
>   */
>  #define FOP_SKIP_REPORT_NOTIFY ((__force fop_t)BIT(0))
>
> +/*
> + * Place the freed page to the tail of the freelist after buddy merging. Will
> + * get ignored with page shuffling enabled.
> + */
> +#define FOP_TO_TAIL((__force fop_t)BIT(1))
> +
>  /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
>  static DEFINE_MUTEX(pcp_batch_high_lock);
>  #define MIN_PERCPU_PAGELIST_FRACTION   (8)
> @@ -1040,6 +1046,8 @@ static inline void __free_one_page(struct page *page, 
> unsigned long pfn,
>
> if (is_shuffle_order(order))
> to_tail = shuffle_pick_tail();
> +   else if (fop_flags & FOP_TO_TAIL)
> +   to_tail = true;
> else
> to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
>
> @@ -3289,7 +3297,7 @@ void __putback_isolated_page(struct page *page, 
> unsigned int order, int mt)
>
> /* Return isolated page to tail of freelist. */
> __free_one_page(page, page_to_pfn(page), zone, order, mt,
> -   FOP_SKIP_REPORT_NOTIFY);
> +   FOP_SKIP_REPORT_NOTIFY | FOP_TO_TAIL);
>  }
>
>  /*

The code looks good to me.

Reviewed-by: Alexander Duyck 


Re: [PATCH 3/3] platform/x86: Intel PMT Crashlog capability driver

2020-09-14 Thread Alexander Duyck
On Mon, Sep 14, 2020 at 11:07 AM Alexander Duyck
 wrote:
>
> On Mon, Sep 14, 2020 at 6:42 AM Hans de Goede  wrote:
> >
> > Hi,
> >
> > On 9/11/20 9:45 PM, David E. Box wrote:
> > > From: Alexander Duyck 
> > >
> > > Add support for the Intel Platform Monitoring Technology crashlog
> > > interface.  This interface provides a few sysfs values to allow for
> > > controlling the crashlog telemetry interface as well as a character driver
> > > to allow for mapping the crashlog memory region so that it can be accessed
> > > after a crashlog has been recorded.
> > >
> > > This driver is meant to only support the server version of the crashlog
> > > which is identified as crash_type 1 with a version of zero. Currently no
> > > other types are supported.
> > >
> > > Signed-off-by: Alexander Duyck 
> > > Signed-off-by: David E. Box 
> > > ---
> > >   .../ABI/testing/sysfs-class-pmt_crashlog  |  66 ++
> > >   drivers/platform/x86/Kconfig  |  10 +
> > >   drivers/platform/x86/Makefile |   1 +
> > >   drivers/platform/x86/intel_pmt_crashlog.c | 588 ++
> > >   4 files changed, 665 insertions(+)
> > >   create mode 100644 Documentation/ABI/testing/sysfs-class-pmt_crashlog
> > >   create mode 100644 drivers/platform/x86/intel_pmt_crashlog.c
> > >
> > > diff --git a/Documentation/ABI/testing/sysfs-class-pmt_crashlog 
> > > b/Documentation/ABI/testing/sysfs-class-pmt_crashlog
> > > new file mode 100644
> > > index 0000..40fb4ff437a6
> > > --- /dev/null
> > > +++ b/Documentation/ABI/testing/sysfs-class-pmt_crashlog
> > > @@ -0,0 +1,66 @@
> > > +What:/sys/class/pmt_crashlog/
> > > +Date:September 2020
> > > +KernelVersion:   5.10
> > > +Contact: Alexander Duyck 
> > > +Description:
> > > + The pmt_crashlog/ class directory contains information
> > > + for devices that expose crashlog capabilities using the 
> > > Intel
> > > + Platform Monitoring Technology (PTM).
> > > +
> > > +What:/sys/class/pmt_crashlog/crashlogX
> > > +Date:September 2020
> > > +KernelVersion:   5.10
> > > +Contact: Alexander Duyck 
> > > +Description:
> > > + The crashlogX directory contains files for configuring an
> > > + instance of a PMT crashlog device that can perform crash 
> > > data
> > > + recoring. Each crashlogX device has an associated
> > > +     /dev/crashlogX device node. This node can be opened and 
> > > mapped
> > > + to access the resulting crashlog data. The register layout 
> > > for
> > > + the log can be determined from an XML file of specified guid
> > > + for the parent device.
> > > +
> > > +What:/sys/class/pmt_crashlog/crashlogX/guid
> > > +Date:September 2020
> > > +KernelVersion:   5.10
> > > +Contact: Alexander Duyck 
> > > +Description:
> > > + (RO) The guid for this crashlog device. The guid identifies 
> > > the
> > > + version of the XML file for the parent device that should be
> > > + used to determine the register layout.
> > > +
> > > +What:/sys/class/pmt_crashlog/crashlogX/size
> > > +Date:September 2020
> > > +KernelVersion:   5.10
> > > +Contact: Alexander Duyck 
> > > +Description:
> > > +     (RO) The length of the result buffer in bytes that 
> > > corresponds
> > > + to the mapping size for the /dev/crashlogX device node.
> > > +
> > > +What:/sys/class/pmt_crashlog/crashlogX/offset
> > > +Date:September 2020
> > > +KernelVersion:   5.10
> > > +Contact: Alexander Duyck 
> > > +Description:
> > > + (RO) The offset of the buffer in bytes that corresponds
> > > + to the mapping for the /dev/crashlogX device node.
> > > +
> > > +What:/sys/class/pmt_crashlog/crashlogX/enable
> > > +Date:September 2020
> > > +KernelVersion:   5.10
> > > +Contact: Alexander Duyck 
> > > +Description:
> > > + (RW) 

Re: [PATCH 3/3] platform/x86: Intel PMT Crashlog capability driver

2020-09-14 Thread Alexander Duyck
On Mon, Sep 14, 2020 at 6:42 AM Hans de Goede  wrote:
>
> Hi,
>
> On 9/11/20 9:45 PM, David E. Box wrote:
> > From: Alexander Duyck 
> >
> > Add support for the Intel Platform Monitoring Technology crashlog
> > interface.  This interface provides a few sysfs values to allow for
> > controlling the crashlog telemetry interface as well as a character driver
> > to allow for mapping the crashlog memory region so that it can be accessed
> > after a crashlog has been recorded.
> >
> > This driver is meant to only support the server version of the crashlog
> > which is identified as crash_type 1 with a version of zero. Currently no
> > other types are supported.
> >
> > Signed-off-by: Alexander Duyck 
> > Signed-off-by: David E. Box 
> > ---
> >   .../ABI/testing/sysfs-class-pmt_crashlog  |  66 ++
> >   drivers/platform/x86/Kconfig  |  10 +
> >   drivers/platform/x86/Makefile |   1 +
> >   drivers/platform/x86/intel_pmt_crashlog.c | 588 ++
> >   4 files changed, 665 insertions(+)
> >   create mode 100644 Documentation/ABI/testing/sysfs-class-pmt_crashlog
> >   create mode 100644 drivers/platform/x86/intel_pmt_crashlog.c
> >
> > diff --git a/Documentation/ABI/testing/sysfs-class-pmt_crashlog 
> > b/Documentation/ABI/testing/sysfs-class-pmt_crashlog
> > new file mode 100644
> > index ..40fb4ff437a6
> > --- /dev/null
> > +++ b/Documentation/ABI/testing/sysfs-class-pmt_crashlog
> > @@ -0,0 +1,66 @@
> > +What:/sys/class/pmt_crashlog/
> > +Date:September 2020
> > +KernelVersion:   5.10
> > +Contact: Alexander Duyck 
> > +Description:
> > + The pmt_crashlog/ class directory contains information
> > + for devices that expose crashlog capabilities using the Intel
> > + Platform Monitoring Technology (PTM).
> > +
> > +What:/sys/class/pmt_crashlog/crashlogX
> > +Date:September 2020
> > +KernelVersion:   5.10
> > +Contact: Alexander Duyck 
> > +Description:
> > + The crashlogX directory contains files for configuring an
> > + instance of a PMT crashlog device that can perform crash data
> > + recoring. Each crashlogX device has an associated
> > + /dev/crashlogX device node. This node can be opened and mapped
> > + to access the resulting crashlog data. The register layout for
> > + the log can be determined from an XML file of specified guid
> > + for the parent device.
> > +
> > +What:/sys/class/pmt_crashlog/crashlogX/guid
> > +Date:September 2020
> > +KernelVersion:   5.10
> > +Contact: Alexander Duyck 
> > +Description:
> > + (RO) The guid for this crashlog device. The guid identifies 
> > the
> > + version of the XML file for the parent device that should be
> > + used to determine the register layout.
> > +
> > +What:/sys/class/pmt_crashlog/crashlogX/size
> > +Date:September 2020
> > +KernelVersion:   5.10
> > +Contact: Alexander Duyck 
> > +Description:
> > + (RO) The length of the result buffer in bytes that corresponds
> > + to the mapping size for the /dev/crashlogX device node.
> > +
> > +What:/sys/class/pmt_crashlog/crashlogX/offset
> > +Date:September 2020
> > +KernelVersion:   5.10
> > +Contact: Alexander Duyck 
> > +Description:
> > + (RO) The offset of the buffer in bytes that corresponds
> > + to the mapping for the /dev/crashlogX device node.
> > +
> > +What:    /sys/class/pmt_crashlog/crashlogX/enable
> > +Date:September 2020
> > +KernelVersion:   5.10
> > +Contact: Alexander Duyck 
> > +Description:
> > + (RW) Boolean value controlling if the crashlog functionality
> > + is enabled for the /dev/crashlogX device node.
> > +
> > +What:/sys/class/pmt_crashlog/crashlogX/trigger
> > +Date:September 2020
> > +KernelVersion:   5.10
> > +Contact: Alexander Duyck 
> > +Description:
> > + (RW) Boolean value controlling  the triggering of the
> > + /dev/crashlogX device node. When read it provides data on if
> > + the crashlog has been tr

Re: [PATCH v18 00/32] per memcg lru_lock: reviews

2020-09-10 Thread Alexander Duyck
On Wed, Sep 9, 2020 at 5:32 PM Hugh Dickins  wrote:
>
> On Wed, 9 Sep 2020, Alexander Duyck wrote:
> > On Tue, Sep 8, 2020 at 4:41 PM Hugh Dickins  wrote:
> > > [PATCH v18 28/32] mm/compaction: Drop locked from 
> > > isolate_migratepages_block
> > > Most of this consists of replacing "locked" by "lruvec", which is good:
> > > but please fold those changes back into 20/32 (or would it be 17/32?
> > > I've not yet looked into the relationship between those two), so we
> > > can then see more clearly what change this 28/32 (will need renaming!)
> > > actually makes, to use lruvec_holds_page_lru_lock(). That may be a
> > > good change, but it's mixed up with the "locked"->"lruvec" at present,
> > > and I think you could have just used lruvec for locked all along
> > > (but of course there's a place where you'll need new_lruvec too).
> >
> > I am good with my patch being folded in. No need to keep it separate.
>
> Thanks.  Though it was only the "locked"->"lruvec" changes I was
> suggesting to fold back, to minimize the diff, so that we could
> see your use of lruvec_holds_page_lru_lock() more clearly - you
> had not introduced that function at the stage of the earlier patches.
>
> But now that I stare at it again, using lruvec_holds_page_lru_lock()
> there doesn't look like an advantage to me: when it decides no, the
> same calculation is made all over again in mem_cgroup_page_lruvec(),
> whereas the code before only had to calculate it once.
>
> So, the code before looks better to me: I wonder, do you think that
> rcu_read_lock() is more expensive than I think it?  There can be
> debug instrumentation that makes it heavier, but by itself it is
> very cheap (by design) - not worth branching around.

Actually what I was more concerned with was the pointer chase that
required the RCU lock. With this function we are able to compare a
pair of pointers from the page and the lruvec and avoid the need for
the RCU lock. The way the old code was working we had to crawl through
the memcg to get to the lruvec before we could compare it to the one
we currently hold. The general idea is to use the data we have instead
of having to pull in some additional cache lines to perform the test.
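
Roughly, the idea is that the check reduces to comparing pointers the caller
already has in hand. A sketch of the shape of it, with approximate helper
names rather than the exact code from the series:

/* Sketch: decide whether the lruvec whose lock we already hold covers
 * this page by comparing cached pointers, instead of walking
 * page -> memcg -> lruvec under rcu_read_lock(). Names approximate. */
static inline bool lruvec_holds_page_lru_lock(struct page *page,
					      struct lruvec *lruvec)
{
	return lruvec_pgdat(lruvec) == page_pgdat(page) &&
	       lruvec_memcg(lruvec) == page_memcg(page);
}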

> >
> > > [PATCH v18 29/32] mm: Identify compound pages sooner in 
> > > isolate_migratepages_block
> > > NAK. I agree that isolate_migratepages_block() looks nicer this way, but
> > > take a look at prep_new_page() in mm/page_alloc.c: post_alloc_hook() is
> > > where set_page_refcounted() changes page->_refcount from 0 to 1, allowing
> > > a racing get_page_unless_zero() to succeed; then later 
> > > prep_compound_page()
> > > is where PageHead and PageTails get set. So there's a small race window in
> > > which this patch could deliver a compound page when it should not.
> >
> > So the main motivation for the patch was to avoid the case where we
> > are having to reset the LRU flag.
>
> That would be satisfying.  Not necessary, but I agree satisfying.
> Maybe depends also on your "skip" change, which I've not looked at yet?

My concern is that we have scenarios where isolate_migratepages_block
could prevent another thread from being able to isolate a page. I'm
mostly concerned with us potentially creating something like an
isolation leak if multiple threads are clearing and then resetting the
LRU flag. In my mind, if we clear the LRU flag we should be certain we
are going to remove the page, as otherwise another thread could have
done it had it been allowed access.

> > One question I would have is what if
> > we swapped the code block with the __isolate_lru_page_prepare section?
> > With that we would be taking a reference on the page, then verifying
> > the LRU flag is set, and then testing for compound page flag bit.
> > Would doing that close the race window since the LRU flag being set
> > should indicate that the allocation has already been completed has it
> > not?
>
> Yes, I think that would be safe, and would look better.  But I am
> very hesitant to give snap assurances here (I've twice missed out
> a vital PageLRU check from this sequence myself): it is very easy
> to deceive myself and only see it later.

I'm not looking for assurances, just sanity checks to make sure I am
not missing something obvious.

> If you can see a bug in what's there before these patches, certainly
> we need to fix it.  But adding non-essential patches to the already
> overlong series risks delaying it.

My concern ends up being that if we are clearing the bit and restoring
it while holding the LRU lock we can effectively cause pages to become
pseudo-pinned on the LRU. In my mind I would want us to avoid clearing
the LRU flag until we know we are going to be pulling the page from
the list once we take the lruvec lock. I interpret clearing of the
flag to indicate the page has already been pulled; it just hasn't left
the list yet. By resetting the bit we are violating that, which I
worry will lead to issues.
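
Put another way, the convention being argued for looks roughly like this
(a sketch of the protocol only, not code from the series):

/* Sketch: whoever wins TestClearPageLRU() owns the page's LRU state
 * and is expected to take the page off the list under the lruvec lock;
 * a loser backs off and must not set the flag back. */
if (TestClearPageLRU(page)) {
	spin_lock_irq(&lruvec->lru_lock);
	del_page_from_lru_list(page, lruvec, page_lru(page));
	spin_unlock_irq(&lruvec->lru_lock);
} else {
	/* another thread is isolating this page; leave it alone */
}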


Re: [PATCH v18 31/32] mm: Add explicit page decrement in exception path for isolate_lru_pages

2020-09-09 Thread Alexander Duyck
On Wed, Sep 9, 2020 at 11:24 AM Hugh Dickins  wrote:
>
> On Wed, 9 Sep 2020, Alexander Duyck wrote:
> > On Tue, Sep 8, 2020 at 6:01 PM Matthew Wilcox  wrote:
> > > On Mon, Aug 24, 2020 at 08:55:04PM +0800, Alex Shi wrote:
> > > > +++ b/mm/vmscan.c
> > > > @@ -1688,10 +1688,13 @@ static unsigned long isolate_lru_pages(unsigned 
> > > > long nr_to_scan,
> > > >
> > > >   if (!TestClearPageLRU(page)) {
> > > >   /*
> > > > -  * This page may in other isolation path,
> > > > -  * but we still hold lru_lock.
> > > > +  * This page is being isolated in another
> > > > +  * thread, but we still hold lru_lock. The
> > > > +  * other thread must be holding a 
> > > > reference
> > > > +  * to the page so this should never hit a
> > > > +  * reference count of 0.
> > > >*/
> > > > - put_page(page);
> > > > + WARN_ON(put_page_testzero(page));
> > > >   goto busy;
> > >
> > > I read Hugh's review and that led me to take a look at this.  We don't
> > > do it like this.  Use the same pattern as elsewhere in mm:
> > >
> > > page_ref_sub(page, nr);
> > > VM_BUG_ON_PAGE(page_count(page) <= 0, page);
> > >
> > >
> >
> > Actually for this case page_ref_dec(page) would make more sense
> > wouldn't it? Otherwise I agree that would be a better change if that
> > is the way it has been handled before. I just wasn't familiar with
> > those other spots.
>
> After overnight reflection, my own preference would be simply to
> drop this patch.  I think we are making altogether too much of a
> fuss here over what was simply correct as plain put_page()
> (and further from correct if we change it to leak the page in an
> unforeseen circumstance).
>
> And if Alex's comment was not quite grammatically correct, never mind,
> it said as much as was worth saying.  I got more worried by his
> placement of the "busy:" label, but that does appear to work correctly.
>
> There's probably a thousand places where put_page() is used, where
> it would be troublesome if it were the final put_page(): this one
> bothered you because you'd been looking at isolate_migratepages_block(),
> and its necessary avoidance of lru_lock recursion on put_page();
> but let's just just leave this put_page() as is.

I'd be fine with that, but I would still like to see the comment
updated. At a minimum we should make it clear that we believe
put_page is safe here as the count should never reach zero, and if it
does then we are looking at a bug. Then if this starts triggering soft
lockups we at least have documentation somewhere that someone can
reference on what we expected and why we triggered a lockup.

- Alex


Re: [PATCH v18 00/32] per memcg lru_lock: reviews

2020-09-09 Thread Alexander Duyck
On Tue, Sep 8, 2020 at 4:41 PM Hugh Dickins  wrote:
>



> [PATCH v18 28/32] mm/compaction: Drop locked from isolate_migratepages_block
> Most of this consists of replacing "locked" by "lruvec", which is good:
> but please fold those changes back into 20/32 (or would it be 17/32?
> I've not yet looked into the relationship between those two), so we
> can then see more clearly what change this 28/32 (will need renaming!)
> actually makes, to use lruvec_holds_page_lru_lock(). That may be a
> good change, but it's mixed up with the "locked"->"lruvec" at present,
> and I think you could have just used lruvec for locked all along
> (but of course there's a place where you'll need new_lruvec too).

I am good with my patch being folded in. No need to keep it separate.

> [PATCH v18 29/32] mm: Identify compound pages sooner in 
> isolate_migratepages_block
> NAK. I agree that isolate_migratepages_block() looks nicer this way, but
> take a look at prep_new_page() in mm/page_alloc.c: post_alloc_hook() is
> where set_page_refcounted() changes page->_refcount from 0 to 1, allowing
> a racing get_page_unless_zero() to succeed; then later prep_compound_page()
> is where PageHead and PageTails get set. So there's a small race window in
> which this patch could deliver a compound page when it should not.

So the main motivation for the patch was to avoid the case where we
are having to reset the LRU flag. One question I would have is what if
we swapped the code block with the __isolate_lru_page_prepare section?
With that we would be taking a reference on the page, then verifying
the LRU flag is set, and then testing for compound page flag bit.
Would doing that close the race window since the LRU flag being set
should indicate that the allocation has already been completed has it
not?

> [PATCH v18 30/32] mm: Drop use of test_and_set_skip in favor of just setting 
> skip
> I haven't looked at this yet (but recall that per-memcg lru_lock can
> change the point at which compaction should skip a contended lock: IIRC
> the current kernel needs nothing extra, whereas some earlier kernels did
> need extra; but when I look at 30/32, may find these remarks irrelevant).
>
> [PATCH v18 31/32] mm: Add explicit page decrement in exception path for 
> isolate_lru_pages
> The title of this patch is definitely wrong: there was an explicit page
> decrement there before (put_page), now it's wrapping it up inside a
> WARN_ON().  We usually prefer to avoid doing functional operations
> inside WARN/BUGs, but I think I'll overlook that - anyone else worried?
> The comment is certainly better than what was there before: yes, this
> warning reflects the difficulty we have in thinking about the
> TestClearPageLRU protocol: which I'm still not sold on, but
> agree we should proceed with.  With a change in title, perhaps
> "mm: add warning where TestClearPageLRU failed on freeable page"?
> Acked-by: Hugh Dickins 

I can update that and resubmit it if needed. I know there were also
some suggestions from Matthew.


Re: [PATCH v18 31/32] mm: Add explicit page decrement in exception path for isolate_lru_pages

2020-09-09 Thread Alexander Duyck
On Tue, Sep 8, 2020 at 6:01 PM Matthew Wilcox  wrote:
>
> On Mon, Aug 24, 2020 at 08:55:04PM +0800, Alex Shi wrote:
> > +++ b/mm/vmscan.c
> > @@ -1688,10 +1688,13 @@ static unsigned long isolate_lru_pages(unsigned 
> > long nr_to_scan,
> >
> >   if (!TestClearPageLRU(page)) {
> >   /*
> > -  * This page may in other isolation path,
> > -  * but we still hold lru_lock.
> > +  * This page is being isolated in another
> > +  * thread, but we still hold lru_lock. The
> > +  * other thread must be holding a reference
> > +  * to the page so this should never hit a
> > +  * reference count of 0.
> >*/
> > - put_page(page);
> > + WARN_ON(put_page_testzero(page));
> >   goto busy;
>
> I read Hugh's review and that led me to take a look at this.  We don't
> do it like this.  Use the same pattern as elsewhere in mm:
>
> page_ref_sub(page, nr);
> VM_BUG_ON_PAGE(page_count(page) <= 0, page);
>
>

Actually for this case page_ref_dec(page) would make more sense,
wouldn't it? Otherwise I agree that would be a better change if that
is the way it has been handled before. I just wasn't familiar with
those other spots.
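
For reference, the two variants being weighed here would look roughly like
this in the TestClearPageLRU failure path quoted above (sketch only):

/* Matthew's suggested pattern, specialized to dropping one reference: */
page_ref_dec(page);
VM_BUG_ON_PAGE(page_count(page) <= 0, page);

/* versus the warning-wrapped put from the patch: */
WARN_ON(put_page_testzero(page));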

Thanks.

- Alex


Re: [PATCH v2 2/2] mm/pageblock: remove false sharing in pageblock_flags

2020-08-30 Thread Alexander Duyck
On Sun, Aug 30, 2020 at 3:00 AM Alex Shi  wrote:
>
>
>
> 在 2020/8/20 上午12:50, Alexander Duyck 写道:
> > On Wed, Aug 19, 2020 at 1:11 AM Alex Shi  wrote:
> >>
> >>
> >>
> >> 在 2020/8/19 下午3:57, Anshuman Khandual 写道:
> >>>
> >>>
> >>> On 08/19/2020 11:17 AM, Alex Shi wrote:
> >>>> Current pageblock_flags is only 4 bits, so it has to share a char size
> >>>> in cmpxchg when get set, the false sharing cause perf drop.
> >>>>
> >>>> If we incrase the bits up to 8, false sharing would gone in cmpxchg. and
> >>>> the only cost is half char per pageblock, which is half char per 128MB
> >>>> on x86, 4 chars in 1 GB.
> >>>
> >>> Agreed that increase in memory utilization is negligible here but does
> >>> this really improve performance ?
> >>>
> >>
> >> It's no doubt in theory. and it would had a bad impact according to
> >> commit e380bebe4771548  mm, compaction: keep migration source private to a 
> >> single
> >>
> >> but I do have some problem in running thpscale/mmtest. I'd like to see if 
> >> anyone
> >> could give a try.
> >>
> >> BTW, I naturally hate the false sharing even it's in theory. Anyone who 
> >> doesn't? :)
> >
> > You keep bringing up false sharing but you don't fix the false sharing
> > by doing this. You are still allowing the flags for multiple
> > pageblocks per cacheline so you still have false sharing even after
> > this.
>
> yes, the cacheline false sharing is still there. But as you pointed, cmpxchg 
> level
> false sharing could be addressed much by the patchset.
>
>
> >
> > What I believe you are attempting to address is the fact that multiple
> > pageblocks share a single long value and that long is being used with
> > a cmpxchg so you end up with multiple threads potentially all banging
> > on the same value and watching it change. However the field currently
> > consists of only 4 bits, 3 of them for migratetype and 1 for the skip
> > bit. In the case of the 3 bit portion a cmpxchg makes sense and is
> > usually protected by the zone lock so you would only have one thread
> > accessing it in most cases with the possible exception of a section
> > that spans multiple zones.
> >
> > For the case such as the skip bit and MIGRATE_UNMOVABLE (0x0) where we
> > would be clearing or setting the entire mask maybe it would make more
> > sense to simply use an atomic_or or atomic_and depending on if you are
> > setting or clearing the flag? It would allow you to avoid the spinning
> > or having to read the word before performing the operation since you
> > would just be directly applying an AND or OR via a mask value.
>
> Right that the different level to fix this problem, but narrow the cmpxchg
> comparsion is still needed and helpful.

What I was getting at though is that I am not sure that is the case.
Normally I believe we are always holding the zone lock when updating
the migrate type. The skip flag is a one-off operation that could
easily be addressed by changing the logic to use atomic_and or
atomic_or for the cases where we are updating single bit flags and
setting the mask value to all 1's or all 0's. So adding this extra
complexity which only really applies to the skip bit may not provide
much value, especially as there are a number of possible paths that
don't use the skip bit anyway.
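
A rough sketch of that alternative for the single-bit skip flag, assuming the
bitmap and bit-index helpers internal to mm/page_alloc.c resolve the word and
bit position the same way the existing pageblock flag code does (this is an
illustration of the idea, not a drop-in patch):

/* Sketch: update the pageblock skip hint with one atomic bitop instead
 * of a cmpxchg read-modify-write over the shared flags word. */
static inline void set_pageblock_skip_fast(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	unsigned long *bitmap = get_pageblock_bitmap(page, pfn);
	unsigned long bitidx = pfn_to_bitidx(page, pfn);

	set_bit(bitidx + PB_migrate_skip, bitmap);   /* atomic OR of one bit */
}

static inline void clear_pageblock_skip_fast(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	unsigned long *bitmap = get_pageblock_bitmap(page, pfn);
	unsigned long bitidx = pfn_to_bitidx(page, pfn);

	clear_bit(bitidx + PB_migrate_skip, bitmap); /* atomic AND with ~bit */
}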


Re: [RFC PATCH v2 5/5] mm: Split move_pages_to_lru into 3 separate passes

2020-08-20 Thread Alexander Duyck
On Thu, Aug 20, 2020 at 2:58 AM Alex Shi  wrote:
>
>
>
> 在 2020/8/19 下午10:42, Alexander Duyck 写道:
> >> It's actually changed the meaning from current func. which I had seen a 
> >> bug if no relock.
> >> but after move to 5.9 kernel, I can not reprodce the bug any more. I am 
> >> not sure if 5.9 fixed
> >> the problem, and we don't need relock here.
> > So I am not sure what you mean here about "changed the meaning from
> > the current func". Which function are you referring to and what
> > changed?
> >
> > From what I can tell the pages cannot change memcg because they were
> > isolated and had the LRU flag stripped. They shouldn't be able to
> > change destination LRU vector as a result. Assuming that, then they
> > can all be processed under same LRU lock and we can avoid having to
> > release it until we are forced to do so to call putback_lru_page or
> > destroy the compound pages that were freed while we were shrinking the
> > LRU lists.
> >
>
> I had sent a bug which base on 5.8 kernel.
> https://lkml.org/lkml/2020/7/28/465
>
> I am not sure it was fixed in new kernel. The original line was introduced by 
> Hugh Dickins
> I believe it would be great if you can get comments from him.

When I brought this up before you had pointed to the relocking being
due to the fact that the function was reacquiring the lruvec for some
reason. I wonder if the serialization provided by stripping the LRU
bit made it so that the check for the lruvec after releasing the
lock became redundant.

- Alex


Re: [RFC PATCH v2 4/5] mm: Split release_pages work into 3 passes

2020-08-20 Thread Alexander Duyck
On Thu, Aug 20, 2020 at 2:51 AM Alex Shi  wrote:
>
>
>
> 在 2020/8/19 下午10:57, Alexander Duyck 写道:
> >>>   lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
> >> the lock bounce is better with the patch, would you like to do further
> >> like using add_lruvecs to reduce bounce more?
> >>
> >> Thanks
> >> Alex
> > I'm not sure how much doing something like that would add. In my case
> > I had a very specific issue that this is addressing which is the fact
> > that every compound page was taking the LRU lock and zone lock
> > separately. With this patch that is reduced to one LRU lock per 15
> > pages and then the zone lock per page. By adding or sorting pages by
> > lruvec I am not sure there will be much benefit as I am not certain
> > how often we will end up with pages being interleaved between multiple
> > lruvecs. In addition as I am limiting the quantity to a pagevec which
> > limits the pages to 15 I am not sure there will be much benefit to be
> > seen for sorting the pages beforehand.
> >
>
> the relock will unlock and get another lock again, the cost in that, the 2nd
> lock need to wait for fairness for concurrency lruvec locking.
> If we can do sort before, we should remove the fairness waiting here. Of 
> course,
> perf result depends on scenarios.

Agreed. The question is in how many scenarios are you going to have
pages interleaved between more than one lruvec? I suspect in most
cases you should only have one lruvec for all the pages being
processed in a single pagevec.


Re: [PATCH v2 2/2] mm/pageblock: remove false sharing in pageblock_flags

2020-08-19 Thread Alexander Duyck
On Wed, Aug 19, 2020 at 1:11 AM Alex Shi  wrote:
>
>
>
> 在 2020/8/19 下午3:57, Anshuman Khandual 写道:
> >
> >
> > On 08/19/2020 11:17 AM, Alex Shi wrote:
> >> Current pageblock_flags is only 4 bits, so it has to share a char size
> >> in cmpxchg when get set, the false sharing cause perf drop.
> >>
> >> If we incrase the bits up to 8, false sharing would gone in cmpxchg. and
> >> the only cost is half char per pageblock, which is half char per 128MB
> >> on x86, 4 chars in 1 GB.
> >
> > Agreed that increase in memory utilization is negligible here but does
> > this really improve performance ?
> >
>
> It's no doubt in theory. and it would had a bad impact according to
> commit e380bebe4771548  mm, compaction: keep migration source private to a 
> single
>
> but I do have some problem in running thpscale/mmtest. I'd like to see if 
> anyone
> could give a try.
>
> BTW, I naturally hate the false sharing even it's in theory. Anyone who 
> doesn't? :)

You keep bringing up false sharing but you don't fix the false sharing
by doing this. You are still allowing the flags for multiple
pageblocks per cacheline so you still have false sharing even after
this.

What I believe you are attempting to address is the fact that multiple
pageblocks share a single long value and that long is being used with
a cmpxchg so you end up with multiple threads potentially all banging
on the same value and watching it change. However the field currently
consists of only 4 bits, 3 of them for migratetype and 1 for the skip
bit. In the case of the 3 bit portion a cmpxchg makes sense and is
usually protected by the zone lock so you would only have one thread
accessing it in most cases with the possible exception of a section
that spans multiple zones.

For the case such as the skip bit and MIGRATE_UNMOVABLE (0x0) where we
would be clearing or setting the entire mask maybe it would make more
sense to simply use an atomic_or or atomic_and depending on if you are
setting or clearing the flag? It would allow you to avoid the spinning
or having to read the word before performing the operation since you
would just be directly applying an AND or OR via a mask value.


Re: [RFC PATCH v2 4/5] mm: Split release_pages work into 3 passes

2020-08-19 Thread Alexander Duyck
On Wed, Aug 19, 2020 at 12:54 AM Alex Shi  wrote:
>
>
>
> 在 2020/8/19 下午12:27, Alexander Duyck 写道:
> > From: Alexander Duyck 
> >
> > The release_pages function has a number of paths that end up with the
> > LRU lock having to be released and reacquired. Such an example would be the
> > freeing of THP pages as it requires releasing the LRU lock so that it can
> > be potentially reacquired by __put_compound_page.
> >
> > In order to avoid that we can split the work into 3 passes, the first
> > without the LRU lock to go through and sort out those pages that are not in
> > the LRU so they can be freed immediately from those that can't. The second
> > pass will then go through removing those pages from the LRU in batches as
> > large as a pagevec can hold before freeing the LRU lock. Once the pages have
> > been removed from the LRU we can then proceed to free the remaining pages
> > without needing to worry about if they are in the LRU any further.
> >
> > The general idea is to avoid bouncing the LRU lock between pages and to
> > hopefully aggregate the lock for up to the full page vector worth of pages.
> >
> > Signed-off-by: Alexander Duyck 
> > ---
> >  mm/swap.c |  109 
> > +
> >  1 file changed, 67 insertions(+), 42 deletions(-)
> >
> > diff --git a/mm/swap.c b/mm/swap.c
> > index fe53449fa1b8..b405f81b2c60 100644
> > --- a/mm/swap.c
> > +++ b/mm/swap.c
> > @@ -795,6 +795,54 @@ void lru_add_drain_all(void)
> >  }
> >  #endif
> >
> > +static void __release_page(struct page *page, struct list_head 
> > *pages_to_free)
> > +{
> > + if (PageCompound(page)) {
> > + __put_compound_page(page);
> > + } else {
> > + /* Clear Active bit in case of parallel mark_page_accessed */
> > + __ClearPageActive(page);
> > + __ClearPageWaiters(page);
> > +
> > + list_add(&page->lru, pages_to_free);
> > + }
> > +}
> > +
> > +static void __release_lru_pages(struct pagevec *pvec,
> > + struct list_head *pages_to_free)
> > +{
> > + struct lruvec *lruvec = NULL;
> > + unsigned long flags = 0;
> > + int i;
> > +
> > + /*
> > +  * The pagevec at this point should contain a set of pages with
> > +  * their reference count at 0 and the LRU flag set. We will now
> > +  * need to pull the pages from their LRU lists.
> > +  *
> > +  * We walk the list backwards here since that way we are starting at
> > +  * the pages that should be warmest in the cache.
> > +  */
> > + for (i = pagevec_count(pvec); i--;) {
> > + struct page *page = pvec->pages[i];
> > +
> > + lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
>
> the lock bounce is better with the patch, would you like to do further
> like using add_lruvecs to reduce bounce more?
>
> Thanks
> Alex

I'm not sure how much doing something like that would add. In my case
I had a very specific issue that this is addressing which is the fact
that every compound page was taking the LRU lock and zone lock
separately. With this patch that is reduced to one LRU lock per 15
pages and then the zone lock per page. By adding or sorting pages by
lruvec I am not sure there will be much benefit as I am not certain
how often we will end up with pages being interleaved between multiple
lruvecs. In addition as I am limiting the quantity to a pagevec which
limits the pages to 15 I am not sure there will be much benefit to be
seen for sorting the pages beforehand.
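
As a rough back-of-the-envelope illustration (assuming a PAGEVEC_SIZE of 15,
as in this era):

/* Freeing 512 compound pages previously meant up to 512 LRU lock
 * acquisitions plus 512 zone lock acquisitions; with the pagevec
 * batching it drops to DIV_ROUND_UP(512, 15) == 35 LRU lock
 * acquisitions, while the per-page zone lock cost stays the same.
 */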

Thanks.

- Alex


Re: [RFC PATCH v2 3/5] mm: Add explicit page decrement in exception path for isolate_lru_pages

2020-08-19 Thread Alexander Duyck
On Wed, Aug 19, 2020 at 12:52 AM Alex Shi  wrote:
>
>
>
> 在 2020/8/19 下午12:27, Alexander Duyck 写道:
> > From: Alexander Duyck 
> >
> > In isolate_lru_pages we have an exception path where if we call
> > get_page_unless_zero and that succeeds, but TestClearPageLRU fails we call
> > put_page. Normally this would be problematic but due to the way that the
> > calls are ordered and the fact that we are holding the LRU lock we know
> > that the caller must be holding another reference for the page. Since we
> > can assume that we can replace the put_page with a call to
> > put_page_testzero contained within a WARN_ON. By doing this we should see
> > if we ever leak a page as a result of the reference count somehow hitting
> > zero when it shouldn't, and can avoid the overhead and confusion of using
> > the full put_page call.
> >
> > Signed-off-by: Alexander Duyck 
> > ---
> >  mm/vmscan.c |9 ++---
> >  1 file changed, 6 insertions(+), 3 deletions(-)
> >
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 5bc0c2322043..3ebe3f9b653b 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -1688,10 +1688,13 @@ static unsigned long isolate_lru_pages(unsigned 
> > long nr_to_scan,
> >
> >   if (!TestClearPageLRU(page)) {
> >   /*
> > -  * This page may in other isolation path,
> > -  * but we still hold lru_lock.
> > +  * This page is being isolated in another
> > +  * thread, but we still hold lru_lock. The
> > +  * other thread must be holding a reference
> > +  * to the page so this should never hit a
> > +  * reference count of 0.
> >*/
> > - put_page(page);
> > + WARN_ON(put_page_testzero(page));
>
> seems WARN_ON is always enabled.
>
> Reviewed-by: Alex Shi 

Yeah, it is always enabled, however it should never be triggered. I had
considered just putting a page_ref_dec here since in theory this path
should never be hit, but as a debug catch I thought I would add the
WARN_ON and put_page_testzero. If we ever do encounter this being
triggered then it will leak a page of memory, which isn't the end of
the world but I thought would warrant a WARN_ON.


Re: [RFC PATCH v2 1/5] mm: Identify compound pages sooner in isolate_migratepages_block

2020-08-19 Thread Alexander Duyck
On Wed, Aug 19, 2020 at 4:43 AM Matthew Wilcox  wrote:
>
> On Tue, Aug 18, 2020 at 09:27:05PM -0700, Alexander Duyck wrote:
> > + /*
> > +  * Page is compound. We know the order before we know if it is
> > +  * on the LRU so we cannot assume it is THP. However since the
> > +  * page will have the LRU validated shortly we can use the 
> > value
> > +  * to skip over this page for now or validate the LRU is set 
> > and
> > +  * then isolate the entire compound page if we are isolating 
> > to
> > +  * generate a CMA page.
> > +  */
> > + if (PageCompound(page)) {
> > + const unsigned int order = compound_order(page);
> > +
> > + if (likely(order < MAX_ORDER))
> > + low_pfn += (1UL << order) - 1;
>
> Hmm.  You're checking for PageCompound but then skipping 1UL << order.
> That only works if PageHead.  If instead this is PageCompound because
> it's PageTail, you need to do something like:
>
> low_pfn |= (1UL << order) - 1;
>
> which will move you to the end of the page you're in the middle of.

Can you successfully call get_page_unless_zero on a tail page? I
thought their reference count was 0? There is a get_page_unless_zero
call before the PageCompound check, so I don't think we can get a tail
page.

> If PageTail can't actually happen here, then it's better to check for
> PageHead explicitly and WARN_ON if you get a PageTail (eg a page was
> combined into a compound page after you processed the earlier head page).
>
> Is it possible the page you've found is hugetlbfs?  Those can have orders
> larger than MAX_ORDER.

So in theory we only need to jump pageblock_order. However there are
some architectures where that is not a fixed constant, so it would
have some additional overhead if I am not mistaken. In addition we
should only have been handed a single pageblock, so the check further
down that prevents low_pfn from passing end_pfn should reset it to the
correct value.
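
For reference, a worked example of the arithmetic Matthew is pointing at
(values are purely illustrative):

/* order = 9, so (1UL << order) - 1 == 0x1ff.
 *
 * Head page at pfn 0x1000:  0x1000 + 0x1ff == 0x11ff  (last pfn of the page)
 * Tail page at pfn 0x1100:  0x1100 + 0x1ff == 0x12ff  (overshoots the page)
 *                           0x1100 | 0x1ff == 0x11ff  (end of the page we
 *                                                      are in the middle of)
 */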


Re: [RFC PATCH v2 5/5] mm: Split move_pages_to_lru into 3 separate passes

2020-08-19 Thread Alexander Duyck
On Wed, Aug 19, 2020 at 12:58 AM Alex Shi  wrote:
>
>
>
> 在 2020/8/19 下午12:27, Alexander Duyck 写道:
> > From: Alexander Duyck 
> >
> > The current code for move_pages_to_lru is meant to release the LRU lock
> > every time it encounters an unevictable page or a compound page that must
> > be freed. This results in a fair amount of code bulk because the lruvec has
> > to be reacquired every time the lock is released and reacquired.
> >
> > Instead of doing this I believe we can break the code up into 3 passes. The
> > first pass will identify the pages we can move to LRU and move those. In
> > addition it will sort the list out leaving the unevictable pages in the
> > list and moving those pages that have dropped to a reference count of 0 to
> > pages_to_free. The second pass will return the unevictable pages to the
> > LRU. The final pass will free any compound pages we have in the
> > pages_to_free list before we merge it back with the original list and
> > return from the function.
> >
> > The advantage of doing it this way is that we only have to release the lock
> > between pass 1 and 2, and then we reacquire the lock after pass 3 after we
> > merge the pages_to_free back into the original list. As such we only have
> > to release the lock at most once in an entire call instead of having to
> > test to see if we need to relock with each page.
> >
> > Signed-off-by: Alexander Duyck 
> > ---
> >  mm/vmscan.c |   68 
> > ++-
> >  1 file changed, 39 insertions(+), 29 deletions(-)
> >
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 3ebe3f9b653b..6a2bdbc1a9eb 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -1850,22 +1850,21 @@ static unsigned noinline_for_stack 
> > move_pages_to_lru(struct lruvec *lruvec,
> >  {
> >   int nr_pages, nr_moved = 0;
> >   LIST_HEAD(pages_to_free);
> > - struct page *page;
> > - struct lruvec *orig_lruvec = lruvec;
> > + struct page *page, *next;
> >   enum lru_list lru;
> >
> > - while (!list_empty(list)) {
> > - page = lru_to_page(list);
> > + list_for_each_entry_safe(page, next, list, lru) {
> >   VM_BUG_ON_PAGE(PageLRU(page), page);
> > - list_del(&page->lru);
> > - if (unlikely(!page_evictable(page))) {
> > - if (lruvec) {
> > - spin_unlock_irq(&lruvec->lru_lock);
> > - lruvec = NULL;
> > - }
> > - putback_lru_page(page);
> > +
> > + /*
> > +  * if page is unevictable leave it on the list to be returned
> > +  * to the LRU after we have finished processing the other
> > +  * entries in the list.
> > +  */
> > + if (unlikely(!page_evictable(page)))
> >   continue;
> > - }
> > +
> > + list_del(&page->lru);
> >
> >   /*
> >* The SetPageLRU needs to be kept here for list intergrity.
> > @@ -1878,20 +1877,14 @@ static unsigned noinline_for_stack 
> > move_pages_to_lru(struct lruvec *lruvec,
> >* list_add(&page->lru,)
> >*list_add(&page->lru,)
> >*/
> > - lruvec = relock_page_lruvec_irq(page, lruvec);
>
> It's actually changed the meaning from current func. which I had seen a bug 
> if no relock.
> but after move to 5.9 kernel, I can not reprodce the bug any more. I am not 
> sure if 5.9 fixed
> the problem, and we don't need relock here.

So I am not sure what you mean here about "changed the meaning from
the current func". Which function are you referring to and what
changed?

From what I can tell the pages cannot change memcg because they were
isolated and had the LRU flag stripped. They shouldn't be able to
change destination LRU vector as a result. Assuming that, then they
can all be processed under same LRU lock and we can avoid having to
release it until we are forced to do so to call putback_lru_page or
destroy the compound pages that were freed while we were shrinking the
LRU lists.

> For the rest of this patch.
> Reviewed-by: Alex Shi 

Thanks for the review.

- Alex


[RFC PATCH v2 1/5] mm: Identify compound pages sooner in isolate_migratepages_block

2020-08-18 Thread Alexander Duyck
From: Alexander Duyck 

Since we are holding a reference to the page much sooner in
isolate_migratepages_block we can move the PageCompound check out of the
LRU locked section and instead just place it after get_page_unless_zero. By
doing this we can allow any of the items that might trigger a failure to
trigger a failure for the compound page rather than the order 0 page and as
a result we should be able to process the pageblock faster.

In addition by testing for PageCompound sooner we can avoid having the LRU
flag cleared and then reset in the exception case. As a result this should
prevent possible races where another thread might be attempting to pull the
LRU pages from the list.

Signed-off-by: Alexander Duyck 
---
 mm/compaction.c |   33 ++---
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index d3f87f759773..88c7b950f676 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -984,6 +984,24 @@ static bool too_many_isolated(pg_data_t *pgdat)
if (unlikely(!get_page_unless_zero(page)))
goto isolate_fail;
 
+   /*
+* Page is compound. We know the order before we know if it is
+* on the LRU so we cannot assume it is THP. However since the
+* page will have the LRU validated shortly we can use the value
+* to skip over this page for now or validate the LRU is set and
+* then isolate the entire compound page if we are isolating to
+* generate a CMA page.
+*/
+   if (PageCompound(page)) {
+   const unsigned int order = compound_order(page);
+
+   if (likely(order < MAX_ORDER))
+   low_pfn += (1UL << order) - 1;
+
+   if (!cc->alloc_contig)
+   goto isolate_fail_put;
+   }
+
if (__isolate_lru_page_prepare(page, isolate_mode) != 0)
goto isolate_fail_put;
 
@@ -1009,23 +1027,8 @@ static bool too_many_isolated(pg_data_t *pgdat)
if (test_and_set_skip(cc, page, low_pfn))
goto isolate_abort;
}
-
-   /*
-* Page become compound since the non-locked check,
-* and it's on LRU. It can only be a THP so the order
-* is safe to read and it's 0 for tail pages.
-*/
-   if (unlikely(PageCompound(page) && !cc->alloc_contig)) {
-   low_pfn += compound_nr(page) - 1;
-   SetPageLRU(page);
-   goto isolate_fail_put;
-   }
}
 
-   /* The whole page is taken off the LRU; skip the tail pages. */
-   if (PageCompound(page))
-   low_pfn += compound_nr(page) - 1;
-
/* Successfully isolated */
del_page_from_lru_list(page, lruvec, page_lru(page));
mod_node_page_state(page_pgdat(page),



[RFC PATCH v2 0/5] Minor cleanups and performance optimizations for LRU rework

2020-08-18 Thread Alexander Duyck
So this patch set addresses a few minor issues I have found and is based on
the lrunext branch of the tree at:
https://github.com/alexshi/linux.git

The first three patches address various issues I found with the patch set,
such as the fact that we were skipping non-LRU compound pages one 4K page
at a time, that test_and_set_skip had been made redundant because the LRU
bit makes the setting of the skip bit exclusive per pageblock, and that we
were using put_page while holding the LRU lock.

The last two patches are some patches I have been experimenting with.
Basically trying to reduce the number of times the LRU lock has to be
released and reacquired by batching LRU work together, or deferring the
freeing/returning of pages to LRU in the case of move_pages_to_lru. I am
still working on generating data but for the fourth patch I have seen an
improvement of about 5% on the will-it-scale/page_fault2 test with THP
enabled by default, however that is just some preliminary data and I still
have a number of tests left to run.

---

Alexander Duyck (5):
  mm: Identify compound pages sooner in isolate_migratepages_block
  mm: Drop use of test_and_set_skip in favor of just setting skip
  mm: Add explicit page decrement in exception path for isolate_lru_pages
  mm: Split release_pages work into 3 passes
  mm: Split move_pages_to_lru into 3 separate passes


 mm/compaction.c |   84 +++---
 mm/swap.c   |  109 ++-
 mm/vmscan.c |   77 +++
 3 files changed, 142 insertions(+), 128 deletions(-)

--


[RFC PATCH v2 5/5] mm: Split move_pages_to_lru into 3 separate passes

2020-08-18 Thread Alexander Duyck
From: Alexander Duyck 

The current code for move_pages_to_lru is meant to release the LRU lock
every time it encounters an unevictable page or a compound page that must
be freed. This results in a fair amount of code bulk because the lruvec has
to be reacquired every time the lock is released and reacquired.

Instead of doing this I believe we can break the code up into 3 passes. The
first pass will identify the pages we can move to LRU and move those. In
addition it will sort the list out leaving the unevictable pages in the
list and moving those pages that have dropped to a reference count of 0 to
pages_to_free. The second pass will return the unevictable pages to the
LRU. The final pass will free any compound pages we have in the
pages_to_free list before we merge it back with the original list and
return from the function.

The advantage of doing it this way is that we only have to release the lock
between pass 1 and 2, and then we reacquire the lock after pass 3 after we
merge the pages_to_free back into the original list. As such we only have
to release the lock at most once in an entire call instead of having to
test to see if we need to relock with each page.

Signed-off-by: Alexander Duyck 
---
 mm/vmscan.c |   68 ++-
 1 file changed, 39 insertions(+), 29 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3ebe3f9b653b..6a2bdbc1a9eb 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1850,22 +1850,21 @@ static unsigned noinline_for_stack 
move_pages_to_lru(struct lruvec *lruvec,
 {
int nr_pages, nr_moved = 0;
LIST_HEAD(pages_to_free);
-   struct page *page;
-   struct lruvec *orig_lruvec = lruvec;
+   struct page *page, *next;
enum lru_list lru;
 
-   while (!list_empty(list)) {
-   page = lru_to_page(list);
+   list_for_each_entry_safe(page, next, list, lru) {
VM_BUG_ON_PAGE(PageLRU(page), page);
-   list_del(&page->lru);
-   if (unlikely(!page_evictable(page))) {
-   if (lruvec) {
-   spin_unlock_irq(&lruvec->lru_lock);
-   lruvec = NULL;
-   }
-   putback_lru_page(page);
+
+   /*
+* if page is unevictable leave it on the list to be returned
+* to the LRU after we have finished processing the other
+* entries in the list.
+*/
+   if (unlikely(!page_evictable(page)))
continue;
-   }
+
+   list_del(&page->lru);
 
/*
 * The SetPageLRU needs to be kept here for list intergrity.
@@ -1878,20 +1877,14 @@ static unsigned noinline_for_stack 
move_pages_to_lru(struct lruvec *lruvec,
 * list_add(&page->lru,)
 *list_add(&page->lru,)
 */
-   lruvec = relock_page_lruvec_irq(page, lruvec);
SetPageLRU(page);
 
if (unlikely(put_page_testzero(page))) {
__ClearPageLRU(page);
__ClearPageActive(page);
 
-   if (unlikely(PageCompound(page))) {
-   spin_unlock_irq(>lru_lock);
-   lruvec = NULL;
-   destroy_compound_page(page);
-   } else
-   list_add(&page->lru, &pages_to_free);
-
+   /* defer freeing until we can release lru_lock */
+   list_add(&page->lru, &pages_to_free);
continue;
}
 
@@ -1904,16 +1897,33 @@ static unsigned noinline_for_stack 
move_pages_to_lru(struct lruvec *lruvec,
if (PageActive(page))
workingset_age_nonresident(lruvec, nr_pages);
}
-   if (orig_lruvec != lruvec) {
-   if (lruvec)
-   spin_unlock_irq(&lruvec->lru_lock);
-   spin_lock_irq(&orig_lruvec->lru_lock);
-   }
 
-   /*
-* To save our caller's stack, now use input list for pages to free.
-*/
-   list_splice(&pages_to_free, list);
+   if (unlikely(!list_empty(list) || !list_empty(&pages_to_free))) {
+   spin_unlock_irq(&lruvec->lru_lock);
+
+   /* return any unevictable pages to the LRU list */
+   while (!list_empty(list)) {
+   page = lru_to_page(list);
+   list_del(&page->lru);
+   putback_lru_page(page);
+   }
+
+   /*
+* To save our caller's stack use input
+* list for pages to free.
+*/
+   list_splice(&pages_to_free, list);
+
+   /* free any compound pages we have in the list */
+   list_for_each_entry_safe(page, next, list, lru) 

[RFC PATCH v2 2/5] mm: Drop use of test_and_set_skip in favor of just setting skip

2020-08-18 Thread Alexander Duyck
From: Alexander Duyck 

The only user of test_and_set_skip was isolate_migratepages_block and it
was using it after a call that was testing and clearing the LRU flag. As
such it really didn't need to be behind the LRU lock anymore as it wasn't
really fulfilling its purpose.

Since it is only possible to be able to test and set the skip flag if we
were able to obtain the LRU bit for the first page in the pageblock the
use of the test_and_set_skip becomes redundant as the LRU flag now becomes
the item that limits us to only one thread being able to perform the
operation and there being no need for a test_and_set operation.

With that being the case we can simply drop the bit and instead directly
just call the set_pageblock_skip function if the page we are working on is
the valid_page at the start of the pageblock. Then any other threads that
enter this pageblock should see the skip bit set on the first valid page in
the pageblock.

Since we have dropped the late abort case we can drop the code that was
clearing the LRU flag and calling put_page, since the abort case will no
longer be holding a reference to a page.

Signed-off-by: Alexander Duyck 
---
 mm/compaction.c |   53 +
 1 file changed, 13 insertions(+), 40 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 88c7b950f676..f986c67e83cc 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -399,29 +399,6 @@ void reset_isolation_suitable(pg_data_t *pgdat)
}
 }
 
-/*
- * Sets the pageblock skip bit if it was clear. Note that this is a hint as
- * locks are not required for read/writers. Returns true if it was already set.
- */
-static bool test_and_set_skip(struct compact_control *cc, struct page *page,
-   unsigned long pfn)
-{
-   bool skip;
-
-   /* Do no update if skip hint is being ignored */
-   if (cc->ignore_skip_hint)
-   return false;
-
-   if (!IS_ALIGNED(pfn, pageblock_nr_pages))
-   return false;
-
-   skip = get_pageblock_skip(page);
-   if (!skip && !cc->no_set_skip_hint)
-   set_pageblock_skip(page);
-
-   return skip;
-}
-
 static void update_cached_migrate(struct compact_control *cc, unsigned long 
pfn)
 {
struct zone *zone = cc->zone;
@@ -480,12 +457,6 @@ static inline void update_pageblock_skip(struct 
compact_control *cc,
 static void update_cached_migrate(struct compact_control *cc, unsigned long 
pfn)
 {
 }
-
-static bool test_and_set_skip(struct compact_control *cc, struct page *page,
-   unsigned long pfn)
-{
-   return false;
-}
 #endif /* CONFIG_COMPACTION */
 
 /*
@@ -895,7 +866,6 @@ static bool too_many_isolated(pg_data_t *pgdat)
if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) {
if (!cc->ignore_skip_hint && get_pageblock_skip(page)) {
low_pfn = end_pfn;
-   page = NULL;
goto isolate_abort;
}
valid_page = page;
@@ -1021,11 +991,20 @@ static bool too_many_isolated(pg_data_t *pgdat)
 
lruvec_memcg_debug(lruvec, page);
 
-   /* Try get exclusive access under lock */
-   if (!skip_updated) {
+   /*
+* Indicate that we want exclusive access to the
+* rest of the pageblock.
+*
+* The LRU flag prevents simultaneous access to the
+* first PFN, and the LRU lock helps to prevent
+* simultaneous update of multiple pageblocks shared
+* in the same bitmap.
+*/
+   if (page == valid_page) {
+   if (!cc->ignore_skip_hint &&
+   !cc->no_set_skip_hint)
+   set_pageblock_skip(page);
skip_updated = true;
-   if (test_and_set_skip(cc, page, low_pfn))
-   goto isolate_abort;
}
}
 
@@ -1098,15 +1077,9 @@ static bool too_many_isolated(pg_data_t *pgdat)
if (unlikely(low_pfn > end_pfn))
low_pfn = end_pfn;
 
-   page = NULL;
-
 isolate_abort:
if (lruvec)
unlock_page_lruvec_irqrestore(lruvec, flags);
-   if (page) {
-   SetPageLRU(page);
-   put_page(page);
-   }
 
/*
 * Updated the cached scanner pfn once the pageblock has been scanned



[RFC PATCH v2 3/5] mm: Add explicit page decrement in exception path for isolate_lru_pages

2020-08-18 Thread Alexander Duyck
From: Alexander Duyck 

In isolate_lru_pages we have an exception path where if we call
get_page_unless_zero and that succeeds, but TestClearPageLRU fails we call
put_page. Normally this would be problematic but due to the way that the
calls are ordered and the fact that we are holding the LRU lock we know
that the caller must be holding another reference for the page. Since we
can assume that, we can replace the put_page with a call to
put_page_testzero contained within a WARN_ON. By doing this we should see
if we ever leak a page as a result of the reference count somehow hitting
zero when it shouldn't, and can avoid the overhead and confusion of using
the full put_page call.

Signed-off-by: Alexander Duyck 
---
 mm/vmscan.c |9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5bc0c2322043..3ebe3f9b653b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1688,10 +1688,13 @@ static unsigned long isolate_lru_pages(unsigned long 
nr_to_scan,
 
if (!TestClearPageLRU(page)) {
/*
-* This page may in other isolation path,
-* but we still hold lru_lock.
+* This page is being isolated in another
+* thread, but we still hold lru_lock. The
+* other thread must be holding a reference
+* to the page so this should never hit a
+* reference count of 0.
 */
-   put_page(page);
+   WARN_ON(put_page_testzero(page));
goto busy;
}
 



[RFC PATCH v2 4/5] mm: Split release_pages work into 3 passes

2020-08-18 Thread Alexander Duyck
From: Alexander Duyck 

The release_pages function has a number of paths that end up with the
LRU lock having to be released and reacquired. Such an example would be the
freeing of THP pages as it requires releasing the LRU lock so that it can
be potentially reacquired by __put_compound_page.

In order to avoid that we can split the work into 3 passes, the first
without the LRU lock to go through and sort out those pages that are not in
the LRU so they can be freed immediately from those that can't. The second
pass will then go through removing those pages from the LRU in batches as
large as a pagevec can hold before freeing the LRU lock. Once the pages have
been removed from the LRU we can then proceed to free the remaining pages
without needing to worry about if they are in the LRU any further.

The general idea is to avoid bouncing the LRU lock between pages and to
hopefully aggregate the lock for up to the full page vector worth of pages.

Signed-off-by: Alexander Duyck 
---
 mm/swap.c |  109 +
 1 file changed, 67 insertions(+), 42 deletions(-)

diff --git a/mm/swap.c b/mm/swap.c
index fe53449fa1b8..b405f81b2c60 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -795,6 +795,54 @@ void lru_add_drain_all(void)
 }
 #endif
 
+static void __release_page(struct page *page, struct list_head *pages_to_free)
+{
+   if (PageCompound(page)) {
+   __put_compound_page(page);
+   } else {
+   /* Clear Active bit in case of parallel mark_page_accessed */
+   __ClearPageActive(page);
+   __ClearPageWaiters(page);
+
+   list_add(&page->lru, pages_to_free);
+   }
+}
+
+static void __release_lru_pages(struct pagevec *pvec,
+   struct list_head *pages_to_free)
+{
+   struct lruvec *lruvec = NULL;
+   unsigned long flags = 0;
+   int i;
+
+   /*
+* The pagevec at this point should contain a set of pages with
+* their reference count at 0 and the LRU flag set. We will now
+* need to pull the pages from their LRU lists.
+*
+* We walk the list backwards here since that way we are starting at
+* the pages that should be warmest in the cache.
+*/
+   for (i = pagevec_count(pvec); i--;) {
+   struct page *page = pvec->pages[i];
+
+   lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
+   VM_BUG_ON_PAGE(!PageLRU(page), page);
+   __ClearPageLRU(page);
+   del_page_from_lru_list(page, lruvec, page_off_lru(page));
+   }
+
+   unlock_page_lruvec_irqrestore(lruvec, flags);
+
+   /*
+* A batch of pages are no longer on the LRU list. Go through and
+* start the final process of returning the deferred pages to their
+* appropriate freelists.
+*/
+   for (i = pagevec_count(pvec); i--;)
+   __release_page(pvec->pages[i], pages_to_free);
+}
+
 /**
  * release_pages - batched put_page()
  * @pages: array of pages to release
@@ -806,32 +854,24 @@ void lru_add_drain_all(void)
 void release_pages(struct page **pages, int nr)
 {
int i;
+   struct pagevec pvec;
LIST_HEAD(pages_to_free);
-   struct lruvec *lruvec = NULL;
-   unsigned long flags;
-   unsigned int lock_batch;
 
+   pagevec_init(&pvec);
+
+   /*
+* We need to first walk through the list cleaning up the low hanging
+* fruit and clearing those pages that either cannot be freed or that
+* are non-LRU. We will store the LRU pages in a pagevec so that we
+* can get to them in the next pass.
+*/
for (i = 0; i < nr; i++) {
struct page *page = pages[i];
 
-   /*
-* Make sure the IRQ-safe lock-holding time does not get
-* excessive with a continuous string of pages from the
-* same lruvec. The lock is held only if lruvec != NULL.
-*/
-   if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) {
-   unlock_page_lruvec_irqrestore(lruvec, flags);
-   lruvec = NULL;
-   }
-
if (is_huge_zero_page(page))
continue;
 
if (is_zone_device_page(page)) {
-   if (lruvec) {
-   unlock_page_lruvec_irqrestore(lruvec, flags);
-   lruvec = NULL;
-   }
/*
 * ZONE_DEVICE pages that return 'false' from
 * put_devmap_managed_page() do not require special
@@ -848,36 +888,21 @@ void release_pages(struct page **pages, int nr)
if (!put_page_testzero(page))
continue;
 
-   if (PageCompound(page)) {
- 

Re: [PATCH] mm/page_reporting: the "page" must not be the list head

2020-08-18 Thread Alexander Duyck
On Mon, Aug 17, 2020 at 8:22 PM Wei Yang
 wrote:
>
> On Mon, Aug 17, 2020 at 09:05:32AM -0700, Alexander Duyck wrote:
> >
> >
> >On 8/17/2020 2:35 AM, David Hildenbrand wrote:
> >> On 17.08.20 10:48, Wei Yang wrote:
> >> > If "page" is the list head, list_for_each_entry_safe() would stop
> >> > iteration.
> >> >
> >> > Signed-off-by: Wei Yang 
> >> > ---
> >> >   mm/page_reporting.c | 2 +-
> >> >   1 file changed, 1 insertion(+), 1 deletion(-)
> >> >
> >> > diff --git a/mm/page_reporting.c b/mm/page_reporting.c
> >> > index 3bbd471cfc81..3605123d 100644
> >> > --- a/mm/page_reporting.c
> >> > +++ b/mm/page_reporting.c
> >> > @@ -178,7 +178,7 @@ page_reporting_cycle(struct page_reporting_dev_info 
> >> > *prdev, struct zone *zone,
> >> > * the new head of the free list before we release the
> >> > * zone lock.
> >> > */
> >> > -  if (&page->lru != list && !list_is_first(&page->lru, list))
> >> > +  if (!list_is_first(&page->lru, list))
> >> >list_rotate_to_front(&page->lru, list);
> >> >/* release lock before waiting on report processing */
> >> >
> >>
> >> Is this a fix or a cleanup? If it's a fix, can this be reproduced easily
> >> and what ere the effects?
> >>
> >
> >This should be a clean-up. Since the &page->lru != list will always be true.
> >
> >If I recall at some point that was a check for &page->lru != list but I
> >think I pulled out an additional conditional check somewhere so that we just
> >go through the start of the loop again and iterate over reported pages until
> >we are guaranteed to have a non-reported page to rotate to the top of the
> >list with the general idea being that we wanted the allocator to pull
> >non-reported pages before reported pages.
>
> Hi, Alexander,
>
> I see you mentioned in the changelog, this change "mm/page_reporting: rotate
> reported pages to the tail of the list" brings some performance gain.
>
> Would you mind sharing more test detail? I would like to have a try at my
> side.
>
> Thanks :-)

I seem to recall my default test for most of this was the page_fault1
test from the will-it-scale suite of tests. Basically I was running
that while leaving page reporting enabled. However I don't know how
much visibility you would have into the performance impact as I seem
to recall I had to modify the frequency of scheduling for the
reporting polling task in order to see much of an impact.

Thanks.

- Alex


Re: [Patch v2] mm/page_reporting: drop stale list head check in page_reporting_cycle

2020-08-18 Thread Alexander Duyck
On Tue, Aug 18, 2020 at 1:45 AM Wei Yang
 wrote:
>
> list_for_each_entry_safe() guarantees that we will never stumble over
> the list head; "&page->lru != list" will always evaluate to true. Let's
> simplify.
>
> Signed-off-by: Wei Yang 
> Reviewed-by: David Hildenbrand 
> ---
>  mm/page_reporting.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/mm/page_reporting.c b/mm/page_reporting.c
> index 3bbd471cfc81..3605123d 100644
> --- a/mm/page_reporting.c
> +++ b/mm/page_reporting.c
> @@ -178,7 +178,7 @@ page_reporting_cycle(struct page_reporting_dev_info 
> *prdev, struct zone *zone,
>  * the new head of the free list before we release the
>  * zone lock.
>  */
> -   if (&page->lru != list && !list_is_first(&page->lru, list))
> +   if (!list_is_first(&page->lru, list))
> list_rotate_to_front(&page->lru, list);
>
> /* release lock before waiting on report processing */

Reviewed-by: Alexander Duyck 


Re: [PATCH v17 14/21] mm/compaction: do page isolation first in compaction

2020-08-17 Thread Alexander Duyck
> @@ -1691,17 +1680,34 @@ static unsigned long isolate_lru_pages(unsigned long 
> nr_to_scan,
>  * only when the page is being freed somewhere else.
>  */
> scan += nr_pages;
> -   switch (__isolate_lru_page(page, mode)) {
> +   switch (__isolate_lru_page_prepare(page, mode)) {
> case 0:
> +   /*
> +* Be careful not to clear PageLRU until after we're
> +* sure the page is not being freed elsewhere -- the
> +* page release code relies on it.
> +*/
> +   if (unlikely(!get_page_unless_zero(page)))
> +   goto busy;
> +
> +   if (!TestClearPageLRU(page)) {
> +   /*
> +* This page may in other isolation path,
> +* but we still hold lru_lock.
> +*/
> +   put_page(page);
> +   goto busy;
> +   }
> +

So I was reviewing the code and came across this piece. It has me a
bit concerned since we are calling put_page while holding the LRU lock,
which was taken before calling the function. We should be fine in
terms of not encountering a deadlock since, with the LRU bit cleared,
put_page shouldn't grab the LRU lock again; however, we could end up
grabbing the zone lock while holding the LRU lock, which would be an
issue.

One other thought I had is that this might be safe because the
assumption would be that another thread is holding a reference on the
page, has already called TestClearPageLRU on the page and retrieved
the LRU bit, and is waiting on us to release the LRU lock before it
can pull the page off of the list. In that case put_page will never
decrement the reference count to 0. I believe that is the current case
but I cannot be certain.

I'm just wondering if we should just replace the put_page(page) with a
WARN_ON(put_page_testzero(page)) and a bit more documentation. If I am
not mistaken it should never be possible for the reference count to
actually hit zero here.

Thanks.

- Alex


Re: [RFC PATCH 2/3] mm: Drop use of test_and_set_skip in favor of just setting skip

2020-08-17 Thread Alexander Duyck
On Sat, Aug 15, 2020 at 2:51 AM Alex Shi  wrote:
>
>
>
> 在 2020/8/15 上午5:15, Alexander Duyck 写道:
> > On Fri, Aug 14, 2020 at 7:24 AM Alexander Duyck
> >  wrote:
> >>
> >> On Fri, Aug 14, 2020 at 12:19 AM Alex Shi  
> >> wrote:
> >>>
> >>>
> >>>
> >>> 在 2020/8/13 下午12:02, Alexander Duyck 写道:
> >>>>
> >>>> Since we have dropped the late abort case we can drop the code that was
> >>>> clearing the LRU flag and calling page_put since the abort case will now
> >>>> not be holding a reference to a page.
> >>>>
> >>>> Signed-off-by: Alexander Duyck 
> >>>
> >>> seems the case-lru-file-mmap-read case drop about 3% on this patch in a 
> >>> rough testing.
> >>> on my 80 core machine.
> >>
> >> I'm not sure how it could have that much impact on the performance
> >> since the total effect would just be dropping what should be a
> >> redundant test since we tested the skip bit before we took the LRU
> >> bit, so we shouldn't need to test it again after.
> >>
> >> I finally got my test setup working last night. I'll have to do some
> >> testing in my environment and I can start trying to see what is going
> >> on.
> >
> > So I ran the case-lru-file-mmap-read a few times and I don't see how
> > it is supposed to be testing the compaction code. It doesn't seem like
> > compaction is running at least on my system as a result of the test
> > script.
>
> atteched my kernel config, it is used on mine machine,

I'm just wondering what the margin of error is on the tests you are
running. What is the variance between runs? I'm wondering if 3%
falls within the range of noise or of possible changes due to just
code shifting around.

In order for the code to have shown any change it needs to be run and
I didn't see the tests triggering compaction on my test system. I'm
wondering how much memory you have available in the system you were
testing on that the test was enough to trigger compaction?

> > I wonder if testing this code wouldn't be better done using
> > something like thpscale from the
> > mmtests(https://github.com/gormanm/mmtests)? It seems past changes to
> > the compaction code were tested using that, and the config script for
> > the test explains that it is designed specifically to stress the
> > compaction code. I have the test up and running now and hope to
> > collect results over the weekend.
>
> I did the testing, but the awkward part is that I failed to get a result,
> maybe I am missing some packages.

So one thing I noticed is that if you have over 128GB of memory in the
system it will fail unless you update the sysctl value
vm.max_map_count. It defaulted to somewhere close to 64K, and I
increased it 20X to 1280K in order for the test to run without failing
on the mmap calls. The other edit I had to make was to the config file,
as the test system I was on had about 1TB of RAM and my home partition
only had about 800GB to spare, so I had to reduce the map size from
8/10 to 5/8.

> # ../../compare-kernels.sh
>
> thpscale Fault Latencies
> Can't locate List/BinarySearch.pm in @INC (@INC contains: 
> /root/mmtests/bin/lib /usr/local/lib64/perl5 /usr/local/share/perl5 
> /usr/lib64/perl5/vendor_perl /usr/share/perl5/vend.
> BEGIN failed--compilation aborted at /root/mmtests/bin/lib/MMTests/Stat.pm 
> line 13.
> Compilation failed in require at 
> /root/mmtests/work/log/../../bin/compare-mmtests.pl line 13.
> BEGIN failed--compilation aborted at 
> /root/mmtests/work/log/../../bin/compare-mmtests.pl line 13.

I had to install List::BinarySearch.pm. It required installing the
cpan perl libraries.

> >
> > There is one change I will probably make to this patch and that is to
> > place the new code that is setting skip_updated where the old code was
> > calling test_and_set_skip_bit. By doing that we can avoid extra checks
> > and it should help to reduce possible collisions when setting the skip
> > bit in the pageblock flags.
>
> the problem may be in the cmpxchg on the pb flags, which may involve changes to other blocks.

That is the only thing I can think of just based on code review,
although that would imply multiple compaction threads are running. As I
said, in my tests I never saw kcompactd wake up, so I don't think the
tests you were mentioning were enough to stress compaction.


Re: [PATCH] mm/page_reporting: the "page" must not be the list head

2020-08-17 Thread Alexander Duyck




On 8/17/2020 2:35 AM, David Hildenbrand wrote:

On 17.08.20 10:48, Wei Yang wrote:

If "page" is the list head, list_for_each_entry_safe() would stop
iteration.

Signed-off-by: Wei Yang 
---
  mm/page_reporting.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index 3bbd471cfc81..3605123d 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -178,7 +178,7 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, 
struct zone *zone,
 * the new head of the free list before we release the
 * zone lock.
 */
-   if (&page->lru != list && !list_is_first(&page->lru, list))
+   if (!list_is_first(&page->lru, list))
list_rotate_to_front(&page->lru, list);
  
  		/* release lock before waiting on report processing */




Is this a fix or a cleanup? If it's a fix, can this be reproduced easily
and what are the effects?



This should be a clean-up, since the &page->lru != list check will always be true.

If I recall, at some point there was a check for &page->lru != list,
but I think I pulled out an additional conditional check somewhere so
that we just go back through the start of the loop and iterate over
reported pages until we are guaranteed to have a non-reported page to
rotate to the top of the list. The general idea was that we wanted
the allocator to pull non-reported pages before reported pages.
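
To make the point about the redundant check concrete, here is a minimal
sketch of the loop shape (not the actual mm/page_reporting.c code; the
budget and reported-page handling are elided): list_for_each_entry_safe()
only ever hands us real entries, never the list head itself, so a
&page->lru != list test in the body can never be false and only the
list_is_first() check matters.

        struct page *page, *next;

        list_for_each_entry_safe(page, next, list, lru) {
                /* budget and reported-page handling elided */

                /* "&page->lru != list" would always be true here */
                if (!list_is_first(&page->lru, list))
                        list_rotate_to_front(&page->lru, list);
        }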




Re: [PATCH 2/2] mm/pageblock: remove false sharing in pageblock_flags

2020-08-16 Thread Alexander Duyck
On Sun, Aug 16, 2020 at 7:11 AM Alex Shi  wrote:
>
>
>
> > On 2020/8/16 12:17 PM, Matthew Wilcox wrote:
> > On Sun, Aug 16, 2020 at 11:47:57AM +0800, Alex Shi wrote:
> >> Currently pageblock_flags is only 4 bits, so it has to share a char
> >> in the cmpxchg when being set, and the false sharing causes a perf drop.
> >>
> >> If we increase the bits up to 8, the false sharing would be gone in cmpxchg,
> >> and the only cost is half a char per pageblock, which is half a char per
> >> 128MB on x86, 4 chars in 1 GB.
> >
> > I don't believe this patch has that effect, mostly because it still does
> > cmpxchg() on words instead of bytes.
>
> Hi Matthew,
>
> Thank a lot for comments!
>
> Sorry, I must have overlooked something. Would you point out why the cmpxchg is
> still on words after patch 1 is applied?
>

I would take it one step further. You still have false sharing, as the
pageblock bits still occupy the same cacheline, so you are going to see
that cacheline bouncing between CPUs regardless.

What it seems like you are attempting to address is the fact that
multiple threads could all be attempting to update the same long
value. As I pointed out, for the migrate type that seems to be protected
by the zone lock, but for compaction the skip bit doesn't have the
same protection, as there are some threads using the zone lock and
others using the LRU lock. I'm still not sure it makes much of a
difference though.
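
For reference, the update is roughly the following pattern (a simplified
sketch of what set_pfnblock_flags_mask() does; the bitmap lookup and
bit-index math are elided and the helper name here is made up), which is
why aligning each block's bits to a byte boundary doesn't change what the
cmpxchg, and the cacheline it lives in, actually covers:

        static void update_block_flags(unsigned long *bitmap,
                                       unsigned long word_idx,
                                       unsigned long mask,
                                       unsigned long flags)
        {
                unsigned long old_word, word = READ_ONCE(bitmap[word_idx]);

                /* Read-modify-write retry loop on the whole word */
                for (;;) {
                        old_word = cmpxchg(&bitmap[word_idx], word,
                                           (word & ~mask) | flags);
                        if (word == old_word)
                                break;
                        word = old_word;
                }
        }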

> >
> > But which functions would benefit?  It seems to me this cmpxchg() is
> > only called from the set_pageblock_migratetype() morass of functions,
> > none of which are called in hot paths as far as I can make out.
> >
> > So are you just reasoning by analogy with the previous patch where you
> > have measured a performance improvement, or did you send the wrong patch,
> > or did I overlook a hot path that calls one of the pageblock migration
> > functions?
> >
>
> Uh, I am reading compaction.c and found the following commit introduced
> test_and_set_skip under a lock. It looks like the pageblock_flags setting
> has false sharing in cmpxchg, but I have no valid data on this yet.
>
> Thanks
> Alex
>
> e380bebe4771548  mm, compaction: keep migration source private to a single 
> compaction instance
>
> if (!locked) {
> locked = compact_trylock_irqsave(zone_lru_lock(zone),
> &flags, cc);
> -   if (!locked)
> +
> +   /* Allow future scanning if the lock is contended */
> +   if (!locked) {
> +   clear_pageblock_skip(page);
> break;
> +   }
> +
> +   /* Try get exclusive access under lock */
> +   if (!skip_updated) {
> +   skip_updated = true;
> +   if (test_and_set_skip(cc, page, low_pfn))
> +   goto isolate_abort;
> +   }
>

I'm not sure that is good grounds for doubling the size of the
pageblock flags. If you look further down in the code there are places
that set these bits without taking the lock. The assumption here is
that by taking the lock the test_and_set_skip will be performed
atomically, since another thread cannot perform it while the zone lock
is held. If you look at the function itself, it only does anything if
the skip bits are being checked and the page is the first page in the
pageblock.

I think you might be confusing some of my earlier comments. I still
believe the 3% regression you reported with my patch is not directly
related to the test_and_set_skip, as the test you ran seems unlikely to
trigger compaction. With that said, one of the advantages of using the
locked section to perform these types of tests is that it reduces the
number of times the test is run, since it only runs on the first
unlocked page in any batch of pages, and the first page in the
pageblock is always going to be handled without the lock held since it
is the first page processed.

Until we can get a test up such as thpscale that does a good job of
stressing the compaction code I don't think we can rely on just
observations to say if this is an improvement or not.

Thanks.

- Alex


Re: [RFC PATCH 2/3] mm: Drop use of test_and_set_skip in favor of just setting skip

2020-08-14 Thread Alexander Duyck
On Fri, Aug 14, 2020 at 7:24 AM Alexander Duyck
 wrote:
>
> On Fri, Aug 14, 2020 at 12:19 AM Alex Shi  wrote:
> >
> >
> >
> > On 2020/8/13 12:02 PM, Alexander Duyck wrote:
> > >
> > > Since we have dropped the late abort case we can drop the code that was
> > > clearing the LRU flag and calling put_page since the abort case will now
> > > not be holding a reference to a page.
> > >
> > > Signed-off-by: Alexander Duyck 
> >
> > seems the case-lru-file-mmap-read case dropped about 3% on this patch in
> > rough testing
> > on my 80 core machine.
>
> I'm not sure how it could have that much impact on the performance
> since the total effect would just be dropping what should be a
> redundant test since we tested the skip bit before we took the LRU
> bit, so we shouldn't need to test it again after.
>
> I finally got my test setup working last night. I'll have to do some
> testing in my environment and I can start trying to see what is going
> on.

So I ran the case-lru-file-mmap-read a few times and I don't see how
it is supposed to be testing the compaction code. It doesn't seem like
compaction is running at least on my system as a result of the test
script. I wonder if testing this code wouldn't be better done using
something like thpscale from the
mmtests(https://github.com/gormanm/mmtests)? It seems past changes to
the compaction code were tested using that, and the config script for
the test explains that it is designed specifically to stress the
compaction code. I have the test up and running now and hope to
collect results over the weekend.

There is one change I will probably make to this patch and that is to
place the new code that is setting skip_updated where the old code was
calling test_and_set_skip_bit. By doing that we can avoid extra checks
and it should help to reduce possible collisions when setting the skip
bit in the pageblock flags.

Thanks.

- Alex


Re: [RFC PATCH 2/3] mm: Drop use of test_and_set_skip in favor of just setting skip

2020-08-14 Thread Alexander Duyck
On Fri, Aug 14, 2020 at 12:19 AM Alex Shi  wrote:
>
>
>
> On 2020/8/13 12:02 PM, Alexander Duyck wrote:
> >
> > Since we have dropped the late abort case we can drop the code that was
> > clearing the LRU flag and calling put_page since the abort case will now
> > not be holding a reference to a page.
> >
> > Signed-off-by: Alexander Duyck 
>
> seems the case-lru-file-mmap-read case dropped about 3% on this patch in rough
> testing
> on my 80 core machine.

I'm not sure how it could have that much impact on the performance
since the total effect would just be dropping what should be a
redundant test since we tested the skip bit before we took the LRU
bit, so we shouldn't need to test it again after.

I finally got my test setup working last night. I'll have to do some
testing in my environment and I can start trying to see what is going
on.

Thanks.

- Alex


Re: [RFC PATCH 1/3] mm: Drop locked from isolate_migratepages_block

2020-08-13 Thread Alexander Duyck
On Wed, Aug 12, 2020 at 11:57 PM Alex Shi  wrote:
>
>
>
> On 2020/8/13 12:02 PM, Alexander Duyck wrote:
> > From: Alexander Duyck 
> >
> > We can drop the need for the locked variable by making use of the
> > lruvec_holds_page_lru_lock function. By doing this we can avoid some rcu
> > locking ugliness for the case where the lruvec is still holding the LRU
> > lock associated with the page. Instead we can just use the lruvec and if it
> > is NULL we assume the lock was released.
> >
> > Signed-off-by: Alexander Duyck 
> > ---
> >  mm/compaction.c |   45 -
> >  1 file changed, 20 insertions(+), 25 deletions(-)
>
> Thanks a lot!
> Don't know if the community is ok with keeping this patch on its own, following
> the whole patchset?

I am fine with you squashing it with another patch if you want. In
theory this could probably be squashed in with the earlier patch I
submitted that introduced lruvec_holds_page_lru_lock or some other
patch. It is mostly just a cleanup anyway as it gets us away from
needing to hold the RCU read lock in the case that we already have the
correct lruvec.


Re: [RFC PATCH 1/3] mm: Drop locked from isolate_migratepages_block

2020-08-13 Thread Alexander Duyck
On Thu, Aug 13, 2020 at 12:45 AM Alex Shi  wrote:
>
>
>
> On 2020/8/13 12:02 PM, Alexander Duyck wrote:
> > - rcu_read_lock();
> > - lruvec = mem_cgroup_page_lruvec(page, pgdat);
> > -
> >   /* If we already hold the lock, we can skip some rechecking */
> > - if (lruvec != locked) {
> > - if (locked)
> > - unlock_page_lruvec_irqrestore(locked, flags);
> > + if (!lruvec || !lruvec_holds_page_lru_lock(page, lruvec)) {
>
> Ops, lruvec_holds_page_lru_lock need rcu_read_lock.

How so? The reason I wrote lruvec_holds_page_lru_lock the way I did is
that it is simply comparing the pointers held by the page and the
lruvec. It is never actually accessing any of the values, just the
pointers. As such we should be able to compare the two since the
lruvec is still locked and the memcg and pgdat held by the lruvec
should not be changed. Likewise with the page pointers assuming the
values match.
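
For reference, the comparison I am describing is roughly the following
(a simplified sketch of the helper from the series, so the details may
differ a bit); nothing in it dereferences anything that rcu_read_lock()
would be protecting:

        static inline bool lruvec_holds_page_lru_lock(struct page *page,
                                                      struct lruvec *lruvec)
        {
                pg_data_t *pgdat = page_pgdat(page);
                const struct mem_cgroup *memcg;

                if (mem_cgroup_disabled())
                        return lruvec == &pgdat->__lruvec;

                /* Compare the pointers only, never dereference the memcg */
                memcg = page->mem_cgroup ? : root_mem_cgroup;
                return lruvec_pgdat(lruvec) == pgdat &&
                       lruvec_memcg(lruvec) == memcg;
        }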

> > + if (lruvec)
> > + unlock_page_lruvec_irqrestore(lruvec, flags);
> >
> > + lruvec = mem_cgroup_page_lruvec(page, pgdat);
> > + compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
> > - locked = lruvec;
> >   rcu_read_unlock();
> >
>
> and some bugs:
> [  534.564741] CPU: 23 PID: 545 Comm: kcompactd1 Kdump: loaded Tainted: G S   
>W 5.8.0-next-20200803-00028-g9a7ff2cd6e5c #85
> [  534.577320] Hardware name: Alibaba Alibaba Cloud ECS/Alibaba Cloud ECS, 
> BIOS 1.0.PL.IP.P.027.02 05/29/2020
> [  534.587693] Call Trace:
> [  534.590522]  dump_stack+0x96/0xd0
> [  534.594231]  ___might_sleep.cold.90+0xff/0x115
> [  534.599102]  kcompactd+0x24b/0x370
> [  534.602904]  ? finish_wait+0x80/0x80
> [  534.606897]  ? kcompactd_do_work+0x3d0/0x3d0
> [  534.611566]  kthread+0x14e/0x170
> [  534.615182]  ? kthread_park+0x80/0x80
> [  534.619252]  ret_from_fork+0x1f/0x30
> [  535.629483] BUG: sleeping function called from invalid context at 
> include/linux/freezer.h:57
> [  535.638691] in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 545, 
> name: kcompactd1
> [  535.647601] INFO: lockdep is turned off.

Ah, I see the bug now. It isn't lruvec_holds_page_lru_lock that needs
the RCU read lock. This is an issue caused by a merge conflict: there
should have been an rcu_read_lock added before the call to
mem_cgroup_page_lruvec.


[RFC PATCH 0/3] Re: [PATCH v17 14/21] mm/compaction: do page isolation first in compaction

2020-08-12 Thread Alexander Duyck
Here are the patches I had discussed earlier to address the issues in
isolate_migratepages_block.

They are based on the tree at:
 https://github.com/alexshi/linux.git lrunext 

The first patch is mostly cleanup to address the RCU locking in the
function. The second addresses the test_and_set_skip issue, and the third
relocates PageCompound.

I did some digging into the history of the skip bits, and since they
are only supposed to be a hint I thought we could probably just drop
the testing portion of the call. The LRU flag is preventing more than
one thread from accessing the function anyway, so it would make sense
to just switch it to a set operation, similar to what happens when
low_pfn == end_pfn at the end of the call.
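
Roughly, the shape of the change I have in mind is the following (a
sketch of the idea only, not the actual diff; it mirrors the existing
low_pfn == end_pfn handling at the end of the function):

                        /* Record the hint, but never abort based on it */
                        if (!skip_updated && valid_page &&
                            !cc->ignore_skip_hint) {
                                skip_updated = true;
                                set_pageblock_skip(valid_page);
                        }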

I have only had a chance to build test these since rebasing on the tree. In
addition I am not 100% certain the PageCompound changes are correct as they
operate on the assumption that get_page_unless_zero is enough to keep a
compound page from being split up. I plan on doing some testing tomorrow,
but thought I would push these out now so that we could discuss them.

---

Alexander Duyck (3):
  mm: Drop locked from isolate_migratepages_block
  mm: Drop use of test_and_set_skip in favor of just setting skip
  mm: Identify compound pages sooner in isolate_migratepages_block


 mm/compaction.c |  126 +++
 1 file changed, 44 insertions(+), 82 deletions(-)

--


[RFC PATCH 1/3] mm: Drop locked from isolate_migratepages_block

2020-08-12 Thread Alexander Duyck
From: Alexander Duyck 

We can drop the need for the locked variable by making use of the
lruvec_holds_page_lru_lock function. By doing this we can avoid some rcu
locking ugliness for the case where the lruvec is still holding the LRU
lock associated with the page. Instead we can just use the lruvec and if it
is NULL we assume the lock was released.

Signed-off-by: Alexander Duyck 
---
 mm/compaction.c |   45 -
 1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index b99c96c4862d..5021a18ef722 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -803,9 +803,8 @@ static bool too_many_isolated(pg_data_t *pgdat)
 {
pg_data_t *pgdat = cc->zone->zone_pgdat;
unsigned long nr_scanned = 0, nr_isolated = 0;
-   struct lruvec *lruvec;
+   struct lruvec *lruvec = NULL;
unsigned long flags = 0;
-   struct lruvec *locked = NULL;
struct page *page = NULL, *valid_page = NULL;
unsigned long start_pfn = low_pfn;
bool skip_on_failure = false;
@@ -866,9 +865,9 @@ static bool too_many_isolated(pg_data_t *pgdat)
 * a fatal signal is pending.
 */
if (!(low_pfn % SWAP_CLUSTER_MAX)) {
-   if (locked) {
-   unlock_page_lruvec_irqrestore(locked, flags);
-   locked = NULL;
+   if (lruvec) {
+   unlock_page_lruvec_irqrestore(lruvec, flags);
+   lruvec = NULL;
}
 
if (fatal_signal_pending(current)) {
@@ -949,9 +948,9 @@ static bool too_many_isolated(pg_data_t *pgdat)
 */
if (unlikely(__PageMovable(page)) &&
!PageIsolated(page)) {
-   if (locked) {
-   unlock_page_lruvec_irqrestore(locked, 
flags);
-   locked = NULL;
+   if (lruvec) {
+   unlock_page_lruvec_irqrestore(lruvec, 
flags);
+   lruvec = NULL;
}
 
if (!isolate_movable_page(page, isolate_mode))
@@ -992,16 +991,13 @@ static bool too_many_isolated(pg_data_t *pgdat)
if (!TestClearPageLRU(page))
goto isolate_fail_put;
 
-   rcu_read_lock();
-   lruvec = mem_cgroup_page_lruvec(page, pgdat);
-
/* If we already hold the lock, we can skip some rechecking */
-   if (lruvec != locked) {
-   if (locked)
-   unlock_page_lruvec_irqrestore(locked, flags);
+   if (!lruvec || !lruvec_holds_page_lru_lock(page, lruvec)) {
+   if (lruvec)
+   unlock_page_lruvec_irqrestore(lruvec, flags);
 
+   lruvec = mem_cgroup_page_lruvec(page, pgdat);
compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
-   locked = lruvec;
rcu_read_unlock();
 
lruvec_memcg_debug(lruvec, page);
@@ -1023,8 +1019,7 @@ static bool too_many_isolated(pg_data_t *pgdat)
SetPageLRU(page);
goto isolate_fail_put;
}
-   } else
-   rcu_read_unlock();
+   }
 
/* The whole page is taken off the LRU; skip the tail pages. */
if (PageCompound(page))
@@ -1057,9 +1052,9 @@ static bool too_many_isolated(pg_data_t *pgdat)
 
 isolate_fail_put:
/* Avoid potential deadlock in freeing page under lru_lock */
-   if (locked) {
-   unlock_page_lruvec_irqrestore(locked, flags);
-   locked = NULL;
+   if (lruvec) {
+   unlock_page_lruvec_irqrestore(lruvec, flags);
+   lruvec = NULL;
}
put_page(page);
 
@@ -1073,9 +1068,9 @@ static bool too_many_isolated(pg_data_t *pgdat)
 * page anyway.
 */
if (nr_isolated) {
-   if (locked) {
-   unlock_page_lruvec_irqrestore(locked, flags);
-   locked = NULL;
+   if (lruvec) {
+   unlock_page_lruvec_irqrestore(lruvec, flags);
+   lruvec = NULL;
}
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
@@ -1102,8 +1097,8 @@ static bool too_many_isolated(pg_data_t *pgd
