[PATCH] powerpc: Reduce csum_add() complexity for PPC64

2022-02-11 Thread Christophe Leroy
PPC64 does everything in C, so GCC is able to skip the calculation
when one of the operands is zero.

Move the constant folding into the PPC32 part.

This helps GCC and reduces ppc64_defconfig by 170 bytes.
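
(Illustration only, not part of the patch: a stand-alone model of the PPC64 C
fold, with the made-up name csum_add_model, shows why the explicit zero checks
are unnecessary there; once the arithmetic is visible, GCC folds a
constant-zero addend on its own.)

#include <stdint.h>
#include <stdio.h>

/* Stand-alone model of the PPC64 C implementation: fold the 64-bit sum
 * back into 32 bits.  Because the arithmetic is visible to the compiler,
 * a constant-zero addend is optimised away without an explicit check. */
static inline uint32_t csum_add_model(uint32_t csum, uint32_t addend)
{
        uint64_t res = (uint64_t)csum + addend;

        return (uint32_t)res + (uint32_t)(res >> 32);
}

int main(void)
{
        /* With -O2 this call reduces to just returning the first argument. */
        printf("%u\n", (unsigned int)csum_add_model(0x12345678u, 0));
        return 0;
}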

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/checksum.h | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/checksum.h 
b/arch/powerpc/include/asm/checksum.h
index 3288a1bf5e8d..e4e25b46ac49 100644
--- a/arch/powerpc/include/asm/checksum.h
+++ b/arch/powerpc/include/asm/checksum.h
@@ -95,16 +95,15 @@ static __always_inline __wsum csum_add(__wsum csum, __wsum 
addend)
 {
 #ifdef __powerpc64__
u64 res = (__force u64)csum;
-#endif
+
+   res += (__force u64)addend;
+   return (__force __wsum)((u32)res + (res >> 32));
+#else
if (__builtin_constant_p(csum) && csum == 0)
return addend;
if (__builtin_constant_p(addend) && addend == 0)
return csum;
 
-#ifdef __powerpc64__
-   res += (__force u64)addend;
-   return (__force __wsum)((u32)res + (res >> 32));
-#else
asm("addc %0,%0,%1;"
"addze %0,%0;"
: "+r" (csum) : "r" (addend) : "xer");
-- 
2.34.1



[PATCH v5 5/6] drivers: virtio_mem: use pageblock size as the minimum virtio_mem size.

2022-02-11 Thread Zi Yan
From: Zi Yan 

alloc_contig_range() now only needs to be aligned to pageblock_order, so
drop the virtio_mem size requirement that it be the max of pageblock_order
and MAX_ORDER.

Signed-off-by: Zi Yan 
---
 drivers/virtio/virtio_mem.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 38becd8d578c..2307e65d18c2 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -2476,13 +2476,12 @@ static int virtio_mem_init_hotplug(struct virtio_mem 
*vm)
  VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD);
 
/*
-* We want subblocks to span at least MAX_ORDER_NR_PAGES and
-* pageblock_nr_pages pages. This:
+* We want subblocks to span at least pageblock_nr_pages pages.
+* This:
 * - Is required for now for alloc_contig_range() to work reliably -
 *   it doesn't properly handle smaller granularity on ZONE_NORMAL.
 */
-   sb_size = max_t(uint64_t, MAX_ORDER_NR_PAGES,
-   pageblock_nr_pages) * PAGE_SIZE;
+   sb_size = pageblock_nr_pages * PAGE_SIZE;
sb_size = max_t(uint64_t, vm->device_block_size, sb_size);
 
if (sb_size < memory_block_size_bytes() && !force_bbm) {
-- 
2.34.1



[PATCH v5 6/6] arch: powerpc: adjust fadump alignment to be pageblock aligned.

2022-02-11 Thread Zi Yan
From: Zi Yan 

CMA only requires pageblock alignment now. Change CMA alignment in
fadump too.

Signed-off-by: Zi Yan 
---
 arch/powerpc/include/asm/fadump-internal.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump-internal.h 
b/arch/powerpc/include/asm/fadump-internal.h
index 52189928ec08..fbfca85b4200 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -20,9 +20,7 @@
 #define memblock_num_regions(memblock_type)(memblock.memblock_type.cnt)
 
 /* Alignment per CMA requirement. */
-#define FADUMP_CMA_ALIGNMENT   (PAGE_SIZE <<   \
-max_t(unsigned long, MAX_ORDER - 1,\
-pageblock_order))
+#define FADUMP_CMA_ALIGNMENT   (PAGE_SIZE << pageblock_order)
 
 /* FAD commands */
 #define FADUMP_REGISTER1
-- 
2.34.1



[PATCH v5 4/6] mm: cma: use pageblock_order as the single alignment

2022-02-11 Thread Zi Yan
From: Zi Yan 

Now alloc_contig_range() works at pageblock granularity. Change CMA
allocation, which uses alloc_contig_range(), to use pageblock_order
alignment.

Signed-off-by: Zi Yan 
---
 include/linux/mmzone.h  | 5 +
 kernel/dma/contiguous.c | 2 +-
 mm/cma.c| 6 ++
 mm/page_alloc.c | 4 ++--
 4 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3fff6deca2c0..da38c8436493 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -54,10 +54,7 @@ enum migratetype {
 *
 * The way to use it is to change migratetype of a range of
 * pageblocks to MIGRATE_CMA which can be done by
-* __free_pageblock_cma() function.  What is important though
-* is that a range of pageblocks must be aligned to
-* MAX_ORDER_NR_PAGES should biggest page be bigger than
-* a single pageblock.
+* __free_pageblock_cma() function.
 */
MIGRATE_CMA,
 #endif
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 3d63d91cba5c..ac35b14b0786 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -399,7 +399,7 @@ static const struct reserved_mem_ops rmem_cma_ops = {
 
 static int __init rmem_cma_setup(struct reserved_mem *rmem)
 {
-   phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
+   phys_addr_t align = PAGE_SIZE << pageblock_order;
phys_addr_t mask = align - 1;
unsigned long node = rmem->fdt_node;
bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL);
diff --git a/mm/cma.c b/mm/cma.c
index 766f1b82b532..b2e927fab7b5 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -187,8 +187,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, 
phys_addr_t size,
return -EINVAL;
 
/* ensure minimal alignment required by mm core */
-   alignment = PAGE_SIZE <<
-   max_t(unsigned long, MAX_ORDER - 1, pageblock_order);
+   alignment = PAGE_SIZE << pageblock_order;
 
/* alignment should be aligned with order_per_bit */
if (!IS_ALIGNED(alignment >> PAGE_SHIFT, 1 << order_per_bit))
@@ -275,8 +274,7 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
 * migratetype page by page allocator's buddy algorithm. In the case,
 * you couldn't get a contiguous memory, which is not what we want.
 */
-   alignment = max(alignment,  (phys_addr_t)PAGE_SIZE <<
- max_t(unsigned long, MAX_ORDER - 1, pageblock_order));
+   alignment = max(alignment,  (phys_addr_t)PAGE_SIZE << pageblock_order);
if (fixed && base & (alignment - 1)) {
ret = -EINVAL;
pr_err("Region at %pa must be aligned to %pa bytes\n",
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7a4fa21aea5c..ac9432e63ce1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -9214,8 +9214,8 @@ int isolate_single_pageblock(unsigned long boundary_pfn, 
gfp_t gfp_flags,
  * be either of the two.
  * @gfp_mask:  GFP mask to use during compaction
  *
- * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
- * aligned.  The PFN range must belong to a single zone.
+ * The PFN range does not have to be pageblock aligned. The PFN range must
+ * belong to a single zone.
  *
  * The first thing this routine does is attempt to MIGRATE_ISOLATE all
  * pageblocks in the range.  Once isolated, the pageblocks should not
-- 
2.34.1



[PATCH v5 3/6] mm: make alloc_contig_range work at pageblock granularity

2022-02-11 Thread Zi Yan
From: Zi Yan 

alloc_contig_range() worked at MAX_ORDER-1 granularity to avoid merging
pageblocks with different migratetypes. It might unnecessarily convert
extra pageblocks at the beginning and at the end of the range. Change
alloc_contig_range() to work at pageblock granularity.

Special handling is needed for free pages and in-use pages across the
boundaries of the range specified to alloc_contig_range(), because such
partially isolated pages cause free page accounting issues. The free
pages will be split and freed into separate migratetype lists; the
in-use pages will be migrated and then the freed pages will be handled.
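
(For intuition only; this is not the patch's code, just a toy model with the
made-up helper free_range_as_chunks: a free buddy page that straddles a
pageblock boundary can be handed back as naturally aligned power-of-two
chunks, so each side of the boundary lands on the free list of its own
pageblock's migratetype.)

#include <stdio.h>

/* Toy model only -- not the kernel's split_free_page().  Free a PFN range
 * as naturally aligned power-of-two chunks; calling it once per side of a
 * pageblock boundary keeps the two sides on separate free lists. */
static void free_range_as_chunks(unsigned long start, unsigned long end)
{
        while (start < end) {
                unsigned long size = 1;

                /* grow the chunk while it stays aligned at 'start' and fits */
                while (!(start & size) && start + (size << 1) <= end)
                        size <<= 1;
                printf("free %lu pages at pfn %lu\n", size, start);
                start += size;
        }
}

int main(void)
{
        /* order-10 free page (1024 pages) split at a 512-page boundary */
        free_range_as_chunks(0, 512);
        free_range_as_chunks(512, 1024);
        return 0;
}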

Signed-off-by: Zi Yan 
---
 include/linux/page-isolation.h |   2 +-
 mm/internal.h  |   3 +
 mm/memory_hotplug.c|   3 +-
 mm/page_alloc.c| 235 +
 mm/page_isolation.c|  33 -
 5 files changed, 211 insertions(+), 65 deletions(-)

diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 4ef7be6def83..78ff940cc169 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -54,7 +54,7 @@ int move_freepages_block(struct zone *zone, struct page *page,
  */
 int
 start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-unsigned migratetype, int flags);
+unsigned migratetype, int flags, gfp_t gfp_flags);
 
 /*
  * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
diff --git a/mm/internal.h b/mm/internal.h
index 0d240e876831..509cbdc25992 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -319,6 +319,9 @@ isolate_freepages_range(struct compact_control *cc,
 int
 isolate_migratepages_range(struct compact_control *cc,
   unsigned long low_pfn, unsigned long end_pfn);
+
+int
+isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags, int 
isolate_before_boundary);
 #endif
 int find_suitable_fallback(struct free_area *area, unsigned int order,
int migratetype, bool only_stealable, bool *can_steal);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ce68098832aa..82406d2f3e46 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1863,7 +1863,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned 
long nr_pages,
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
   MIGRATE_MOVABLE,
-  MEMORY_OFFLINE | REPORT_FAILURE);
+  MEMORY_OFFLINE | REPORT_FAILURE,
+  GFP_USER | __GFP_MOVABLE | 
__GFP_RETRY_MAYFAIL);
if (ret) {
reason = "failure to isolate range";
goto failed_removal_pcplists_disabled;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 62ef78f3d771..7a4fa21aea5c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8985,7 +8985,7 @@ static inline void alloc_contig_dump_pages(struct 
list_head *page_list)
 #endif
 
 /* [start, end) must belong to a single zone. */
-static int __alloc_contig_migrate_range(struct compact_control *cc,
+int __alloc_contig_migrate_range(struct compact_control *cc,
unsigned long start, unsigned long end)
 {
/* This function is based on compact_zone() from compaction.c. */
@@ -9043,6 +9043,167 @@ static int __alloc_contig_migrate_range(struct 
compact_control *cc,
return 0;
 }
 
+/**
+ * split_free_page() -- split a free page at split_pfn_offset
+ * @free_page: the original free page
+ * @order: the order of the page
+ * @split_pfn_offset:  split offset within the page
+ *
+ * It is used when the free page crosses two pageblocks with different 
migratetypes
+ * at split_pfn_offset within the page. The split free page will be put into
+ * separate migratetype lists afterwards. Otherwise, the function achieves
+ * nothing.
+ */
+static inline void split_free_page(struct page *free_page,
+   int order, unsigned long split_pfn_offset)
+{
+   struct zone *zone = page_zone(free_page);
+   unsigned long free_page_pfn = page_to_pfn(free_page);
+   unsigned long pfn;
+   unsigned long flags;
+   int free_page_order;
+
+   spin_lock_irqsave(&zone->lock, flags);
+   del_page_from_free_list(free_page, zone, order);
+   for (pfn = free_page_pfn;
+pfn < free_page_pfn + (1UL << order);) {
+   int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
+
+   free_page_order = order_base_2(split_pfn_offset);
+   __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
+   mt, FPI_NONE);
+   pfn += 1UL << free_page_order;
+   split_pfn_offset -= (1UL << free_page_order);
+   /* we have done the first part, now switch to second part 

[PATCH v5 0/6] Use pageblock_order for cma and alloc_contig_range alignment.

2022-02-11 Thread Zi Yan
From: Zi Yan 

Hi all,

This patchset tries to remove the MAX_ORDER-1 alignment requirement for CMA
and alloc_contig_range(). It prepares for my upcoming changes to make
MAX_ORDER adjustable at boot time[1]. It is on top of mmotm-2022-02-08-15-31.

Changelog
===
V5
---
1. Moved isolation address alignment handling into start_isolate_page_range().
2. Rewrote and simplified how alloc_contig_range() works at pageblock
   granularity (Patch 3). Only two pageblock migratetypes need to be saved and
   restored. start_isolate_page_range() might need to migrate pages in this
   version, but it prevents the caller from worrying about
   max(MAX_ORDER_NR_PAGES, pageblock_nr_pages) alignment after the page range
   is isolated.

V4
---
1. Dropped two irrelevant patches on non-lru compound page handling, as
   it is not supported upstream.
2. Renamed migratetype_has_fallback() to migratetype_is_mergeable().
3. Always check whether two pageblocks can be merged in
   __free_one_page() when the order is >= pageblock_order, as the case of
   non-mergeable pageblocks (isolated, CMA, and HIGHATOMIC) becomes more common.
4. Moving has_unmovable_pages() is now a separate patch.
5. Removed the MAX_ORDER-1 alignment requirement in the comment in virtio_mem code.

Description
===

The MAX_ORDER - 1 alignment requirement comes from the fact that
alloc_contig_range() isolates pageblocks to remove free memory from the buddy
allocator, but isolating only a subset of the pageblocks within a free page
spanning multiple pageblocks causes free page accounting issues. An isolated
page might not be put back on the right free list, since the code assumes the
migratetype of the first pageblock is the migratetype of the whole free page.
This is based on the discussion at [2].
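
(For scale, an illustrative calculation; the numbers below are not from this
cover letter but match a 64K-page ppc64 radix configuration discussed
elsewhere on the list, MAX_ORDER - 1 = 8 and pageblock_order = 5.)

#include <stdio.h>

/* Illustrative only: isolation granularity before and after this series,
 * assuming 64K pages, MAX_ORDER - 1 = 8 (MAX_ORDER_NR_PAGES = 256) and
 * pageblock_order = 5 (pageblock_nr_pages = 32). */
int main(void)
{
        unsigned long page_size = 64 * 1024;
        unsigned long max_order_nr_pages = 256;         /* 1 << (MAX_ORDER - 1) */
        unsigned long pageblock_nr_pages = 32;          /* 1 << pageblock_order */
        unsigned long old_gran = (max_order_nr_pages > pageblock_nr_pages ?
                                  max_order_nr_pages : pageblock_nr_pages) * page_size;
        unsigned long new_gran = pageblock_nr_pages * page_size;

        printf("old isolation granularity: %lu MiB\n", old_gran >> 20); /* 16 MiB */
        printf("new isolation granularity: %lu MiB\n", new_gran >> 20); /*  2 MiB */
        return 0;
}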

To remove the requirement, this patchset:
1. isolates pages at pageblock granularity instead of
   max(MAX_ORDER_NR_PAGES, pageblock_nr_pages);
2. splits free pages across the specified range or migrates in-use pages
   across the specified range then splits the freed page to avoid free page
   accounting issues (it happens when multiple pageblocks within a single page
   have different migratetypes);
3. only checks unmovable pages within the range instead of the MAX_ORDER - 1
   aligned range during isolation to avoid alloc_contig_range() failure when
   pageblocks within a MAX_ORDER - 1 aligned range are allocated separately;
4. returns pages not in the range as it did before.

One optimization might come later:
1. make MIGRATE_ISOLATE a separate bit to be able to restore the original
   migratetypes when isolation fails in the middle of the range.

Feel free to give comments and suggestions. Thanks.

[1] https://lore.kernel.org/linux-mm/20210805190253.2795604-1-zi@sent.com/
[2] 
https://lore.kernel.org/linux-mm/d19fb078-cb9b-f60f-e310-fdeea1b94...@redhat.com/

Zi Yan (6):
  mm: page_isolation: move has_unmovable_pages() to mm/page_isolation.c
  mm: page_isolation: check specified range for unmovable pages
  mm: make alloc_contig_range work at pageblock granularity
  mm: cma: use pageblock_order as the single alignment
  drivers: virtio_mem: use pageblock size as the minimum virtio_mem
size.
  arch: powerpc: adjust fadump alignment to be pageblock aligned.

 arch/powerpc/include/asm/fadump-internal.h |   4 +-
 drivers/virtio/virtio_mem.c|   7 +-
 include/linux/mmzone.h |   5 +-
 include/linux/page-isolation.h |  16 +-
 kernel/dma/contiguous.c|   2 +-
 mm/cma.c   |   6 +-
 mm/internal.h  |   3 +
 mm/memory_hotplug.c|   3 +-
 mm/page_alloc.c| 371 ++---
 mm/page_isolation.c| 172 +-
 10 files changed, 367 insertions(+), 222 deletions(-)

-- 
2.34.1



[PATCH v5 1/6] mm: page_isolation: move has_unmovable_pages() to mm/page_isolation.c

2022-02-11 Thread Zi Yan
From: Zi Yan 

has_unmovable_pages() is only used in mm/page_isolation.c. Move it from
mm/page_alloc.c and make it static.

Signed-off-by: Zi Yan 
Reviewed-by: Oscar Salvador 
---
 include/linux/page-isolation.h |   2 -
 mm/page_alloc.c| 119 -
 mm/page_isolation.c| 119 +
 3 files changed, 119 insertions(+), 121 deletions(-)

diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 572458016331..e14eddf6741a 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -33,8 +33,6 @@ static inline bool is_migrate_isolate(int migratetype)
 #define MEMORY_OFFLINE 0x1
 #define REPORT_FAILURE 0x2
 
-struct page *has_unmovable_pages(struct zone *zone, struct page *page,
-int migratetype, int flags);
 void set_pageblock_migratetype(struct page *page, int migratetype);
 int move_freepages_block(struct zone *zone, struct page *page,
int migratetype, int *num_movable);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cface1d38093..e2c6a67fc386 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8962,125 +8962,6 @@ void *__init alloc_large_system_hash(const char 
*tablename,
return table;
 }
 
-/*
- * This function checks whether pageblock includes unmovable pages or not.
- *
- * PageLRU check without isolation or lru_lock could race so that
- * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
- * check without lock_page also may miss some movable non-lru pages at
- * race condition. So you can't expect this function should be exact.
- *
- * Returns a page without holding a reference. If the caller wants to
- * dereference that page (e.g., dumping), it has to make sure that it
- * cannot get removed (e.g., via memory unplug) concurrently.
- *
- */
-struct page *has_unmovable_pages(struct zone *zone, struct page *page,
-int migratetype, int flags)
-{
-   unsigned long iter = 0;
-   unsigned long pfn = page_to_pfn(page);
-   unsigned long offset = pfn % pageblock_nr_pages;
-
-   if (is_migrate_cma_page(page)) {
-   /*
-* CMA allocations (alloc_contig_range) really need to mark
-* isolate CMA pageblocks even when they are not movable in fact
-* so consider them movable here.
-*/
-   if (is_migrate_cma(migratetype))
-   return NULL;
-
-   return page;
-   }
-
-   for (; iter < pageblock_nr_pages - offset; iter++) {
-   page = pfn_to_page(pfn + iter);
-
-   /*
-* Both, bootmem allocations and memory holes are marked
-* PG_reserved and are unmovable. We can even have unmovable
-* allocations inside ZONE_MOVABLE, for example when
-* specifying "movablecore".
-*/
-   if (PageReserved(page))
-   return page;
-
-   /*
-* If the zone is movable and we have ruled out all reserved
-* pages then it should be reasonably safe to assume the rest
-* is movable.
-*/
-   if (zone_idx(zone) == ZONE_MOVABLE)
-   continue;
-
-   /*
-* Hugepages are not in LRU lists, but they're movable.
-* THPs are on the LRU, but need to be counted as #small pages.
-* We need not scan over tail pages because we don't
-* handle each tail page individually in migration.
-*/
-   if (PageHuge(page) || PageTransCompound(page)) {
-   struct page *head = compound_head(page);
-   unsigned int skip_pages;
-
-   if (PageHuge(page)) {
-   if 
(!hugepage_migration_supported(page_hstate(head)))
-   return page;
-   } else if (!PageLRU(head) && !__PageMovable(head)) {
-   return page;
-   }
-
-   skip_pages = compound_nr(head) - (page - head);
-   iter += skip_pages - 1;
-   continue;
-   }
-
-   /*
-* We can't use page_count without pin a page
-* because another CPU can free compound page.
-* This check already skips compound tails of THP
-* because their page->_refcount is zero at all time.
-*/
-   if (!page_ref_count(page)) {
-   if (PageBuddy(page))
-   iter += (1 << buddy_order(page)) - 1;
-   continue;
-   }
-
-   /*
-* The HWPoisoned page may be

[PATCH v5 2/6] mm: page_isolation: check specified range for unmovable pages

2022-02-11 Thread Zi Yan
From: Zi Yan 

Enable set_migratetype_isolate() to check the specified sub-range for
unmovable pages during isolation. Page isolation is done
at max(MAX_ORDER_NR_PAGES, pageblock_nr_pages) granularity, but not all
pages within that granularity are intended to be isolated. For example,
alloc_contig_range(), which uses page isolation, allows ranges without
alignment. This commit makes the unmovable page check look only at pages
within the range of interest, so that page isolation can succeed for any
non-overlapping ranges.

Signed-off-by: Zi Yan 
---
 include/linux/page-isolation.h | 12 +
 mm/page_alloc.c| 15 +--
 mm/page_isolation.c| 46 +-
 3 files changed, 41 insertions(+), 32 deletions(-)

diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index e14eddf6741a..4ef7be6def83 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -15,6 +15,18 @@ static inline bool is_migrate_isolate(int migratetype)
 {
return migratetype == MIGRATE_ISOLATE;
 }
+static inline unsigned long pfn_max_align_down(unsigned long pfn)
+{
+   return ALIGN_DOWN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
+pageblock_nr_pages));
+}
+
+static inline unsigned long pfn_max_align_up(unsigned long pfn)
+{
+   return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
+   pageblock_nr_pages));
+}
+
 #else
 static inline bool has_isolate_pageblock(struct zone *zone)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e2c6a67fc386..62ef78f3d771 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8963,18 +8963,6 @@ void *__init alloc_large_system_hash(const char 
*tablename,
 }
 
 #ifdef CONFIG_CONTIG_ALLOC
-static unsigned long pfn_max_align_down(unsigned long pfn)
-{
-   return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
-pageblock_nr_pages) - 1);
-}
-
-static unsigned long pfn_max_align_up(unsigned long pfn)
-{
-   return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
-   pageblock_nr_pages));
-}
-
 #if defined(CONFIG_DYNAMIC_DEBUG) || \
(defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
 /* Usage: See admin-guide/dynamic-debug-howto.rst */
@@ -9119,8 +9107,7 @@ int alloc_contig_range(unsigned long start, unsigned long 
end,
 * put back to page allocator so that buddy can use them.
 */
 
-   ret = start_isolate_page_range(pfn_max_align_down(start),
-  pfn_max_align_up(end), migratetype, 0);
+   ret = start_isolate_page_range(start, end, migratetype, 0);
if (ret)
return ret;
 
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index b34f1310aeaa..64d093ab83ec 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -16,7 +16,8 @@
 #include 
 
 /*
- * This function checks whether pageblock includes unmovable pages or not.
+ * This function checks whether pageblock within [start_pfn, end_pfn) includes
+ * unmovable pages or not.
  *
  * PageLRU check without isolation or lru_lock could race so that
  * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
@@ -29,11 +30,14 @@
  *
  */
 static struct page *has_unmovable_pages(struct zone *zone, struct page *page,
-int migratetype, int flags)
+int migratetype, int flags,
+unsigned long start_pfn, unsigned long end_pfn)
 {
-   unsigned long iter = 0;
-   unsigned long pfn = page_to_pfn(page);
-   unsigned long offset = pfn % pageblock_nr_pages;
+   unsigned long first_pfn = max(page_to_pfn(page), start_pfn);
+   unsigned long pfn = first_pfn;
+   unsigned long last_pfn = min(ALIGN(pfn + 1, pageblock_nr_pages), 
end_pfn);
+
+   page = pfn_to_page(pfn);
 
if (is_migrate_cma_page(page)) {
/*
@@ -47,8 +51,8 @@ static struct page *has_unmovable_pages(struct zone *zone, 
struct page *page,
return page;
}
 
-   for (; iter < pageblock_nr_pages - offset; iter++) {
-   page = pfn_to_page(pfn + iter);
+   for (pfn = first_pfn; pfn < last_pfn; pfn++) {
+   page = pfn_to_page(pfn);
 
/*
 * Both, bootmem allocations and memory holes are marked
@@ -85,7 +89,7 @@ static struct page *has_unmovable_pages(struct zone *zone, 
struct page *page,
}
 
skip_pages = compound_nr(head) - (page - head);
-   iter += skip_pages - 1;
+   pfn += skip_pages - 1;
continue;
}
 
@@ -97,7 +101,7 @@ static struct page *has_unmovable_pages(struct zone *zone, 
struct page *page,
 */
if (!page_ref_count(page)) {
if (PageBuddy(pag

Re: rcutorture’s init segfaults in ppc64le VM

2022-02-11 Thread Paul Menzel

Dear Michael,


On 11.02.22 at 15:19, Paul Menzel wrote:


On 11.02.22 at 02:48, Michael Ellerman wrote:

Paul Menzel writes:

On 08.02.22 at 11:09, Michael Ellerman wrote:

Paul Menzel writes:


[…]


On the POWER8 server IBM S822LC running Ubuntu 21.10, building Linux
5.17-rc2+ with rcutorture tests


I'm not sure if that's the host kernel version or the version you're
using of rcutorture? Can you tell us the sha1 of your host kernel 
and of the tree you're running rcutorture from?


The host system runs Linux 5.17-rc1+ started with kexec. Unfortunately,
I am unable to find the exact sha1.

  $ more /proc/version
  Linux version 5.17.0-rc1+ (x...@eddb.molgen.mpg.de) (Ubuntu clang version 
13.0.0-2, LLD 13.0.0) #1 SMP Fri Jan 28 17:13:04 CET 2022


OK. In general rc1 kernels can have issues, so it might be worth
rebooting the host into either v5.17-rc3 or a distro or stable kernel.
Just to rule out any issues on the host.


Yes, that was a good test. It works with Ubuntu’s 5.13 Linux kernel.

     $ more /proc/version
     Linux version 5.13.0-28-generic (buildd@bos02-ppc64el-013) (gcc (Ubuntu 
11.2.0-7ubuntu2) 11.2.0, GNU ld (GNU Binutils for Ubuntu) 2.37) #31-Ubuntu SMP 
Thu Jan 13 17:40:19 UTC 2022

I have to do more tests, but it could be LLVM/clang related.


Building commit f1baf68e1383 (Merge tag 'net-5.17-rc4' of 
git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net) with the ata 
patches on top with GCC, I am unable to reproduce the issue. Before, I 
built it with


make -j100 LLVM=1 LLVM_IAS=0 bindeb-pkg

[…]


Kind regards,

Paul


Re: [PATCH v5 2/6] powerpc/kexec_file: Add KEXEC_SIG support.

2022-02-11 Thread Paul Menzel

Dear Michal,


On 09.02.22 at 13:01, Michal Suchánek wrote:


On Wed, Feb 09, 2022 at 07:44:15AM +0100, Paul Menzel wrote:



On 11.01.22 at 12:37, Michal Suchanek wrote:


[…]


How can this be tested?


Apparently KEXEC_SIG_FORCE is x86 only although the use of the option is
arch neutral:

arch/x86/Kconfig:config KEXEC_SIG_FORCE
kernel/kexec_file.c:if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE))
{

Maybe it should be moved?


Sounds good.


I used a patched kernel that enables lockdown in secure boot, and then
verified that a signed kernel can be loaded by kexec and an unsigned one
cannot, with KEXEC_SIG enabled and IMA_KEXEC disabled.

The lockdown support can be enabled on any platform, and although I
can't find it documented anywhere, there appears to be code in kexec_file
to take it into account:
kernel/kexec.c: result = security_locked_down(LOCKDOWN_KEXEC);
kernel/kexec_file.c:security_locked_down(LOCKDOWN_KEXEC))
kernel/module.c:return security_locked_down(LOCKDOWN_MODULE_SIGNATURE);
kernel/params.c:security_locked_down(LOCKDOWN_MODULE_PARAMETERS))
and lockdown can be enabled with a buildtime option, a kernel parameter, or a
debugfs file.

Still, for testing, lifting KEXEC_SIG_FORCE to some arch-neutral Kconfig file
is probably the simplest option.

The kexec -s option should be used to select kexec_file rather than the
old-style kexec, which would either always fail or always succeed regardless
of signature.


Thank you.


Signed-off-by: Michal Suchanek 
---
v3: - Philipp Rudo : Update the comit message with
explanation why the s390 code is usable on powerpc.
  - Include correct header for mod_check_sig
  - Nayna : Mention additional IMA features
in kconfig text
---
   arch/powerpc/Kconfig| 16 
   arch/powerpc/kexec/elf_64.c | 36 
   2 files changed, 52 insertions(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index dea74d7717c0..1cde9b6c5987 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -560,6 +560,22 @@ config KEXEC_FILE
   config ARCH_HAS_KEXEC_PURGATORY
def_bool KEXEC_FILE
+config KEXEC_SIG
+   bool "Verify kernel signature during kexec_file_load() syscall"
+   depends on KEXEC_FILE && MODULE_SIG_FORMAT
+   help
+ This option makes kernel signature verification mandatory for
+ the kexec_file_load() syscall.
+
+ In addition to that option, you need to enable signature
+ verification for the corresponding kernel image type being
+ loaded in order for this to work.
+
+ Note: on powerpc IMA_ARCH_POLICY also implements kexec'ed kernel
+ verification. In addition IMA adds kernel hashes to the measurement
+ list, extends IMA PCR in the TPM, and implements kernel image
+ blacklist by hash.


So, what is the takeaway for the user? Is IMA_ARCH_POLICY preferred? What is
the disadvantage, and are two implementations(?) needed then? More overhead?


IMA_KEXEC does more than KEXEC_SIG. The overhead is probably not big
unless you are trying to really minimize the kernel code size.

Arguably the simpler implementation has less potential for bugs, too.
Both in code and in user configuration required to enable the feature.

Interestingly IMA_ARCH_POLICY depends on KEXEC_SIG rather than
IMA_KEXEC. Just mind-boggling.


I have not looked into that.


The main problem with IMA_KEXEC from my point of view is that it is not
portable. To record the measurements, TPM support is required, which is not
available on all platforms. It does not support PE, so it cannot be used on
platforms that use the PE kernel signature format.


Could you add that to the comment please?


+
   config RELOCATABLE
bool "Build a relocatable kernel"
depends on PPC64 || (FLATMEM && (44x || FSL_BOOKE))
diff --git a/arch/powerpc/kexec/elf_64.c b/arch/powerpc/kexec/elf_64.c
index eeb258002d1e..98d1cb5135b4 100644
--- a/arch/powerpc/kexec/elf_64.c
+++ b/arch/powerpc/kexec/elf_64.c
@@ -23,6 +23,7 @@
   #include 
   #include 
   #include 
+#include 
   static void *elf64_load(struct kimage *image, char *kernel_buf,
unsigned long kernel_len, char *initrd,
@@ -151,7 +152,42 @@ static void *elf64_load(struct kimage *image, char 
*kernel_buf,
return ret ? ERR_PTR(ret) : NULL;
   }
+#ifdef CONFIG_KEXEC_SIG
+int elf64_verify_sig(const char *kernel, unsigned long kernel_len)
+{
+   const unsigned long marker_len = sizeof(MODULE_SIG_STRING) - 1;
+   struct module_signature *ms;
+   unsigned long sig_len;


Use size_t to match the signature of `verify_pkcs7_signature()`?


Nope. struct module_signature uses unsigned long, and this needs to be
matched to avoid type errors on 32bit.


I meant for `sig_len`.


Technically using size_t for in-memory buffers is misguided because
AFAICT no memory buffer can be bigger than ULONG_MAX, and size_t is a
non-native type on 32bit.

Sure, the situatio

Re: [RFC PATCH 0/3] powerpc64/bpf: Add support for BPF Trampolines

2022-02-11 Thread Christophe Leroy


On 07/02/2022 at 08:07, Naveen N. Rao wrote:
> This is an early RFC series that adds support for BPF Trampolines on
> powerpc64. Some of the selftests are passing for me, but this needs more
> testing and I've likely missed a few things as well. A review of the
> patches and feedback about the overall approach will be great.
> 
> This series depends on some of the other BPF JIT fixes and enhancements
> posted previously, as well as on ftrace direct enablement on powerpc
> which has also been posted in the past.

Is there any reason to limit this to powerpc64 ?

Christophe

Re: [PATCH v2] powerpc/mm: Update default hugetlb size early

2022-02-11 Thread Aneesh Kumar K.V
Aneesh Kumar K.V  writes:

> David Hildenbrand  writes:
>
>> On 11.02.22 10:16, Aneesh Kumar K V wrote:
>>> On 2/11/22 14:00, David Hildenbrand wrote:
 On 11.02.22 07:52, Aneesh Kumar K.V wrote:
> commit: d9c234005227 ("Do not depend on MAX_ORDER when grouping pages by 
> mobility")



> I could build a kernel with FORCE_MAX_ZONEORDER=8 and pageblock_order =
> 8. We need to disable THP for such a kernel to boot, because THP does
> check for PMD_ORDER < MAX_ORDER. I was able to boot that kernel on a
> virtualized platform, but then gigantic_page_runtime_supported is not
> supported on such a config with hash translation.
>
> On a non-virtualized platform I am hitting crashes like the one below
> during boot.
>
> [   47.637865][   C42] =============================================================================
> [   47.637907][   C42] BUG pgtable-2^11 (Not tainted): Object already free
> [   47.637925][   C42] -----------------------------------------------------------------------------
> [   47.637925][   C42]
> [   47.637945][   C42] Allocated in __pud_alloc+0x84/0x2a0 age=278 cpu=40 pid=1409
> [   47.637974][   C42]  __slab_alloc.isra.0+0x40/0x60
> [   47.637995][   C42]  kmem_cache_alloc+0x1a8/0x510
> [   47.638010][   C42]  __pud_alloc+0x84/0x2a0
> [   47.638024][   C42]  copy_page_range+0x38c/0x1b90
> [   47.638040][   C42]  dup_mm+0x548/0x880
> [   47.638058][   C42]  copy_process+0xdc0/0x1e90
> [   47.638076][   C42]  kernel_clone+0xd4/0x9d0
> [   47.638094][   C42]  __do_sys_clone+0x88/0xe0
> [   47.638112][   C42]  system_call_exception+0x368/0x3a0
> [   47.638128][   C42]  system_call_common+0xec/0x250
> [   47.638147][   C42] Freed in __tlb_remove_table+0x1d4/0x200 age=263 cpu=57 pid=326
> [   47.638172][   C42]  kmem_cache_free+0x44c/0x680
> [   47.638187][   C42]  __tlb_remove_table+0x1d4/0x200
> [   47.638204][   C42]  tlb_remove_table_rcu+0x54/0xa0
> [   47.638222][   C42]  rcu_core+0xdd4/0x15d0
> [   47.638239][   C42]  __do_softirq+0x360/0x69c
> [   47.638257][   C42]  run_ksoftirqd+0x54/0xc0
> [   47.638273][   C42]  smpboot_thread_fn+0x28c/0x2f0
> [   47.638290][   C42]  kthread+0x1a4/0x1b0
> [   47.638305][   C42]  ret_from_kernel_thread+0x5c/0x64

[PATCH] powerpc/64: Rewrite loading of AMR_KUAP_BLOCKED in assembly

2022-02-11 Thread Christophe Leroy
Constant loading of AMR_KUAP_BLOCKED takes 5 instructions:

c0016a40:   4c 00 01 2c isync
c0016a44:   3d 20 fc ff lis r9,-769
c0016a48:   61 29 ff ff ori r9,r9,65535
c0016a4c:   79 29 07 c6 rldicr  r9,r9,32,31
c0016a50:   65 29 ff ff orisr9,r9,65535
c0016a54:   61 29 ff ff ori r9,r9,65535
c0016a58:   7d 3d 03 a6 mtamr   r9
c0016a5c:   4c 00 01 2c isync

Until GCC is fixed, implement it in assembly using 2 instructions (li
loads the sign-extended value 0xfffffffffffffcff, which rotldi by 48 bits
turns into AMR_KUAP_BLOCKED = 0xfcffffffffffffff):

c0016a50:   4c 00 01 2c isync
c0016a54:   39 20 fc ff li  r9,-769
c0016a58:   79 29 80 02 rotldi  r9,r9,48
c0016a5c:   7d 3d 03 a6 mtamr   r9
c0016a60:   4c 00 01 2c isync

With this change a ppc64_defconfig build is reduced by 15 kbytes.

Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94395
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/book3s/64/kup.h | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/kup.h 
b/arch/powerpc/include/asm/book3s/64/kup.h
index 54cf46808157..35c017ba29e1 100644
--- a/arch/powerpc/include/asm/book3s/64/kup.h
+++ b/arch/powerpc/include/asm/book3s/64/kup.h
@@ -338,6 +338,10 @@ static __always_inline void set_kuap(unsigned long value)
 * before and after the move to AMR. See table 6 on page 1134.
 */
isync();
+
+   if (__builtin_constant_p(value) && value == 0xfcff)
+   asm("li %0, %1 ; rotldi %0, %0, 48" : "=r"(value) : 
"i"(0xfcff));
+
mtspr(SPRN_AMR, value);
isync();
 }
-- 
2.34.1



Re: rcutorture’s init segfaults in ppc64le VM

2022-02-11 Thread Paul Menzel

Dear Michael,


On 11.02.22 at 02:48, Michael Ellerman wrote:

Paul Menzel writes:

On 08.02.22 at 11:09, Michael Ellerman wrote:

Paul Menzel writes:


[…]


On the POWER8 server IBM S822LC running Ubuntu 21.10, building Linux
5.17-rc2+ with rcutorture tests


I'm not sure if that's the host kernel version or the version you're
using of rcutorture? Can you tell us the sha1 of your host kernel and of
the tree you're running rcutorture from?


The host system runs Linux 5.17-rc1+ started with kexec. Unfortunately,
I am unable to find the exact sha1.

  $ more /proc/version
  Linux version 5.17.0-rc1+ (x...@eddb.molgen.mpg.de) (Ubuntu clang version 
13.0.0-2, LLD 13.0.0) #1 SMP Fri Jan 28 17:13:04 CET 2022


OK. In general rc1 kernels can have issues, so it might be worth
rebooting the host into either v5.17-rc3 or a distro or stable kernel.
Just to rule out any issues on the host.


Yes, that was a good test. It works with Ubuntu’s 5.13 Linux kernel.

$ more /proc/version
Linux version 5.13.0-28-generic (buildd@bos02-ppc64el-013) (gcc 
(Ubuntu 11.2.0-7ubuntu2) 11.2.0, GNU ld (GNU Binutils for Ubuntu) 2.37) 
#31-Ubuntu SMP Thu Jan 13 17:40:19 UTC 2022


I have to do more tests, but it could be LLVM/clang related.


The Linux tree, from which I run rcutorture, is at commit
dfd42facf1e4 (Linux 5.17-rc3) with four patches on top:

  $ git log --oneline -6
  207cec79e752 (HEAD -> master, origin/master, origin/HEAD) Problems with 
rcutorture on ppc64le: allmodconfig(2) and other failures
  8c82f96fbe57 ata: libata-sata: improve sata_link_debounce()
  a447541d925f ata: libata-sata: remove debounce delay by default
  afd84e1eeafc ata: libata-sata: introduce struct sata_deb_timing
  f4caf7e48b75 ata: libata-sata: Simplify sata_link_resume() interface
  dfd42facf1e4 (tag: v5.17-rc3) Linux 5.17-rc3


   $ tools/testing/selftests/rcutorture/bin/torture.sh --duration 10

the built init

   $ file tools/testing/selftests/rcutorture/initrd/init
   tools/testing/selftests/rcutorture/initrd/init: ELF 64-bit LSB 
executable, 64-bit PowerPC or cisco 7500, version 1 (SYSV), statically linked, 
BuildID[sha1]=0ded0e45649184a296f30d611f7a03cc51ecb616, for GNU/Linux 3.10.0, 
stripped


Mine looks pretty much identical:

$ file tools/testing/selftests/rcutorture/initrd/init
tools/testing/selftests/rcutorture/initrd/init: ELF 64-bit LSB executable, 
64-bit PowerPC or cisco 7500, version 1 (SYSV), statically linked, 
BuildID[sha1]=86078bf6e5d54ab0860d36aa9a65d52818b972c8, for GNU/Linux 3.10.0, 
stripped


segfaults in QEMU. From one of the log files


But mine doesn't segfault, it runs fine and the test completes.

What qemu version are you using?

I tried 4.2.1 and 6.2.0, both worked.


  $ qemu-system-ppc64le --version
  QEMU emulator version 6.0.0 (Debian 1:6.0+dfsg-2expubuntu1.1)
  Copyright (c) 2003-2021 Fabrice Bellard and the QEMU Project developers


OK, that's one difference between our setups, but I'd be surprised if it
explains this bug, but I guess anything's possible.


/dev/shm/linux/tools/testing/selftests/rcutorture/res/2022.02.01-21.52.37-torture/results-rcutorture/TREE03/console.log


Sorry, that was the wrong path/test. The correct one for the excerpt
below is:

  
/dev/shm/linux/tools/testing/selftests/rcutorture/res/2022.02.01-21.52.37-torture/results-locktorture-kasan/LOCK01/console.log


(For TREE03, QEMU does not start the Linux kernel at all, that means no
output after:

  Booting Linux via __start() @ 0x0040 ...


OK yeah I see that too.

Removing "threadirqs" from 
tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot
seems to fix it.


Nice find. I have no idea what that means, though.


I still see some preempt related warnings, we clearly have some bugs
with preempt enabled.


You can now download the content of
`/dev/shm/linux/tools/testing/selftests/rcutorture/res/2022.02.01-21.52.37-torture/results-locktorture-kasan/LOCK01`
[1, 65 MB].

Can you reproduce the segmentation fault with the line below?

  $ qemu-system-ppc64 -enable-kvm -nographic -smp cores=1,threads=8 \
  -net none -enable-kvm -M pseries -nodefaults -device spapr-vscsi -serial 
stdio -m 512 \
  -kernel 
/dev/shm/linux/tools/testing/selftests/rcutorture/res/2022.02.01-21.52.37-torture/results-locktorture-kasan/LOCK01/vmlinux
 \
  -append "debug_boot_weak_hash panic=-1 console=ttyS0 \
  torture.disable_onoff_at_boot locktorture.onoff_interval=3 \
  locktorture.onoff_holdoff=30 locktorture.stat_interval=15 \
  locktorture.shutdown_secs=60 locktorture.verbose=1"


That works fine for me, boots and runs the test, then shuts down.

I assume you see the segfault on every boot, not intermittently?

So the differences between our setups are the host kernel and the qemu
version. Can you try a different host kernel easily?

The other thing would be to try a different qemu version, you might need
to build from source, 

[PATCH] powerpc/64: Force inlining of prevent_user_access() and set_kuap()

2022-02-11 Thread Christophe Leroy
A ppc64_defconfig build exhibits about 10 copies of
prevent_user_access(). It also has one copy of set_kuap().

c0017340 <.prevent_user_access.constprop.0>:
c001a038:   4b ff d3 09 bl  c0017340 
<.prevent_user_access.constprop.0>
c001aabc:   4b ff c8 85 bl  c0017340 
<.prevent_user_access.constprop.0>
c001ab38:   4b ff c8 09 bl  c0017340 
<.prevent_user_access.constprop.0>
c001ade0:   4b ff c5 61 bl  c0017340 
<.prevent_user_access.constprop.0>
c0039b90 <.prevent_user_access.constprop.0>:
c003ac08:   4b ff ef 89 bl  c0039b90 
<.prevent_user_access.constprop.0>
c003b9d0:   4b ff e1 c1 bl  c0039b90 
<.prevent_user_access.constprop.0>
c003ba54:   4b ff e1 3d bl  c0039b90 
<.prevent_user_access.constprop.0>
c003bbfc:   4b ff df 95 bl  c0039b90 
<.prevent_user_access.constprop.0>
c015dde0 <.prevent_user_access.constprop.0>:
c01612c0:   4b ff cb 21 bl  c015dde0 
<.prevent_user_access.constprop.0>
c0161b54:   4b ff c2 8d bl  c015dde0 
<.prevent_user_access.constprop.0>
c0188cf0 <.prevent_user_access.constprop.0>:
c018d658:   4b ff b6 99 bl  c0188cf0 
<.prevent_user_access.constprop.0>
c030fe20 <.prevent_user_access.constprop.0>:
c03123d4:   4b ff da 4d bl  c030fe20 
<.prevent_user_access.constprop.0>
c0313970:   4b ff c4 b1 bl  c030fe20 
<.prevent_user_access.constprop.0>
c05e6bd0 <.prevent_user_access.constprop.0>:
c05e7d8c:   4b ff ee 45 bl  c05e6bd0 
<.prevent_user_access.constprop.0>
c07bcae0 <.prevent_user_access.constprop.0>:
c07bda10:   4b ff f0 d1 bl  c07bcae0 
<.prevent_user_access.constprop.0>
c07bda54:   4b ff f0 8d bl  c07bcae0 
<.prevent_user_access.constprop.0>
c07bdd28:   4b ff ed b9 bl  c07bcae0 
<.prevent_user_access.constprop.0>
c07c0390:   4b ff c7 51 bl  c07bcae0 
<.prevent_user_access.constprop.0>
c094e4f0 <.prevent_user_access.constprop.0>:
c0950e40:   4b ff d6 b1 bl  c094e4f0 
<.prevent_user_access.constprop.0>
c097d2d0 <.prevent_user_access.constprop.0>:
c09813fc:   4b ff be d5 bl  c097d2d0 
<.prevent_user_access.constprop.0>
c0acd540 <.prevent_user_access.constprop.0>:
c0ad1d60:   4b ff b7 e1 bl  c0acd540 
<.prevent_user_access.constprop.0>
c0e5d680 <.prevent_user_access.constprop.0>:
c0e64b60:   4b ff 8b 21 bl  c0e5d680 
<.prevent_user_access.constprop.0>
c0e64b6c:   4b ff 8b 15 bl  c0e5d680 
<.prevent_user_access.constprop.0>
c0e64c38:   4b ff 8a 49 bl  c0e5d680 
<.prevent_user_access.constprop.0>

When building signal_64.c with -Winline the following messages appear:

./arch/powerpc/include/asm/book3s/64/kup.h:331:20: error: inlining 
failed in call to 'set_kuap': call is unlikely and code size would grow 
[-Werror=inline]
./arch/powerpc/include/asm/book3s/64/kup.h:401:20: error: inlining 
failed in call to 'prevent_user_access.constprop': call is unlikely and code 
size would grow [-Werror=inline]

Those functions are used on hot paths and have been
expected to be inlined at all times.

Force them inline.

This patch reduces the kernel text size by 700 bytes, confirming
that not inlining those functions is not worth it.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/book3s/64/kup.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/kup.h 
b/arch/powerpc/include/asm/book3s/64/kup.h
index 69fcf63eec94..54cf46808157 100644
--- a/arch/powerpc/include/asm/book3s/64/kup.h
+++ b/arch/powerpc/include/asm/book3s/64/kup.h
@@ -328,7 +328,7 @@ static inline unsigned long get_kuap(void)
return mfspr(SPRN_AMR);
 }
 
-static inline void set_kuap(unsigned long value)
+static __always_inline void set_kuap(unsigned long value)
 {
if (!mmu_has_feature(MMU_FTR_BOOK3S_KUAP))
return;
@@ -398,7 +398,7 @@ static __always_inline void allow_user_access(void __user 
*to, const void __user
 
 #endif /* !CONFIG_PPC_KUAP */
 
-static inline void prevent_user_access(unsigned long dir)
+static __always_inline void prevent_user_access(unsigned long dir)
 {
set_kuap(AMR_

Re: [PATCH v2 1/2] selftest/vm: Use correct PAGE_SHIFT value for ppc64

2022-02-11 Thread Aneesh Kumar K V

On 2/11/22 18:58, Mike Rapoport wrote:

Hi Aneesh,

On Fri, Feb 11, 2022 at 05:22:13PM +0530, Aneesh Kumar K V wrote:

On 2/11/22 16:03, Mike Rapoport wrote:

On Fri, Feb 11, 2022 at 12:03:28PM +0530, Aneesh Kumar K.V wrote:

Keep it simple by using a #define and limiting hugepage size to 2M.
This keeps the test simpler instead of dynamically finding the page size
and huge page size.

Without this tests are broken w.r.t reading /proc/self/pagemap

if (pread(pagemap_fd, ent, sizeof(ent),
(uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent))
err(2, "read pagemap");

Cc: Shuah Khan 
Cc: linux-kselft...@vger.kernel.org
Signed-off-by: Aneesh Kumar K.V 
---
   tools/testing/selftests/vm/ksm_tests.c| 9 -
   tools/testing/selftests/vm/transhuge-stress.c | 9 -
   2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/vm/ksm_tests.c 
b/tools/testing/selftests/vm/ksm_tests.c
index 1436e1a9a3d3..cae72872152b 100644
--- a/tools/testing/selftests/vm/ksm_tests.c
+++ b/tools/testing/selftests/vm/ksm_tests.c
@@ -22,7 +22,14 @@
   #define KSM_MERGE_ACROSS_NODES_DEFAULT true
   #define MB (1ul << 20)
-#define PAGE_SHIFT 12
+#ifdef __powerpc64__
+#define PAGE_SHIFT 16
+#else
+#define PAGE_SHIFT 12
+#endif


Page size can be other than 4096 for other configurations as well. And even
on ppc64 it's not necessarily 64k.



But the most common test config is with a 64K page size.


Ideally page size in selftests/vm should be sysconf(_SC_PAGESIZE)



Yes. As explained in the commit message, the idea was to keep it simpler.


I think it's simple enough (compile tested on x86 only):

 From 219577d87041f19f2c00dc7c23e0fd5aad8b02d5 Mon Sep 17 00:00:00 2001
From: Mike Rapoport 
Date: Fri, 11 Feb 2022 15:24:13 +0200
Subject: [PATCH] selftest/vm: add helpers to detect PAGE_SIZE and PAGE_SHIFT

PAGE_SIZE is not 4096 in many configurations; particularly, ppc64 uses
64K pages in the majority of cases.

Add helpers to detect PAGE_SIZE and PAGE_SHIFT dynamically.

Signed-off-by: Mike Rapoport 
---
  tools/testing/selftests/vm/gup_test.c |  3 +-
  tools/testing/selftests/vm/ksm_tests.c|  8 +
  tools/testing/selftests/vm/transhuge-stress.c |  9 ++
  tools/testing/selftests/vm/util.h | 29 +++
  4 files changed, 34 insertions(+), 15 deletions(-)
  create mode 100644 tools/testing/selftests/vm/util.h

diff --git a/tools/testing/selftests/vm/gup_test.c 
b/tools/testing/selftests/vm/gup_test.c
index fe043f67798b..cda837a14736 100644
--- a/tools/testing/selftests/vm/gup_test.c
+++ b/tools/testing/selftests/vm/gup_test.c
@@ -10,8 +10,9 @@
  #include 
  #include "../../../../mm/gup_test.h"
  
+#include "util.h"

+
  #define MB (1UL << 20)
-#define PAGE_SIZE sysconf(_SC_PAGESIZE)
  
  /* Just the flags we need, copied from mm.h: */

  #define FOLL_WRITE0x01/* check pte is writable */
diff --git a/tools/testing/selftests/vm/ksm_tests.c 
b/tools/testing/selftests/vm/ksm_tests.c
index cae72872152b..7faafd24446f 100644
--- a/tools/testing/selftests/vm/ksm_tests.c
+++ b/tools/testing/selftests/vm/ksm_tests.c
@@ -12,6 +12,7 @@
  
  #include "../kselftest.h"

  #include "../../../../include/vdso/time64.h"
+#include "util.h"
  
  #define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/"

  #define KSM_FP(s) (KSM_SYSFS_PATH s)
@@ -22,17 +23,10 @@
  #define KSM_MERGE_ACROSS_NODES_DEFAULT true
  #define MB (1ul << 20)
  
-#ifdef __powerpc64__

-#define PAGE_SHIFT 16
-#else
-#define PAGE_SHIFT 12
-#endif
  /*
   * On ppc64 this will only work with radix 2M hugepage size
   */
  #define HPAGE_SHIFT 21
-
-#define PAGE_SIZE (1 << PAGE_SHIFT)
  #define HPAGE_SIZE (1 << HPAGE_SHIFT)
  
  #define PAGEMAP_PRESENT(ent)	(((ent) & (1ull << 63)) != 0)

diff --git a/tools/testing/selftests/vm/transhuge-stress.c 
b/tools/testing/selftests/vm/transhuge-stress.c
index b1f8d98355c5..baf90a745d28 100644
--- a/tools/testing/selftests/vm/transhuge-stress.c
+++ b/tools/testing/selftests/vm/transhuge-stress.c
@@ -16,17 +16,12 @@
  #include 
  #include 
  
-#ifdef __powerpc64__

-#define PAGE_SHIFT 16
-#else
-#define PAGE_SHIFT 12
-#endif
+#include "util.h"
+
  /*
   * On ppc64 this will only work with radix 2M hugepage size
   */
  #define HPAGE_SHIFT 21
-
-#define PAGE_SIZE (1 << PAGE_SHIFT)
  #define HPAGE_SIZE (1 << HPAGE_SHIFT)
  
  #define PAGEMAP_PRESENT(ent)	(((ent) & (1ull << 63)) != 0)

diff --git a/tools/testing/selftests/vm/util.h 
b/tools/testing/selftests/vm/util.h
new file mode 100644
index ..1c85d7583bac
--- /dev/null
+++ b/tools/testing/selftests/vm/util.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __KSELFTEST_VM_UTIL_H
+#define __KSELFTEST_VM_UTIL_H
+
+#include  /* ffsl() */
+#include  /* _SC_PAGESIZE */
+
+static unsigned __page_size;
+static unsigned __page_shift;
+
+static inline unsigned page_size(void)
+{
+   if (!__page_size)
+   __page_size = sysconf(_S
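
(The archived message is cut off above. Purely as an illustration of the
sysconf-based approach being discussed, and not the actual code from the
patch, a pair of helpers with made-up names could look like this:)

#include <unistd.h>
#include <stdio.h>

/* Illustrative sketch only -- not the truncated patch above: derive the
 * page size and page shift at run time instead of hard-coding PAGE_SHIFT. */
static unsigned long detect_page_size(void)
{
        return (unsigned long)sysconf(_SC_PAGESIZE);
}

static unsigned int detect_page_shift(void)
{
        /* the page size is a power of two, so count its trailing zero bits */
        return (unsigned int)__builtin_ctzl(detect_page_size());
}

int main(void)
{
        printf("page size: %lu, page shift: %u\n",
               detect_page_size(), detect_page_shift());
        return 0;
}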

Re: [PATCH v2 1/2] selftest/vm: Use correct PAGE_SHIFT value for ppc64

2022-02-11 Thread Mike Rapoport
Hi Aneesh,

On Fri, Feb 11, 2022 at 05:22:13PM +0530, Aneesh Kumar K V wrote:
> On 2/11/22 16:03, Mike Rapoport wrote:
> > On Fri, Feb 11, 2022 at 12:03:28PM +0530, Aneesh Kumar K.V wrote:
> > > Keep it simple by using a #define and limiting hugepage size to 2M.
> > > This keeps the test simpler instead of dynamically finding the page size
> > > and huge page size.
> > > 
> > > Without this tests are broken w.r.t reading /proc/self/pagemap
> > > 
> > >   if (pread(pagemap_fd, ent, sizeof(ent),
> > >   (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent))
> > >   err(2, "read pagemap");
> > > 
> > > Cc: Shuah Khan 
> > > Cc: linux-kselft...@vger.kernel.org
> > > Signed-off-by: Aneesh Kumar K.V 
> > > ---
> > >   tools/testing/selftests/vm/ksm_tests.c| 9 -
> > >   tools/testing/selftests/vm/transhuge-stress.c | 9 -
> > >   2 files changed, 16 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/tools/testing/selftests/vm/ksm_tests.c 
> > > b/tools/testing/selftests/vm/ksm_tests.c
> > > index 1436e1a9a3d3..cae72872152b 100644
> > > --- a/tools/testing/selftests/vm/ksm_tests.c
> > > +++ b/tools/testing/selftests/vm/ksm_tests.c
> > > @@ -22,7 +22,14 @@
> > >   #define KSM_MERGE_ACROSS_NODES_DEFAULT true
> > >   #define MB (1ul << 20)
> > > -#define PAGE_SHIFT 12
> > > +#ifdef __powerpc64__
> > > +#define PAGE_SHIFT   16
> > > +#else
> > > +#define PAGE_SHIFT   12
> > > +#endif
> > 
> > Page size can be other than 4096 for other configurations as well. And even
> > on ppc64 it's not necessarily 64k.
> > 
> 
> But the most common test config is with a 64K page size.
> 
> > Ideally page size in selftests/vm should be sysconf(_SC_PAGESIZE)
> 
> 
> Yes. As explained in the commit message, the idea was to keep it simpler.

I think it's simple enough (compile tested on x86 only):

From 219577d87041f19f2c00dc7c23e0fd5aad8b02d5 Mon Sep 17 00:00:00 2001
From: Mike Rapoport 
Date: Fri, 11 Feb 2022 15:24:13 +0200
Subject: [PATCH] selftest/vm: add helpers to detect PAGE_SIZE and PAGE_SHIFT

PAGE_SIZE is not 4096 in many configurations; particularly, ppc64 uses
64K pages in the majority of cases.

Add helpers to detect PAGE_SIZE and PAGE_SHIFT dynamically.

Signed-off-by: Mike Rapoport 
---
 tools/testing/selftests/vm/gup_test.c |  3 +-
 tools/testing/selftests/vm/ksm_tests.c|  8 +
 tools/testing/selftests/vm/transhuge-stress.c |  9 ++
 tools/testing/selftests/vm/util.h | 29 +++
 4 files changed, 34 insertions(+), 15 deletions(-)
 create mode 100644 tools/testing/selftests/vm/util.h

diff --git a/tools/testing/selftests/vm/gup_test.c 
b/tools/testing/selftests/vm/gup_test.c
index fe043f67798b..cda837a14736 100644
--- a/tools/testing/selftests/vm/gup_test.c
+++ b/tools/testing/selftests/vm/gup_test.c
@@ -10,8 +10,9 @@
 #include 
 #include "../../../../mm/gup_test.h"
 
+#include "util.h"
+
 #define MB (1UL << 20)
-#define PAGE_SIZE sysconf(_SC_PAGESIZE)
 
 /* Just the flags we need, copied from mm.h: */
 #define FOLL_WRITE 0x01/* check pte is writable */
diff --git a/tools/testing/selftests/vm/ksm_tests.c 
b/tools/testing/selftests/vm/ksm_tests.c
index cae72872152b..7faafd24446f 100644
--- a/tools/testing/selftests/vm/ksm_tests.c
+++ b/tools/testing/selftests/vm/ksm_tests.c
@@ -12,6 +12,7 @@
 
 #include "../kselftest.h"
 #include "../../../../include/vdso/time64.h"
+#include "util.h"
 
 #define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/"
 #define KSM_FP(s) (KSM_SYSFS_PATH s)
@@ -22,17 +23,10 @@
 #define KSM_MERGE_ACROSS_NODES_DEFAULT true
 #define MB (1ul << 20)
 
-#ifdef __powerpc64__
-#define PAGE_SHIFT 16
-#else
-#define PAGE_SHIFT 12
-#endif
 /*
  * On ppc64 this will only work with radix 2M hugepage size
  */
 #define HPAGE_SHIFT 21
-
-#define PAGE_SIZE (1 << PAGE_SHIFT)
 #define HPAGE_SIZE (1 << HPAGE_SHIFT)
 
 #define PAGEMAP_PRESENT(ent)   (((ent) & (1ull << 63)) != 0)
diff --git a/tools/testing/selftests/vm/transhuge-stress.c 
b/tools/testing/selftests/vm/transhuge-stress.c
index b1f8d98355c5..baf90a745d28 100644
--- a/tools/testing/selftests/vm/transhuge-stress.c
+++ b/tools/testing/selftests/vm/transhuge-stress.c
@@ -16,17 +16,12 @@
 #include 
 #include 
 
-#ifdef __powerpc64__
-#define PAGE_SHIFT 16
-#else
-#define PAGE_SHIFT 12
-#endif
+#include "util.h"
+
 /*
  * On ppc64 this will only work with radix 2M hugepage size
  */
 #define HPAGE_SHIFT 21
-
-#define PAGE_SIZE (1 << PAGE_SHIFT)
 #define HPAGE_SIZE (1 << HPAGE_SHIFT)
 
 #define PAGEMAP_PRESENT(ent)   (((ent) & (1ull << 63)) != 0)
diff --git a/tools/testing/selftests/vm/util.h 
b/tools/testing/selftests/vm/util.h
new file mode 100644
index ..1c85d7583bac
--- /dev/null
+++ b/tools/testing/selftests/vm/util.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __KSELFTEST_VM_UTIL_H
+#define __KSELFTEST_VM_UTIL_H
+
+#include  /* ffsl() */
+#include  /* _SC_PAGESIZE */
+
+static unsigned __page_size;
+static unsigned __pag

Re: [RFC] Upstreaming Linux for Nintendo Wii U

2022-02-11 Thread Christophe Leroy
Hi Ash,

On 11/02/2022 at 12:29, Michael Ellerman wrote:
> Ash Logan  writes:
>> - Like the Wii before it, the Wii U has a small amount of RAM at address
>> zero, a gap, then a large amount of RAM at a higher address. Instead of
>> the "map everything and reserve the gap" approach of the Wii, we loop
>> over each memblock and map only true RAM[9]. This seems to work, but as
>> far as I can tell is unique amongst powerpc32 platforms, so it's worth
>> pointing out. (Note: I've been told this doesn't work anymore after some
>> KUAP changes[10], so this point might be moot; haven't investigated)
> 
> We'd need more detail on that I guess. Currently all the 32-bit
> platforms use the flat memory model, which assumes RAM is a single
> contiguous block. Though that doesn't mean it all has to be used or
> mapped, like the Wii does. To properly support your layout you should be
> using sparsemem, but it's possible that's more trouble than it's worth,
> I'm not sure. How far apart are the low and high blocks of RAM, and what
> are their sizes?

Can you provide details on what's happening with KUAP changes ?

You are pointing to series https://lkml.org/lkml/2021/6/3/204

Does it work when CONFIG_PPC_KUAP is not selected or doesn't it work 
either ?

Are you able to bisect which commit of that series is the culprit ?

Thanks
Christophe

Re: [PATCH v2] powerpc/mm: Update default hugetlb size early

2022-02-11 Thread David Hildenbrand
On 11.02.22 13:23, Aneesh Kumar K.V wrote:
> David Hildenbrand  writes:
> 
>> On 11.02.22 10:16, Aneesh Kumar K V wrote:
>>> On 2/11/22 14:00, David Hildenbrand wrote:
 On 11.02.22 07:52, Aneesh Kumar K.V wrote:
> commit: d9c234005227 ("Do not depend on MAX_ORDER when grouping pages by 
> mobility")
> introduced pageblock_order which will be used to group pages better.
> The kernel now groups pages based on the value of HPAGE_SHIFT. Hence 
> HPAGE_SHIFT
> should be set before we call set_pageblock_order.
>
> set_pageblock_order happens early in the boot and default hugetlb page 
> size
> should be initialized before that to compute the right pageblock_order 
> value.
>
> Currently, default hugetlb page size is set via arch_initcalls which 
> happens
> late in the boot as shown via the below callstack:
>
> [c7383b10] [c1289328] hugetlbpage_init+0x2b8/0x2f8
> [c7383bc0] [c12749e4] do_one_initcall+0x14c/0x320
> [c7383c90] [c127505c] kernel_init_freeable+0x410/0x4e8
> [c7383da0] [c0012664] kernel_init+0x30/0x15c
> [c7383e10] [c000cf14] ret_from_kernel_thread+0x5c/0x64
>
> and the pageblock_order initialization is done early during the boot.
>
> [c18bfc80] [c12ae120] set_pageblock_order+0x50/0x64
> [c18bfca0] [c12b3d94] sparse_init+0x188/0x268
> [c18bfd60] [c1288bfc] initmem_init+0x28c/0x328
> [c18bfe50] [c127b370] setup_arch+0x410/0x480
> [c18bfed0] [c127401c] start_kernel+0xb8/0x934
> [c18bff90] [c000d984] start_here_common+0x1c/0x98
>
> delaying default hugetlb page size initialization implies the kernel will
> initialize pageblock_order to (MAX_ORDER - 1) which is not an optimal
> value for mobility grouping. IIUC we always had this issue. But it was not
> a problem for hash translation mode because (MAX_ORDER - 1) is the same as
> HUGETLB_PAGE_ORDER (8) in the case of hash (16MB). With radix,
> HUGETLB_PAGE_ORDER will be 5 (2M size) and hence pageblock_order should be
> 5 instead of 8.


 A related question: Can we on ppc still have pageblock_order > MAX_ORDER
 - 1? We have some code for that and I am not so sure if we really need 
 that.

>>>
>>> I also have been wondering about the same. On book3s64 I don't think we 
>>> need that support for both 64K and 4K page size because with hash 
>>> hugetlb size is MAX_ORDER -1. (16MB hugepage size)
>>>
>>> I am not sure about the 256K page support. Christophe may be able to 
>>> answer that.
>>>
>>> For the gigantic hugepage support we depend on cma based allocation or
>>> firmware reservation. So I am not sure why we ever considered pageblock 
>>>  > MAX_ORDER -1 scenario. If you have pointers w.r.t why that was ever 
>>> needed, I could double-check whether ppc64 is still dependent on that.
>>
>> commit dc78327c0ea7da5186d8cbc1647bd6088c5c9fa5
>> Author: Michal Nazarewicz 
>> Date:   Wed Jul 2 15:22:35 2014 -0700
>>
>> mm: page_alloc: fix CMA area initialisation when pageblock > MAX_ORDER
>>
>> indicates that at least arm64 used to have cases for that as well.
>>
>> However, nowadays with ARM64_64K_PAGES we have FORCE_MAX_ZONEORDER=14 as
>> default, corresponding to 512MiB.
>>
>> So I'm not sure if this is something worth supporting. If you want
>> somewhat reliable gigantic pages, use CMA or preallocate them during boot.
>>
>> -- 
>> Thanks,
>>
>> David / dhildenb
> 
> I could build a kernel with FORCE_MAX_ZONEORDER=8 and pageblock_order =
> 8. We need to disable THP for such a kernel to boot, because THP checks
> for PMD_ORDER < MAX_ORDER. I was able to boot that kernel on a
> virtualized platform, but then gigantic_page_runtime_supported is not
> supported on such config with hash translation.
> 

I'm currently playing with the idea of the following (uncompiled, untested):

From 68e0a158a5060bc1a175d12b20e21794763a33df Mon Sep 17 00:00:00 2001
From: David Hildenbrand 
Date: Fri, 11 Feb 2022 11:40:27 +0100
Subject: [PATCH] mm: enforce pageblock_order < MAX_ORDER

Some places in the kernel don't really expect pageblock_order >=
MAX_ORDER, and it looks like this is only possible in corner cases:

1) With CONFIG_DEFERRED_STRUCT_PAGE_INIT we'll end up freeing pageblock_order
   pages via __free_pages_core(), which cannot possibly work.

2) mm/page_reporting.c won't be reporting any pages with default
   page_reporting_order == pageblock_order, as we'll be skipping the
   reporting loop inside page_reporting_process_zone().

3) __rmqueue_fallback() will never be able to steal with
   ALLOC_NOFRAGMENT.

4) find_zone_movable_pfns_for_nodes() will roundup the ZONE_MOVABLE
   start PFN to MAX_ORDER_NR_PAGES. Consequently with a bigger
   pageblock_order, we could have pageblocks partially managed by two
   zones.
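
A minimal sketch of the clamp this draft is driving at, assuming the
CONFIG_HUGETLB_PAGE_SIZE_VARIABLE flavour of set_pageblock_order() remains the
single place the value is assigned (illustration only, not part of the draft
patch itself):

/* Sketch: never let pageblock_order exceed MAX_ORDER - 1, whatever
 * HUGETLB_PAGE_ORDER ends up being on the platform. */
void __init set_pageblock_order(void)
{
	unsigned int order = MAX_ORDER - 1;

	/* Prefer the huge page order when it is smaller. */
	if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order)
		order = HUGETLB_PAGE_ORDER;

	pageblock_order = order;
}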


Re: [PATCH v2] powerpc/mm: Update default hugetlb size early

2022-02-11 Thread Aneesh Kumar K.V
David Hildenbrand  writes:

> On 11.02.22 10:16, Aneesh Kumar K V wrote:
>> On 2/11/22 14:00, David Hildenbrand wrote:
>>> On 11.02.22 07:52, Aneesh Kumar K.V wrote:
 commit: d9c234005227 ("Do not depend on MAX_ORDER when grouping pages by 
 mobility")
 introduced pageblock_order which will be used to group pages better.
 The kernel now groups pages based on the value of HPAGE_SHIFT. Hence 
 HPAGE_SHIFT
 should be set before we call set_pageblock_order.

 set_pageblock_order happens early in the boot and default hugetlb page size
 should be initialized before that to compute the right pageblock_order 
 value.

 Currently, default hugetlbe page size is set via arch_initcalls which 
 happens
 late in the boot as shown via the below callstack:

 [c7383b10] [c1289328] hugetlbpage_init+0x2b8/0x2f8
 [c7383bc0] [c12749e4] do_one_initcall+0x14c/0x320
 [c7383c90] [c127505c] kernel_init_freeable+0x410/0x4e8
 [c7383da0] [c0012664] kernel_init+0x30/0x15c
 [c7383e10] [c000cf14] ret_from_kernel_thread+0x5c/0x64

 and the pageblock_order initialization is done early during the boot.

 [c18bfc80] [c12ae120] set_pageblock_order+0x50/0x64
 [c18bfca0] [c12b3d94] sparse_init+0x188/0x268
 [c18bfd60] [c1288bfc] initmem_init+0x28c/0x328
 [c18bfe50] [c127b370] setup_arch+0x410/0x480
 [c18bfed0] [c127401c] start_kernel+0xb8/0x934
 [c18bff90] [c000d984] start_here_common+0x1c/0x98

 delaying default hugetlb page size initialization implies the kernel will
 initialize pageblock_order to (MAX_ORDER - 1) which is not an optimal
 value for mobility grouping. IIUC we always had this issue. But it was not
 a problem for hash translation mode because (MAX_ORDER - 1) is the same as
 HUGETLB_PAGE_ORDER (8) in the case of hash (16MB). With radix,
 HUGETLB_PAGE_ORDER will be 5 (2M size) and hence pageblock_order should be
 5 instead of 8.
>>>
>>>
>>> A related question: Can we on ppc still have pageblock_order > MAX_ORDER
>>> - 1? We have some code for that and I am not so sure if we really need that.
>>>
>> 
>> I also have been wondering about the same. On book3s64 I don't think we 
>> need that support for both 64K and 4K page size because with hash 
>> hugetlb size is MAX_ORDER -1. (16MB hugepage size)
>> 
>> I am not sure about the 256K page support. Christophe may be able to 
>> answer that.
>> 
>> For the gigantic hugepage support we depend on cma based allocation or
>> firmware reservation. So I am not sure why we ever considered pageblock 
>>  > MAX_ORDER -1 scenario. If you have pointers w.r.t why that was ever 
>> needed, I could double-check whether ppc64 is still dependent on that.
>
> commit dc78327c0ea7da5186d8cbc1647bd6088c5c9fa5
> Author: Michal Nazarewicz 
> Date:   Wed Jul 2 15:22:35 2014 -0700
>
> mm: page_alloc: fix CMA area initialisation when pageblock > MAX_ORDER
>
> indicates that at least arm64 used to have cases for that as well.
>
> However, nowadays with ARM64_64K_PAGES we have FORCE_MAX_ZONEORDER=14 as
> default, corresponding to 512MiB.
>
> So I'm not sure if this is something worth supporting. If you want
> somewhat reliable gigantic pages, use CMA or preallocate them during boot.
>
> -- 
> Thanks,
>
> David / dhildenb

I could build a kernel with FORCE_MAX_ZONEORDER=8 and pageblock_order =
8. We need to disable THP for such a kernel to boot, because THP checks
for PMD_ORDER < MAX_ORDER. I was able to boot that kernel on a
virtualized platform, but then gigantic_page_runtime_supported is not
supported on such config with hash translation.

On a non-virtualized platform I am hitting crashes like the one below during boot.

[   47.637865][   C42] =============================================================================
[   47.637907][   C42] BUG pgtable-2^11 (Not tainted): Object already free
[   47.637925][   C42] -----------------------------------------------------------------------------
[   47.637925][   C42]
[   47.637945][   C42] Allocated in __pud_alloc+0x84/0x2a0 age=278 cpu=40 pid=1409
[   47.637974][   C42]  __slab_alloc.isra.0+0x40/0x60

Re: [PATCH v2 1/2] selftest/vm: Use correct PAGE_SHIFT value for ppc64

2022-02-11 Thread Aneesh Kumar K V

On 2/11/22 16:03, Mike Rapoport wrote:

On Fri, Feb 11, 2022 at 12:03:28PM +0530, Aneesh Kumar K.V wrote:

Keep it simple by using a #define and limiting hugepage size to 2M.
This keeps the test simpler instead of dynamically finding the page size
and huge page size.

Without this tests are broken w.r.t reading /proc/self/pagemap

if (pread(pagemap_fd, ent, sizeof(ent),
(uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent))
err(2, "read pagemap");

Cc: Shuah Khan 
Cc: linux-kselft...@vger.kernel.org
Signed-off-by: Aneesh Kumar K.V 
---
  tools/testing/selftests/vm/ksm_tests.c| 9 -
  tools/testing/selftests/vm/transhuge-stress.c | 9 -
  2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/vm/ksm_tests.c 
b/tools/testing/selftests/vm/ksm_tests.c
index 1436e1a9a3d3..cae72872152b 100644
--- a/tools/testing/selftests/vm/ksm_tests.c
+++ b/tools/testing/selftests/vm/ksm_tests.c
@@ -22,7 +22,14 @@
  #define KSM_MERGE_ACROSS_NODES_DEFAULT true
  #define MB (1ul << 20)
  
-#define PAGE_SHIFT 12

+#ifdef __powerpc64__
+#define PAGE_SHIFT 16
+#else
+#define PAGE_SHIFT 12
+#endif


Page size can be other than 4096 for other configurations as well. And even
on ppc64 it's not necessarily 64k.



But the most common test config uses a 64K page size.


Ideally page size in selftests/vm should be sysconf(_SC_PAGESIZE)



Yes. As explained in the commit message, the idea was to keep it simple.

"Keep it simple by using a #define and limiting hugepage size to 2M.
This keeps the test simpler instead of dynamically finding the page size
and huge page size.

Without this tests are broken w.r.t reading /proc/self/pagemap"

We can definitely look at updating multiple tests in selftest/vm to work
with dynamic values of page size and hugepage size. But can that be done
outside this patch?
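
For reference, a runtime-detection sketch along the lines Mike suggests
(illustrative only; it assumes sysconf(_SC_PAGESIZE) and leaves the huge page
size hard-coded) could look like:

#include <stdint.h>
#include <unistd.h>

/* Derive the page shift at runtime instead of hard-coding PAGE_SHIFT;
 * the result feeds the same (shift - 3) arithmetic used when indexing
 * /proc/self/pagemap. */
static unsigned int page_shift(void)
{
	long size = sysconf(_SC_PAGESIZE);
	unsigned int shift = 0;

	while ((1L << shift) < size)
		shift++;
	return shift;
}

The pagemap read then becomes
pread(pagemap_fd, ent, sizeof(ent), (uintptr_t)ptr >> (page_shift() - 3)).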





+/*
+ * On ppc64 this will only work with radix 2M hugepage size
+ */
  #define HPAGE_SHIFT 21
  
  #define PAGE_SIZE (1 << PAGE_SHIFT)

diff --git a/tools/testing/selftests/vm/transhuge-stress.c 
b/tools/testing/selftests/vm/transhuge-stress.c
index 5e4c036f6ad3..b1f8d98355c5 100644
--- a/tools/testing/selftests/vm/transhuge-stress.c
+++ b/tools/testing/selftests/vm/transhuge-stress.c
@@ -16,7 +16,14 @@
  #include 
  #include 
  
-#define PAGE_SHIFT 12

+#ifdef __powerpc64__
+#define PAGE_SHIFT 16
+#else
+#define PAGE_SHIFT 12
+#endif
+/*
+ * On ppc64 this will only work with radix 2M hugepage size
+ */
  #define HPAGE_SHIFT 21
  
  #define PAGE_SIZE (1 << PAGE_SHIFT)

--
2.34.1








Re: [PATCH kernel 2/3] powerpc/llvm: Sample config for LLVM LTO

2022-02-11 Thread Naveen N. Rao

Alexey Kardashevskiy wrote:


Disables CONFIG_FTRACE_MCOUNT_USE_RECORDMCOUNT as CONFIG_HAS_LTO_CLANG
depends on it being disabled. In order to avoid disabling way too many
options (like DYNAMIC_FTRACE/FUNCTION_TRACER), this converts
FTRACE_MCOUNT_USE_RECORDMCOUNT from def_bool to bool.





+CONFIG_FTRACE_MCOUNT_USE_RECORDMCOUNT=n


I don't think that will work since we have no other ways of generating 
mcount locations on powerpc. And since we decided to only support 
DYNAMIC_FTRACE, I guess we will need to disable FUNCTION_TRACER to get 
this working, for now.


I am looking into ways to get ftrace working in this scenario.


- Naveen


Re: [RFC PATCH 2/3] powerpc/ftrace: Override ftrace_location_lookup() for MPROFILE_KERNEL

2022-02-11 Thread Naveen N. Rao

Steven Rostedt wrote:

On Thu, 10 Feb 2022 16:40:28 +
"Naveen N. Rao"  wrote:

The other option is to mark ftrace_cmp_recs() as a __weak function, but 
I have a vague recollection of you suggesting #ifdef rather than a 
__weak function in the past. I might be mis-remembering, so if you think 
making this a __weak function is better, I can do that.


No. If I wanted that I would have suggested it. I think this is the
prettiest of the ugly solutions out there ;-)


Understood :)



As I said, I can't think of a better solution, and we can go with this
until something else comes along.


Thanks,
- Naveen


Re: [RFC] Upstreaming Linux for Nintendo Wii U

2022-02-11 Thread Michael Ellerman
Ash Logan  writes:
> Hello,

Hi Ash,

I can't really answer all your questions, but I can chime in on one or
two things ...

> - Right now I've made a new platform (like ps3) rather than joining the
> GameCube and Wii in embedded6xx, since that is marked as BROKEN_ON_SMP.
> The Wii U is a 3-core system, though a CPU bug[8] prevents existing
> userspaces working with it. Bit of a "cross that bridge when we get
> there" situation, though I'm reluctant to prevent that possibility by
> using a BROKEN_ON_SMP platform.

I'm happy for it to be a new platform. I'd almost prefer it to be a
separate platform, that way you can make changes in your platform code
without worrying (as much) about breaking other platforms.

> - Like the Wii before it, the Wii U has a small amount of RAM at address
> zero, a gap, then a large amount of RAM at a higher address. Instead of
> the "map everything and reserve the gap" approach of the Wii, we loop
> over each memblock and map only true RAM[9]. This seems to work, but as
> far as I can tell is unique amongst powerpc32 platforms, so it's worth
> pointing out. (Note: I've been told this doesn't work anymore after some
> KUAP changes[10], so this point might be moot; haven't investigated)

We'd need more detail on that I guess. Currently all the 32-bit
platforms use the flat memory model, which assumes RAM is a single
contiguous block. Though that doesn't mean it all has to be used or
mapped, like the Wii does. To properly support your layout you should be
using sparsemem, but it's possible that's more trouble than it's worth,
I'm not sure. How far apart are the low and high blocks of RAM, and what
are their sizes?

> - Due to the aforementioned DMA restrictions and possibly a fatal
> bytemasking bug on uncached mappings[11], I have been wondering if it'd
> be better to just give up on the SRAM at address 0 altogether and use it
> as VRAM or something, loading the kernel at a higher address.

Don't you have exceptions entering down at low addresses? Even so you
could possibly trampoline them up to the kernel at a high address.
 
> In terms of platform bringup, the key issue is whether to be embedded6xx
> or not and what output device to use. Beyond that it's just things like
> IRQ controller drivers, should be pretty straightforward. I think on our
> end, we'll start rebasing to 5.15 (LTS) and start sending patches from
> there. I know getting closer to HEAD is preferable, this project has
> just moved very slowly in the past and being on LTS has been a lifesaver.

As I said I'm happy for it to be a new platform. If there ends up being
a lot of shared code we can always refactor, but embedded6xx is only
~1500 LOC anyway.

One thing that has come up with previous console port submissions is the
requirement for patches to be signed off. The docs are here if you
aren't familiar with them:
  
https://www.kernel.org/doc/html/latest/process/submitting-patches.html#sign-your-work-the-developer-s-certificate-of-origin

Otherwise your plan sounds good to me, 4.19 is pretty old so getting up
to 5.15 would be a good start. Then submit whatever bits you can and
chip away at it.

cheers
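
For context, the "loop over each memblock and map only true RAM" approach Ash
describes is roughly the following (a sketch, assuming the generic memblock
iterator; wiiu_map_ram_chunk() is a hypothetical stand-in for whatever mapping
helper the platform ends up using):

#include <linux/memblock.h>

/* Sketch only: walk the true-RAM regions registered with memblock and
 * map each one, instead of mapping 0..top and reserving the hole.
 * wiiu_map_ram_chunk() is hypothetical. */
static void __init wiiu_map_memblocks(void)
{
	phys_addr_t start, end;
	u64 i;

	for_each_mem_range(i, &start, &end)
		wiiu_map_ram_chunk(start, end);
}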


Re: [PATCH v2 1/2] selftest/vm: Use correct PAGE_SHIFT value for ppc64

2022-02-11 Thread Mike Rapoport
On Fri, Feb 11, 2022 at 12:03:28PM +0530, Aneesh Kumar K.V wrote:
> Keep it simple by using a #define and limiting hugepage size to 2M.
> This keeps the test simpler instead of dynamically finding the page size
> and huge page size.
> 
> Without this tests are broken w.r.t reading /proc/self/pagemap
> 
>   if (pread(pagemap_fd, ent, sizeof(ent),
>   (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent))
>   err(2, "read pagemap");
> 
> Cc: Shuah Khan 
> Cc: linux-kselft...@vger.kernel.org
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  tools/testing/selftests/vm/ksm_tests.c| 9 -
>  tools/testing/selftests/vm/transhuge-stress.c | 9 -
>  2 files changed, 16 insertions(+), 2 deletions(-)
> 
> diff --git a/tools/testing/selftests/vm/ksm_tests.c 
> b/tools/testing/selftests/vm/ksm_tests.c
> index 1436e1a9a3d3..cae72872152b 100644
> --- a/tools/testing/selftests/vm/ksm_tests.c
> +++ b/tools/testing/selftests/vm/ksm_tests.c
> @@ -22,7 +22,14 @@
>  #define KSM_MERGE_ACROSS_NODES_DEFAULT true
>  #define MB (1ul << 20)
>  
> -#define PAGE_SHIFT 12
> +#ifdef __powerpc64__
> +#define PAGE_SHIFT   16
> +#else
> +#define PAGE_SHIFT   12
> +#endif

Page size can be other than 4096 for other configurations as well. And even
on ppc64 it's not necessarily 64k.

Ideally page size in selftests/vm should be sysconf(_SC_PAGESIZE)

> +/*
> + * On ppc64 this will only work with radix 2M hugepage size
> + */
>  #define HPAGE_SHIFT 21
>  
>  #define PAGE_SIZE (1 << PAGE_SHIFT)
> diff --git a/tools/testing/selftests/vm/transhuge-stress.c 
> b/tools/testing/selftests/vm/transhuge-stress.c
> index 5e4c036f6ad3..b1f8d98355c5 100644
> --- a/tools/testing/selftests/vm/transhuge-stress.c
> +++ b/tools/testing/selftests/vm/transhuge-stress.c
> @@ -16,7 +16,14 @@
>  #include 
>  #include 
>  
> -#define PAGE_SHIFT 12
> +#ifdef __powerpc64__
> +#define PAGE_SHIFT   16
> +#else
> +#define PAGE_SHIFT   12
> +#endif
> +/*
> + * On ppc64 this will only work with radix 2M hugepage size
> + */
>  #define HPAGE_SHIFT 21
>  
>  #define PAGE_SIZE (1 << PAGE_SHIFT)
> -- 
> 2.34.1
> 
> 

-- 
Sincerely yours,
Mike.


[PATCH 1/2] net: Allow csum_sub() to be provided in arch

2022-02-11 Thread Christophe Leroy
In the same spirit as commit 07064c6e022b ("net: Allow csum_add to be
provided in arch"), allow csum_sub() to be provided by arch.

The generic implementation of csum_sub() calls csum_add() with the
complement of the addend.

Some architectures can do it directly.

This will also avoid getting several copies of csum_sub() outlined
when building with -Os.

Signed-off-by: Christophe Leroy 
---
 include/net/checksum.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/net/checksum.h b/include/net/checksum.h
index 9badcd5532ef..735d98724145 100644
--- a/include/net/checksum.h
+++ b/include/net/checksum.h
@@ -62,10 +62,12 @@ static inline __wsum csum_add(__wsum csum, __wsum addend)
 }
 #endif
 
+#ifndef HAVE_ARCH_CSUM_SUB
 static inline __wsum csum_sub(__wsum csum, __wsum addend)
 {
return csum_add(csum, ~addend);
 }
+#endif
 
 static inline __sum16 csum16_add(__sum16 csum, __be16 addend)
 {
-- 
2.34.1



[PATCH 2/2] powerpc/32: Implement csum_sub

2022-02-11 Thread Christophe Leroy
When building the kernel with CONFIG_CC_OPTIMISE_FOR_SIZE, several
copies of csum_sub() are generated, with the following code:

 0170 <csum_sub>:
 170:   7c 84 20 f8 not     r4,r4
 174:   7c 63 20 14 addc    r3,r3,r4
 178:   7c 63 01 94 addze   r3,r3
 17c:   4e 80 00 20 blr

Let's define a PPC32 version with subc/addme, and force its inlining.

It will return 0 instead of ~0 when subtracting 0x8000 from itself;
this is not an issue as 0 and ~0 are equivalent, refer to RFC 1624.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/checksum.h | 16 
 1 file changed, 16 insertions(+)

diff --git a/arch/powerpc/include/asm/checksum.h 
b/arch/powerpc/include/asm/checksum.h
index 350de8f90250..3288a1bf5e8d 100644
--- a/arch/powerpc/include/asm/checksum.h
+++ b/arch/powerpc/include/asm/checksum.h
@@ -112,6 +112,22 @@ static __always_inline __wsum csum_add(__wsum csum, __wsum 
addend)
 #endif
 }
 
+#ifdef CONFIG_PPC32
+#define HAVE_ARCH_CSUM_SUB
+static __always_inline __wsum csum_sub(__wsum csum, __wsum addend)
+{
+   if (__builtin_constant_p(csum) && (csum == 0 || csum == ~0))
+   return ~addend;
+   if (__builtin_constant_p(addend) && (addend == 0 || addend == ~0))
+   return csum;
+
+   asm("subc %0,%0,%1;"
+   "addme %0,%0;"
+   : "+r" (csum) : "r" (addend) : "xer");
+   return csum;
+}
+#endif
+
 /*
  * This is a version of ip_compute_csum() optimized for IP headers,
  * which always checksum on 4 octet boundaries.  ihl is the number
-- 
2.34.1
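
The 0 vs ~0 point above can be sanity-checked in user space: both values act
as ones' complement zero, so adding either of them to a non-zero running sum
leaves it untouched. A throwaway check (illustrative only, using a local copy
of the csum_add() folding rather than the kernel's):

#include <assert.h>
#include <stdint.h>

/* Ones' complement 32-bit add with end-around carry, mirroring what
 * csum_add() computes. */
static uint32_t csum_add32(uint32_t a, uint32_t b)
{
	uint64_t r = (uint64_t)a + b;

	return (uint32_t)r + (uint32_t)(r >> 32);
}

int main(void)
{
	uint32_t samples[] = { 0x00000001, 0x1234abcd, 0xfffffffe };

	/* Adding ~0 to a non-zero sum behaves exactly like adding 0, so
	 * csum_sub() returning 0 where the generic code returns ~0 is
	 * harmless once the result is folded back into a checksum. */
	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		assert(csum_add32(samples[i], 0) == csum_add32(samples[i], ~0u));
	return 0;
}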



Re: [PATCH v2] powerpc/mm: Update default hugetlb size early

2022-02-11 Thread David Hildenbrand
On 11.02.22 10:16, Aneesh Kumar K V wrote:
> On 2/11/22 14:00, David Hildenbrand wrote:
>> On 11.02.22 07:52, Aneesh Kumar K.V wrote:
>>> commit: d9c234005227 ("Do not depend on MAX_ORDER when grouping pages by 
>>> mobility")
>>> introduced pageblock_order which will be used to group pages better.
>>> The kernel now groups pages based on the value of HPAGE_SHIFT. Hence 
>>> HPAGE_SHIFT
>>> should be set before we call set_pageblock_order.
>>>
>>> set_pageblock_order happens early in the boot and default hugetlb page size
>>> should be initialized before that to compute the right pageblock_order 
>>> value.
>>>
>>> Currently, default hugetlbe page size is set via arch_initcalls which 
>>> happens
>>> late in the boot as shown via the below callstack:
>>>
>>> [c7383b10] [c1289328] hugetlbpage_init+0x2b8/0x2f8
>>> [c7383bc0] [c12749e4] do_one_initcall+0x14c/0x320
>>> [c7383c90] [c127505c] kernel_init_freeable+0x410/0x4e8
>>> [c7383da0] [c0012664] kernel_init+0x30/0x15c
>>> [c7383e10] [c000cf14] ret_from_kernel_thread+0x5c/0x64
>>>
>>> and the pageblock_order initialization is done early during the boot.
>>>
>>> [c18bfc80] [c12ae120] set_pageblock_order+0x50/0x64
>>> [c18bfca0] [c12b3d94] sparse_init+0x188/0x268
>>> [c18bfd60] [c1288bfc] initmem_init+0x28c/0x328
>>> [c18bfe50] [c127b370] setup_arch+0x410/0x480
>>> [c18bfed0] [c127401c] start_kernel+0xb8/0x934
>>> [c18bff90] [c000d984] start_here_common+0x1c/0x98
>>>
>>> delaying default hugetlb page size initialization implies the kernel will
>>> initialize pageblock_order to (MAX_ORDER - 1) which is not an optimal
>>> value for mobility grouping. IIUC we always had this issue. But it was not
>>> a problem for hash translation mode because (MAX_ORDER - 1) is the same as
>>> HUGETLB_PAGE_ORDER (8) in the case of hash (16MB). With radix,
>>> HUGETLB_PAGE_ORDER will be 5 (2M size) and hence pageblock_order should be
>>> 5 instead of 8.
>>
>>
>> A related question: Can we on ppc still have pageblock_order > MAX_ORDER
>> - 1? We have some code for that and I am not so sure if we really need that.
>>
> 
> I also have been wondering about the same. On book3s64 I don't think we 
> need that support for both 64K and 4K page size because with hash 
> hugetlb size is MAX_ORDER -1. (16MB hugepage size)
> 
> I am not sure about the 256K page support. Christophe may be able to 
> answer that.
> 
> For the gigantic hugepage support we depend on cma based allocation or
> firmware reservation. So I am not sure why we ever considered pageblock 
>  > MAX_ORDER -1 scenario. If you have pointers w.r.t why that was ever 
> needed, I could double-check whether ppc64 is still dependent on that.

commit dc78327c0ea7da5186d8cbc1647bd6088c5c9fa5
Author: Michal Nazarewicz 
Date:   Wed Jul 2 15:22:35 2014 -0700

mm: page_alloc: fix CMA area initialisation when pageblock > MAX_ORDER

indicates that at least arm64 used to have cases for that as well.

However, nowadays with ARM64_64K_PAGES we have FORCE_MAX_ZONEORDER=14 as
default, corresponding to 512MiB.

So I'm not sure if this is something worth supporting. If you want
somewhat reliable gigantic pages, use CMA or preallocate them during boot.

-- 
Thanks,

David / dhildenb
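
(As an aside on the boot-time route: gigantic pages are usually preallocated
with kernel command-line parameters, though the exact sizes on offer depend on
the platform and MMU mode. For example, roughly:

    default_hugepagesz=1G hugepagesz=1G hugepages=4

reserves four 1 GiB pages early, before memory has a chance to fragment.)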



Re: [PATCH v2] powerpc/mm: Update default hugetlb size early

2022-02-11 Thread Aneesh Kumar K V

On 2/11/22 14:00, David Hildenbrand wrote:

On 11.02.22 07:52, Aneesh Kumar K.V wrote:

commit: d9c234005227 ("Do not depend on MAX_ORDER when grouping pages by 
mobility")
introduced pageblock_order which will be used to group pages better.
The kernel now groups pages based on the value of HPAGE_SHIFT. Hence HPAGE_SHIFT
should be set before we call set_pageblock_order.

set_pageblock_order happens early in the boot and default hugetlb page size
should be initialized before that to compute the right pageblock_order value.

Currently, default hugetlbe page size is set via arch_initcalls which happens
late in the boot as shown via the below callstack:

[c7383b10] [c1289328] hugetlbpage_init+0x2b8/0x2f8
[c7383bc0] [c12749e4] do_one_initcall+0x14c/0x320
[c7383c90] [c127505c] kernel_init_freeable+0x410/0x4e8
[c7383da0] [c0012664] kernel_init+0x30/0x15c
[c7383e10] [c000cf14] ret_from_kernel_thread+0x5c/0x64

and the pageblock_order initialization is done early during the boot.

[c18bfc80] [c12ae120] set_pageblock_order+0x50/0x64
[c18bfca0] [c12b3d94] sparse_init+0x188/0x268
[c18bfd60] [c1288bfc] initmem_init+0x28c/0x328
[c18bfe50] [c127b370] setup_arch+0x410/0x480
[c18bfed0] [c127401c] start_kernel+0xb8/0x934
[c18bff90] [c000d984] start_here_common+0x1c/0x98

delaying default hugetlb page size initialization implies the kernel will
initialize pageblock_order to (MAX_ORDER - 1) which is not an optimal
value for mobility grouping. IIUC we always had this issue. But it was not
a problem for hash translation mode because (MAX_ORDER - 1) is the same as
HUGETLB_PAGE_ORDER (8) in the case of hash (16MB). With radix,
HUGETLB_PAGE_ORDER will be 5 (2M size) and hence pageblock_order should be
5 instead of 8.



A related question: Can we on ppc still have pageblock_order > MAX_ORDER
- 1? We have some code for that and I am not so sure if we really need that.



I also have been wondering about the same. On book3s64 I don't think we 
need that support for both 64K and 4K page size because with hash 
hugetlb size is MAX_ORDER -1. (16MB hugepage size)


I am not sure about the 256K page support. Christophe may be able to 
answer that.


For the gigantic hugepage support we depend on cma based allocation or
firmware reservation. So I am not sure why we ever considered pageblock 
> MAX_ORDER -1 scenario. If you have pointers w.r.t why that was ever 
needed, I could double-check whether ppc64 is still dependent on that.


-aneesh


[PATCH] powerpc/bitops: Force inlining of fls()

2022-02-11 Thread Christophe Leroy
Building a kernel with CONFIG_CC_OPTIMISE_FOR_SIZE leads to
the following functions being copied several times in vmlinux:

31 times __ilog2_u32()
34 times fls()

Disassembly follows:

c00f476c <fls>:
c00f476c:   7c 63 00 34 cntlzw  r3,r3
c00f4770:   20 63 00 20 subfic  r3,r3,32
c00f4774:   4e 80 00 20 blr

c00f4778 <__ilog2_u32>:
c00f4778:   94 21 ff f0 stwu    r1,-16(r1)
c00f477c:   7c 08 02 a6 mflr    r0
c00f4780:   90 01 00 14 stw     r0,20(r1)
c00f4784:   4b ff ff e9 bl      c00f476c <fls>
c00f4788:   80 01 00 14 lwz     r0,20(r1)
c00f478c:   38 63 ff ff addi    r3,r3,-1
c00f4790:   7c 08 03 a6 mtlr    r0
c00f4794:   38 21 00 10 addi    r1,r1,16
c00f4798:   4e 80 00 20 blr

When forcing inlining of fls(), we get

c0008b80 <__ilog2_u32>:
c0008b80:   7c 63 00 34 cntlzw  r3,r3
c0008b84:   20 63 00 1f subfic  r3,r3,31
c0008b88:   4e 80 00 20 blr

vmlinux size gets reduced by 1 kbyte with that change.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/bitops.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/bitops.h 
b/arch/powerpc/include/asm/bitops.h
index f18b1eac6b54..7df7fee774e5 100644
--- a/arch/powerpc/include/asm/bitops.h
+++ b/arch/powerpc/include/asm/bitops.h
@@ -288,7 +288,7 @@ static __always_inline void arch___clear_bit_unlock(int nr, 
volatile unsigned lo
  * fls: find last (most-significant) bit set.
  * Note fls(0) = 0, fls(1) = 1, fls(0x8000) = 32.
  */
-static inline int fls(unsigned int x)
+static __always_inline int fls(unsigned int x)
 {
int lz;
 
@@ -306,7 +306,7 @@ static inline int fls(unsigned int x)
  * 32-bit fls calls.
  */
 #ifdef CONFIG_PPC64
-static inline int fls64(__u64 x)
+static __always_inline int fls64(__u64 x)
 {
int lz;
 
-- 
2.34.1
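
For context, __ilog2_u32() is only a thin wrapper around fls(), which is why a
non-inlined fls() drags a call and a stack frame into every copy. A paraphrase
of the generic definition (from include/linux/log2.h, shown here only for
illustration):

/* The whole helper reduces to fls() minus one, so once fls() is forced
 * inline it collapses to a cntlzw/subfic pair on ppc32. */
static __always_inline int __ilog2_u32(u32 n)
{
	return fls(n) - 1;
}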



[PATCH] net: Remove branch in csum_shift()

2022-02-11 Thread Christophe Leroy
Today's implementation of csum_shift() leads to branching based on the
parity of 'offset':

 02f8 <csum_shift>:
 2f8:   70 a5 00 01 andi.   r5,r5,1
 2fc:   41 a2 00 08 beq     304
 300:   54 84 c0 3e rotlwi  r4,r4,24
 304:   7c 63 20 14 addc    r3,r3,r4
 308:   7c 63 01 94 addze   r3,r3
 30c:   4e 80 00 20 blr

Use first bit of 'offset' directly as input of the rotation instead of
branching.

 02f8 <csum_shift>:
 2f8:   54 a5 1f 38 rlwinm  r5,r5,3,28,28
 2fc:   20 a5 00 20 subfic  r5,r5,32
 300:   5c 84 28 3e rotlw   r4,r4,r5
 304:   7c 63 20 14 addc    r3,r3,r4
 308:   7c 63 01 94 addze   r3,r3
 30c:   4e 80 00 20 blr

And change to left shift instead of right shift to skip one more
instruction. This has no impact on the final sum.

 02f8 <csum_shift>:
 2f8:   54 a5 1f 38 rlwinm  r5,r5,3,28,28
 2fc:   5c 84 28 3e rotlw   r4,r4,r5
 300:   7c 63 20 14 addc    r3,r3,r4
 304:   7c 63 01 94 addze   r3,r3
 308:   4e 80 00 20 blr

Signed-off-by: Christophe Leroy 
---
 include/net/checksum.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/include/net/checksum.h b/include/net/checksum.h
index 5218041e5c8f..9badcd5532ef 100644
--- a/include/net/checksum.h
+++ b/include/net/checksum.h
@@ -83,9 +83,7 @@ static inline __sum16 csum16_sub(__sum16 csum, __be16 addend)
 static inline __wsum csum_shift(__wsum sum, int offset)
 {
/* rotate sum to align it with a 16b boundary */
-   if (offset & 1)
-   return (__force __wsum)ror32((__force u32)sum, 8);
-   return sum;
+   return (__force __wsum)rol32((__force u32)sum, (offset & 1) << 3);
 }
 
 static inline __wsum
-- 
2.34.1
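
The "no impact on the final sum" claim above (rotating left by 8 instead of
right by 8 for odd offsets) can be checked with a quick user-space program;
this is only an illustration, with a local fold() standing in for csum_fold():

#include <assert.h>
#include <stdint.h>

static uint32_t rol32(uint32_t x, unsigned int n)
{
	return n ? (x << n) | (x >> (32 - n)) : x;
}

static uint32_t ror32(uint32_t x, unsigned int n)
{
	return n ? (x >> n) | (x << (32 - n)) : x;
}

/* Fold a 32-bit ones' complement sum down to 16 bits, as csum_fold()
 * does before the final complement. */
static uint16_t fold(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint32_t sum = 0x8a3b17f0;	/* arbitrary running checksum */

	for (int offset = 0; offset < 4; offset++) {
		uint32_t before = (offset & 1) ? ror32(sum, 8) : sum;
		uint32_t after = rol32(sum, (offset & 1) << 3);

		/* Different 32-bit values for odd offsets, but the same
		 * folded 16-bit result, which is all the callers use. */
		assert(fold(before) == fold(after));
	}
	return 0;
}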



Re: [PATCH v2] powerpc/mm: Update default hugetlb size early

2022-02-11 Thread David Hildenbrand
On 11.02.22 07:52, Aneesh Kumar K.V wrote:
> commit: d9c234005227 ("Do not depend on MAX_ORDER when grouping pages by 
> mobility")
> introduced pageblock_order which will be used to group pages better.
> The kernel now groups pages based on the value of HPAGE_SHIFT. Hence 
> HPAGE_SHIFT
> should be set before we call set_pageblock_order.
> 
> set_pageblock_order happens early in the boot and default hugetlb page size
> should be initialized before that to compute the right pageblock_order value.
> 
> Currently, default hugetlbe page size is set via arch_initcalls which happens
> late in the boot as shown via the below callstack:
> 
> [c7383b10] [c1289328] hugetlbpage_init+0x2b8/0x2f8
> [c7383bc0] [c12749e4] do_one_initcall+0x14c/0x320
> [c7383c90] [c127505c] kernel_init_freeable+0x410/0x4e8
> [c7383da0] [c0012664] kernel_init+0x30/0x15c
> [c7383e10] [c000cf14] ret_from_kernel_thread+0x5c/0x64
> 
> and the pageblock_order initialization is done early during the boot.
> 
> [c18bfc80] [c12ae120] set_pageblock_order+0x50/0x64
> [c18bfca0] [c12b3d94] sparse_init+0x188/0x268
> [c18bfd60] [c1288bfc] initmem_init+0x28c/0x328
> [c18bfe50] [c127b370] setup_arch+0x410/0x480
> [c18bfed0] [c127401c] start_kernel+0xb8/0x934
> [c18bff90] [c000d984] start_here_common+0x1c/0x98
> 
> delaying default hugetlb page size initialization implies the kernel will
> initialize pageblock_order to (MAX_ORDER - 1) which is not an optimal
> value for mobility grouping. IIUC we always had this issue. But it was not
> a problem for hash translation mode because (MAX_ORDER - 1) is the same as
> HUGETLB_PAGE_ORDER (8) in the case of hash (16MB). With radix,
> HUGETLB_PAGE_ORDER will be 5 (2M size) and hence pageblock_order should be
> 5 instead of 8.


A related question: Can we on ppc still have pageblock_order > MAX_ORDER
- 1? We have some code for that and I am not so sure if we really need that.

-- 
Thanks,

David / dhildenb