On Sun, 2016-12-04 at 09:17 -0800, Eric Dumazet wrote:
> On Sun, 2016-12-04 at 03:10 -0800, Linus Torvalds wrote:
> > 
> > 
> > On Dec 4, 2016 02:43, "Thorsten Leemhuis" <[email protected]>
> > wrote:
> >         
> >         
> >         What's the status of the below patch? From the discussion it
> >         looks a lot like it was developed to fix a regression in 4.9,
> >         but the patch afaics has neither hit mainline nor linux-next yet.
> > 
> > 
> > It's not a regression as far as I can tell. It's a small optimization.
> > Maybe.
> > 
> > 
> > It's not going into 4.9, and it's not even clear it's worth it later
> > either, unless somebody has numbers (which I haven't seen).
> > 
> Right, the patch was not in any way ready for 4.9 ;)
> 
> I'll try to complete this for next cycle.

I now have a hacky patch that also adds PMD alignment for large
allocations and supports hugepages (this last part depends on
CONFIG_HAVE_ARCH_HUGE_VMAP at the moment, so x86/arm64 only so far).

Toshi Kani added pmd_set_huge() in commit e61ce6ade404e ("mm: change
ioremap to set up huge I/O mappings"); I am not sure why vmalloc() was
not considered at the time (or I might have missed it completely).
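
For reference, the ioremap path uses pmd_set_huge() roughly like this
(a paraphrase from memory of lib/ioremap.c, not a verbatim copy); the
vmap_pmd_range() hunk below mirrors the same idea for vmalloc()'ed
compound pages:

/* Sketch of the ioremap PMD path (paraphrased, see lib/ioremap.c) */
static int ioremap_pmd_range(pud_t *pud, unsigned long addr,
			     unsigned long end, phys_addr_t phys_addr,
			     pgprot_t prot)
{
	unsigned long next;
	pmd_t *pmd;

	phys_addr -= addr;
	pmd = pmd_alloc(&init_mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);

		/* Whole, aligned PMD: one huge entry, no PTE table needed */
		if (ioremap_pmd_enabled() &&
		    (next - addr) == PMD_SIZE &&
		    IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
		    pmd_set_huge(pmd, phys_addr + addr, prot))
			continue;

		if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot))
			return -ENOMEM;
	} while (pmd++, addr = next, addr != end);
	return 0;
}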

It seems to provide a gain of about 25 cycles per random access into
large tables on my x86 lab hosts.

(I did a test with a program holding 10 million fds.)
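
If anyone wants to reproduce, the test was along these lines (a minimal
sketch, not the exact program I used; the fd count, the loop count and
the choice of fcntl(F_GETFD) are illustrative): grow the fd table until
alloc_fdmem() falls back to vmalloc(), then time random fd lookups,
each of which indexes the big vmalloc()'ed fd array in the kernel.
RLIMIT_NOFILE and fs.nr_open must be raised above 10 million first.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

#define NR_FDS   (10 * 1000 * 1000)
#define NR_LOOPS (10 * 1000 * 1000)

int main(void)
{
	struct timespec t0, t1;
	double ns;
	long i;

	for (i = 3; i < NR_FDS; i++) {		/* populate a ~10M entry fd table */
		if (dup2(0, i) < 0) {
			perror("dup2");
			return 1;
		}
	}

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < NR_LOOPS; i++)		/* random lookups walk the fd array */
		fcntl(3 + random() % (NR_FDS - 3), F_GETFD);
	clock_gettime(CLOCK_MONOTONIC, &t1);

	ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
	printf("%.1f ns per random fd lookup\n", ns / NR_LOOPS);
	return 0;
}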

Allocations of 2 MB and more (pages >= 512, i.e. at least one full PMD
worth of 4 KB pages), like the dentry cache, the inode cache, the TCP
established hash table, or large alloc_fdmem() ones, might benefit from
this.

lpaa23:~# grep large /proc/vmallocinfo
0xffffc90000009000-0xffffc9000000c000   12288 alloc_large_system_hash+0x189/0x253 pages=2 vmalloc N0=1 N1=1
0xffffc9000000c000-0xffffc9000000f000   12288 alloc_large_system_hash+0x189/0x253 pages=2 vmalloc N0=1 N1=1
0xffffc9000001e000-0xffffc9000009f000  528384 alloc_large_system_hash+0x189/0x253 pages=128 vmalloc N0=64 N1=64
0xffffc9000009f000-0xffffc900000e0000  266240 alloc_large_system_hash+0x189/0x253 pages=64 vmalloc N0=32 N1=32
0xffffc900001d9000-0xffffc900001dc000   12288 alloc_large_system_hash+0x189/0x253 pages=2 vmalloc N0=1 N1=1
0xffffc90000200000-0xffffc90010201000 268439552 alloc_large_system_hash+0x189/0x253 pages=65536 vmalloc vpages N0=32768 N1=32768
0xffffc90010400000-0xffffc90018401000 134221824 alloc_large_system_hash+0x189/0x253 pages=32768 vmalloc vpages N0=16384 N1=16384
0xffffc90018600000-0xffffc90018a01000 4198400 alloc_large_system_hash+0x189/0x253 pages=1024 vmalloc vpages N0=512 N1=512
0xffffc90018c00000-0xffffc90019001000 4198400 alloc_large_system_hash+0x189/0x253 pages=1024 vmalloc vpages N0=512 N1=512
0xffffc9001b249000-0xffffc9001b34a000 1052672 alloc_large_system_hash+0x189/0x253 pages=256 vmalloc N0=128 N1=128
0xffffc9001b400000-0xffffc9001b801000 4198400 alloc_large_system_hash+0x189/0x253 pages=1024 vmalloc vpages N0=512 N1=512
0xffffc9001ba00000-0xffffc9001bc01000 2101248 alloc_large_system_hash+0x189/0x253 pages=512 vmalloc N0=256 N1=256
0xffffc9001bc01000-0xffffc9001bd02000 1052672 alloc_large_system_hash+0x189/0x253 pages=256 vmalloc N0=128 N1=128
0xffffc9001be00000-0xffffc9001c001000 2101248 alloc_large_system_hash+0x189/0x253 pages=512 vmalloc N0=256 N1=256


I won't be able to split this patch into 3 parts before January 6th,
after my vacation. I am showing the WIP in case anyone is interested in
seeing it.

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a5584384eabc..055b027ee659 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -21,6 +21,7 @@
 #include <linux/debugobjects.h>
 #include <linux/kallsyms.h>
 #include <linux/list.h>
+#include <linux/mempolicy.h>
 #include <linux/notifier.h>
 #include <linux/rbtree.h>
 #include <linux/radix-tree.h>
@@ -154,6 +155,18 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr,
                return -ENOMEM;
        do {
                next = pmd_addr_end(addr, end);
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+               if (next - addr == PMD_SIZE) {
+                       struct page *page = pages[*nr];
+
+                       if (compound_order(page) == PMD_SHIFT - PAGE_SHIFT) {
+                       if (pmd_set_huge(pmd, page_to_phys(page), prot)) {
+                                       (*nr) += 1 << (PMD_SHIFT - PAGE_SHIFT);
+                                       continue;
+                               }
+                       }
+               }
+#endif
                if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
                        return -ENOMEM;
        } while (pmd++, addr = next, addr != end);
@@ -1349,7 +1362,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
        if (flags & VM_IOREMAP)
                align = 1ul << clamp_t(int, get_count_order_long(size),
                                       PAGE_SHIFT, IOREMAP_MAX_ORDER);
-
+       else if (size >= PMD_SIZE)
+               align = PMD_SIZE;
        area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
        if (unlikely(!area))
                return NULL;
@@ -1482,11 +1496,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
        if (deallocate_pages) {
                int i;
 
-               for (i = 0; i < area->nr_pages; i++) {
+               for (i = 0; i < area->nr_pages;) {
                        struct page *page = area->pages[i];
+                       unsigned int order;
 
                        BUG_ON(!page);
-                       __free_pages(page, 0);
+                       order = compound_order(page);
+                       __free_pages(page, order);
+                       i += 1 << order;
                }
 
                kvfree(area->pages);
@@ -1613,16 +1630,39 @@ EXPORT_SYMBOL(vmap);
 static void *__vmalloc_node(unsigned long size, unsigned long align,
                            gfp_t gfp_mask, pgprot_t prot,
                            int node, const void *caller);
+
+static int vmalloc_max_order(int node, int nr_pages)
+{
+       int max_node_order = min(PMD_SHIFT - PAGE_SHIFT, MAX_ORDER - 1);
+
+#if defined(CONFIG_NUMA)
+       if (nr_online_nodes > 1 && node == NUMA_NO_NODE) {
+               struct mempolicy *pol = current->mempolicy;
+               int pages_per_node, nr_nodes;
+
+               if (pol && pol->mode == MPOL_INTERLEAVE) {
+                       nr_nodes = nodes_weight(pol->v.nodes);
+                       pages_per_node = DIV_ROUND_UP(nr_pages, nr_nodes);
+                       max_node_order = min(max_node_order,
+                                            ilog2(pages_per_node));
+               }
+       }
+#endif
+       return max_node_order;
+}
+
 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
                                 pgprot_t prot, int node)
 {
        struct page **pages;
-       unsigned int nr_pages, array_size, i;
+       unsigned int nr_pages, array_size, i, j;
        const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
        const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
+       int max_node_order;
 
        nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
        array_size = (nr_pages * sizeof(struct page *));
+       max_node_order = vmalloc_max_order(node, nr_pages);
 
        area->nr_pages = nr_pages;
        /* Please note that the recursion is strictly bounded. */
@@ -1639,20 +1679,31 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
                return NULL;
        }
 
-       for (i = 0; i < area->nr_pages; i++) {
-               struct page *page;
 
-               if (node == NUMA_NO_NODE)
-                       page = alloc_page(alloc_mask);
-               else
-                       page = alloc_pages_node(node, alloc_mask, 0);
+       for (i = 0; i < area->nr_pages;) {
+               int order = min(ilog2(area->nr_pages - i), max_node_order);
+               struct page *page;
 
-               if (unlikely(!page)) {
-                       /* Successfully allocated i pages, free them in __vunmap() */
-                       area->nr_pages = i;
-                       goto fail;
+               for (;;) {
+                       gfp_t gfp = alloc_mask;
+
+                       if (order > 0)
+                               gfp = (gfp & ~__GFP_DIRECT_RECLAIM) |
+                                     __GFP_NORETRY | __GFP_COMP;
+                       if (node == NUMA_NO_NODE)
+                               page = alloc_pages(gfp, order);
+                       else
+                               page = alloc_pages_node(node, gfp, order);
+                       if (page)
+                               break;
+                       if (unlikely(--order < 0)) {
+                               /* Successfully allocated i pages, free them in __vunmap() */
+                               area->nr_pages = i;
+                               goto fail;
+                       }
                }
-               area->pages[i] = page;
+               for (j = 0; j < (1U << order); j++)
+                       area->pages[i++] = page++;
                if (gfpflags_allow_blocking(gfp_mask))
                        cond_resched();
        }
@@ -2619,9 +2670,13 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
 
                memset(counters, 0, nr_node_ids * sizeof(unsigned int));
 
-               for (nr = 0; nr < v->nr_pages; nr++)
-                       counters[page_to_nid(v->pages[nr])]++;
+               for (nr = 0; nr < v->nr_pages;) {
+                       struct page *page = v->pages[nr];
+                       int npages = 1 << compound_order(page);
 
+                       counters[page_to_nid(page)] += npages;
+                       nr += npages;
+               }
                for_each_node_state(nr, N_HIGH_MEMORY)
                        if (counters[nr])
                                seq_printf(m, " N%u=%u", nr, counters[nr]);
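
To illustrate the intent of vmalloc_max_order() with the numbers from
the vmallocinfo dump above (my reading of the WIP, not a new
measurement): the TCP established hash is 65536 pages; interleaved over
2 nodes that gives pages_per_node = 32768, so max_node_order stays at
the PMD order (9) and the table is built from 128 order-9 compound
pages, which MPOL_INTERLEAVE should still split N0=32768 N1=32768. A
hypothetical 512-page table interleaved over 4 nodes would give
pages_per_node = 128 and cap the order at 7, so every node still gets a
share instead of one node taking the whole 2 MB chunk.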

