[v1 2/5] mm: defining memblock_virt_alloc_try_nid_raw

2017-03-23 Thread Pavel Tatashin
A new version of memblock_virt_alloc_* allocations:
- Does not zero the allocated memory
- Does not panic if request cannot be satisfied
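
To illustrate the two points above, here is a minimal caller sketch (not part
of the patch; nr_pages and nid are hypothetical): the caller must check for
NULL itself and must not assume the memory is zeroed.

	struct page *map;

	map = memblock_virt_alloc_try_nid_raw(nr_pages * sizeof(struct page),
					      PMD_SIZE, 0,
					      BOOTMEM_ALLOC_ACCESSIBLE, nid);
	if (!map)
		panic("cannot allocate vmemmap\n");	/* error policy is up to the caller */
	/* contents are uninitialized: zero or fully initialize before use */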

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 include/linux/bootmem.h |3 +++
 mm/memblock.c   |   46 +++---
 2 files changed, 42 insertions(+), 7 deletions(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index dbaf312..b61ea10 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -160,6 +160,9 @@ extern int reserve_bootmem_node(pg_data_t *pgdat,
 #define BOOTMEM_ALLOC_ANYWHERE (~(phys_addr_t)0)
 
 /* FIXME: Move to memblock.h at a point where we remove nobootmem.c */
+void *memblock_virt_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr,
+ phys_addr_t max_addr, int nid);
 void *memblock_virt_alloc_try_nid_nopanic(phys_addr_t size,
phys_addr_t align, phys_addr_t min_addr,
phys_addr_t max_addr, int nid);
diff --git a/mm/memblock.c b/mm/memblock.c
index 696f06d..7fdc555 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1271,7 +1271,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t 
size, phys_addr_t align, i
 static void * __init memblock_virt_alloc_internal(
phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr, phys_addr_t max_addr,
-   int nid)
+   int nid, bool zero)
 {
phys_addr_t alloc;
void *ptr;
@@ -1322,7 +1322,8 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t 
size, phys_addr_t align, i
return NULL;
 done:
ptr = phys_to_virt(alloc);
-   memset(ptr, 0, size);
+   if (zero)
+   memset(ptr, 0, size);
 
/*
 * The min_count is set to 0 so that bootmem allocated blocks
@@ -1336,6 +1337,37 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t 
size, phys_addr_t align, i
 }
 
 /**
+ * memblock_virt_alloc_try_nid_raw - allocate boot memory block without zeroing
+ * memory and without panicking
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region from where the allocation
+ *   is preferred (phys address)
+ * @max_addr: the upper bound of the memory region from where the allocation
+ *   is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
+ *   allocate only from memory limited by memblock.current_limit value
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Public function, provides additional debug information (including caller
+ * info), if enabled. Does not zero allocated memory, does not panic if request
+ * cannot be satisfied.
+ *
+ * RETURNS:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+void * __init memblock_virt_alloc_try_nid_raw(
+   phys_addr_t size, phys_addr_t align,
+   phys_addr_t min_addr, phys_addr_t max_addr,
+   int nid)
+{
+   memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx 
max_addr=0x%llx %pF\n",
+__func__, (u64)size, (u64)align, nid, (u64)min_addr,
+(u64)max_addr, (void *)_RET_IP_);
+   return memblock_virt_alloc_internal(size, align,
+  min_addr, max_addr, nid, false);
+}
+
+/**
  * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block
  * @size: size of memory block to be allocated in bytes
  * @align: alignment of the region and block's size
@@ -1346,8 +1378,8 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t 
size, phys_addr_t align, i
  *   allocate only from memory limited by memblock.current_limit value
  * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
  *
- * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides
- * additional debug information (including caller info), if enabled.
+ * Public function, provides additional debug information (including caller
+ * info), if enabled. This function zeroes the allocated memory.
  *
  * RETURNS:
  * Virtual address of allocated memory block on success, NULL on failure.
@@ -1361,7 +1393,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t 
size, phys_addr_t align, i
 __func__, (u64)size, (u64)align, nid, (u64)min_addr,
 (u64)max_addr, (void *)_RET_IP_);
return memblock_virt_alloc_internal(size, align, min_addr,
-max_addr, nid);
+max_addr, nid, true);
 }
 
 /**
@@ -1375,7 +1407,7 @@ phys_addr_t __init memblock_alloc_try

[v1 4/5] mm: zero struct pages during initialization

2017-03-23 Thread Pavel Tatashin
When deferred struct page initialization is enabled, do not expect that
the memory that was allocated for struct pages was zeroed by the
allocator. Zero it when "struct pages" are initialized.

Also, a boolean macro VMEMMAP_ZERO is defined to tell platforms whether
they should zero memory or can defer it.
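
For context, patch 5/5 in this series consumes the new macro at the vmemmap
allocation sites, roughly like this:

	/* skip zeroing when "struct pages" are initialized later in parallel */
	p = vmemmap_alloc_block(page_size, node, VMEMMAP_ZERO);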

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 include/linux/mm.h |9 +
 mm/page_alloc.c|3 +++
 2 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 54df194..eb052f6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2427,6 +2427,15 @@ int vmemmap_populate_basepages(unsigned long start, 
unsigned long end,
 #ifdef CONFIG_MEMORY_HOTPLUG
 void vmemmap_free(unsigned long start, unsigned long end);
 #endif
+/*
+ * Don't zero "struct page"es during early boot, and zero only when they are
+ * initialized in parallel.
+ */
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+#define VMEMMAP_ZERO   false
+#else
+#define VMEMMAP_ZERO   true
+#endif
 void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
  unsigned long size);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f202f8b..02945e4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1168,6 +1168,9 @@ static void free_one_page(struct zone *zone,
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
unsigned long zone, int nid)
 {
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+   memset(page, 0, sizeof(struct page));
+#endif
set_page_links(page, zone, nid, pfn);
init_page_count(page);
page_mapcount_reset(page);
-- 
1.7.1



[v1 5/5] mm: teach platforms not to zero struct pages memory

2017-03-23 Thread Pavel Tatashin
If we are using the deferred struct page initialization feature, most
"struct page"s are initialized after other CPUs are started, and hence we
benefit from doing this job in parallel. However, we still zero all the
memory that is allocated for "struct pages" using the boot CPU.  This patch
solves this problem by deferring the zeroing of "struct pages" until they
are initialized.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 arch/powerpc/mm/init_64.c |2 +-
 arch/sparc/mm/init_64.c   |2 +-
 arch/x86/mm/init_64.c |2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index eb4c270..24faf2d 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -181,7 +181,7 @@ int __meminit vmemmap_populate(unsigned long start, 
unsigned long end, int node)
if (vmemmap_populated(start, page_size))
continue;
 
-   p = vmemmap_alloc_block(page_size, node, true);
+   p = vmemmap_alloc_block(page_size, node, VMEMMAP_ZERO);
if (!p)
return -ENOMEM;
 
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index d91e462..280834e 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2542,7 +2542,7 @@ int __meminit vmemmap_populate(unsigned long vstart, 
unsigned long vend,
pte = pmd_val(*pmd);
if (!(pte & _PAGE_VALID)) {
void *block = vmemmap_alloc_block(PMD_SIZE, node,
- true);
+ VMEMMAP_ZERO);
 
if (!block)
return -ENOMEM;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 46101b6..9d8c72c 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1177,7 +1177,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned 
long start,
void *p;
 
p = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap,
- true);
+ VMEMMAP_ZERO);
if (p) {
pte_t entry;
 
-- 
1.7.1



[v1 1/5] sparc64: simplify vmemmap_populate

2017-03-23 Thread Pavel Tatashin
Remove duplicated code by using the common vmemmap_pud_populate() and
vmemmap_pgd_populate() functions.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 arch/sparc/mm/init_64.c |   23 ++-
 1 files changed, 6 insertions(+), 17 deletions(-)

diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 2c0cb2a..01eccab 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2526,30 +2526,19 @@ int __meminit vmemmap_populate(unsigned long vstart, 
unsigned long vend,
vstart = vstart & PMD_MASK;
vend = ALIGN(vend, PMD_SIZE);
for (; vstart < vend; vstart += PMD_SIZE) {
-   pgd_t *pgd = pgd_offset_k(vstart);
+   pgd_t *pgd = vmemmap_pgd_populate(vstart, node);
unsigned long pte;
pud_t *pud;
pmd_t *pmd;
 
-   if (pgd_none(*pgd)) {
-   pud_t *new = vmemmap_alloc_block(PAGE_SIZE, node);
+   if (!pgd)
+   return -ENOMEM;
 
-   if (!new)
-   return -ENOMEM;
-   pgd_populate(&init_mm, pgd, new);
-   }
-
-   pud = pud_offset(pgd, vstart);
-   if (pud_none(*pud)) {
-   pmd_t *new = vmemmap_alloc_block(PAGE_SIZE, node);
-
-   if (!new)
-   return -ENOMEM;
-   pud_populate(&init_mm, pud, new);
-   }
+   pud = vmemmap_pud_populate(pgd, vstart, node);
+   if (!pud)
+   return -ENOMEM;
 
pmd = pmd_offset(pud, vstart);
-
pte = pmd_val(*pmd);
if (!(pte & _PAGE_VALID)) {
void *block = vmemmap_alloc_block(PMD_SIZE, node);
-- 
1.7.1



[v1 0/5] parallelized "struct page" zeroing

2017-03-23 Thread Pavel Tatashin
When the deferred struct page initialization feature is enabled, we get a
performance gain from initializing vmemmap in parallel after other CPUs are
started. However, we still zero the memory for vmemmap using only the boot
CPU. This patch set removes that memset-zeroing bottleneck by deferring the
zeroing as well.

Here is an example of the performance gain on SPARC with 32T of memory:
base
https://hastebin.com/ozanelatat.go

fix
https://hastebin.com/utonawukof.go

As you can see, without the fix it takes 97.89s to boot,
and with the fix it takes 46.91s to boot.

On x86 the time saving is going to be even greater (proportional to memory
size), because there are twice as many "struct page"s for the same amount of
memory: x86 base pages (4K) are half the size of SPARC base pages (8K).


Pavel Tatashin (5):
  sparc64: simplify vmemmap_populate
  mm: defining memblock_virt_alloc_try_nid_raw
  mm: add "zero" argument to vmemmap allocators
  mm: zero struct pages during initialization
  mm: teach platforms not to zero struct pages memory

 arch/powerpc/mm/init_64.c |4 +-
 arch/s390/mm/vmem.c   |5 ++-
 arch/sparc/mm/init_64.c   |   26 +++
 arch/x86/mm/init_64.c |3 +-
 include/linux/bootmem.h   |3 ++
 include/linux/mm.h|   15 +++--
 mm/memblock.c |   46 --
 mm/page_alloc.c   |3 ++
 mm/sparse-vmemmap.c   |   48 +---
 9 files changed, 103 insertions(+), 50 deletions(-)


[v1 3/5] mm: add "zero" argument to vmemmap allocators

2017-03-23 Thread Pavel Tatashin
Allow clients to request non-zeroed memory from vmemmap allocator.
The following two public functions have a new boolean argument called zero:

__vmemmap_alloc_block_buf()
vmemmap_alloc_block()

When zero is true, memory that is allocated by the memblock allocator is
zeroed (the current behavior); when it is false, the memory is not zeroed.

This change allows for optimizations where the client knows when it is better
to zero the memory: maybe later, when other CPUs are started, or maybe the
client is going to set every byte of the allocated memory anyway, so there is
no need to zero it beforehand.
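
The include/linux/mm.h hunk is truncated in this archive, but judging from the
call sites below, the updated prototypes presumably end up looking roughly
like:

	void *vmemmap_alloc_block(unsigned long size, int node, bool zero);
	void *__vmemmap_alloc_block_buf(unsigned long size, int node,
					struct vmem_altmap *altmap, bool zero);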

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 arch/powerpc/mm/init_64.c |4 +-
 arch/s390/mm/vmem.c   |5 ++-
 arch/sparc/mm/init_64.c   |3 +-
 arch/x86/mm/init_64.c |3 +-
 include/linux/mm.h|6 ++--
 mm/sparse-vmemmap.c   |   48 +---
 6 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 9be9920..eb4c270 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -133,7 +133,7 @@ static int __meminit vmemmap_populated(unsigned long start, 
int page_size)
 
/* allocate a page when required and hand out chunks */
if (!num_left) {
-   next = vmemmap_alloc_block(PAGE_SIZE, node);
+   next = vmemmap_alloc_block(PAGE_SIZE, node, true);
if (unlikely(!next)) {
WARN_ON(1);
return NULL;
@@ -181,7 +181,7 @@ int __meminit vmemmap_populate(unsigned long start, 
unsigned long end, int node)
if (vmemmap_populated(start, page_size))
continue;
 
-   p = vmemmap_alloc_block(page_size, node);
+   p = vmemmap_alloc_block(page_size, node, true);
if (!p)
return -ENOMEM;
 
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 60d3899..9c75214 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -251,7 +251,8 @@ int __meminit vmemmap_populate(unsigned long start, 
unsigned long end, int node)
if (MACHINE_HAS_EDAT1) {
void *new_page;
 
-   new_page = vmemmap_alloc_block(PMD_SIZE, node);
+   new_page = vmemmap_alloc_block(PMD_SIZE, node,
+  true);
if (!new_page)
goto out;
pmd_val(*pm_dir) = __pa(new_page) | sgt_prot;
@@ -271,7 +272,7 @@ int __meminit vmemmap_populate(unsigned long start, 
unsigned long end, int node)
if (pte_none(*pt_dir)) {
void *new_page;
 
-   new_page = vmemmap_alloc_block(PAGE_SIZE, node);
+   new_page = vmemmap_alloc_block(PAGE_SIZE, node, true);
if (!new_page)
goto out;
pte_val(*pt_dir) = __pa(new_page) | pgt_prot;
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 01eccab..d91e462 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2541,7 +2541,8 @@ int __meminit vmemmap_populate(unsigned long vstart, 
unsigned long vend,
pmd = pmd_offset(pud, vstart);
pte = pmd_val(*pmd);
if (!(pte & _PAGE_VALID)) {
-   void *block = vmemmap_alloc_block(PMD_SIZE, node);
+   void *block = vmemmap_alloc_block(PMD_SIZE, node,
+ true);
 
if (!block)
return -ENOMEM;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 15173d3..46101b6 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1176,7 +1176,8 @@ static int __meminit vmemmap_populate_hugepages(unsigned 
long start,
if (pmd_none(*pmd)) {
void *p;
 
-   p = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
+   p = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap,
+ true);
if (p) {
pte_t entry;
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5f01c88..54df194 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2410,13 +2410,13 @@ void sparse_mem_maps_populate_node(struct page 
**map_map,
 pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
 pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
 pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node);
-void *vmemmap_alloc_block(unsigned lon

[v2 4/5] mm: zero struct pages during initialization

2017-03-24 Thread Pavel Tatashin
When deferred struct page initialization is enabled, do not expect that
the memory that was allocated for struct pages was zeroed by the
allocator. Zero it when "struct pages" are initialized.

Also, a boolean macro VMEMMAP_ZERO is defined to tell platforms whether
they should zero memory or can defer it.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 include/linux/mm.h |9 +
 mm/page_alloc.c|3 +++
 2 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 54df194..eb052f6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2427,6 +2427,15 @@ int vmemmap_populate_basepages(unsigned long start, 
unsigned long end,
 #ifdef CONFIG_MEMORY_HOTPLUG
 void vmemmap_free(unsigned long start, unsigned long end);
 #endif
+/*
+ * Don't zero "struct page"es during early boot, and zero only when they are
+ * initialized in parallel.
+ */
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+#define VMEMMAP_ZERO   false
+#else
+#define VMEMMAP_ZERO   true
+#endif
 void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
  unsigned long size);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f202f8b..02945e4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1168,6 +1168,9 @@ static void free_one_page(struct zone *zone,
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
unsigned long zone, int nid)
 {
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+   memset(page, 0, sizeof(struct page));
+#endif
set_page_links(page, zone, nid, pfn);
init_page_count(page);
page_mapcount_reset(page);
-- 
1.7.1



[v2 0/5] parallelized "struct page" zeroing

2017-03-24 Thread Pavel Tatashin
Changelog:
v1 - v2
- Per request, added s390 to deferred "struct page" zeroing
- Collected performance data on x86 which proves the importance of keeping
  memset() as a prefetch (see below).

When the deferred struct page initialization feature is enabled, we get a
performance gain from initializing vmemmap in parallel after other CPUs are
started. However, we still zero the memory for vmemmap using only the boot
CPU. This patch set removes that memset-zeroing bottleneck by deferring the
zeroing as well.

Performance gain on SPARC with 32T:
base:   https://hastebin.com/ozanelatat.go
fix:https://hastebin.com/utonawukof.go

As you can see, without the fix it takes 97.89s to boot,
and with the fix it takes 46.91s to boot.

Performance gain on x86 with 1T:
base:   https://hastebin.com/uvifasohon.pas
fix:https://hastebin.com/anodiqaguj.pas

On Intel we save 10.66s/T while on SPARC we save 1.59s/T. Intel has
twice as many pages, and also fewer nodes than SPARC (sparc 32 nodes, vs.
intel 8 nodes).

It takes one thread 11.25s to zero vmemmap on Intel for 1T, so it should
take an additional 11.25 / 8 ≈ 1.4s per node (this machine has 8 nodes) to
initialize the memory, but it takes only an additional 0.456s per node, which
means that on Intel we also benefit from doing the memset() and initializing
all the other fields in one place.

Pavel Tatashin (5):
  sparc64: simplify vmemmap_populate
  mm: defining memblock_virt_alloc_try_nid_raw
  mm: add "zero" argument to vmemmap allocators
  mm: zero struct pages during initialization
  mm: teach platforms not to zero struct pages memory

 arch/powerpc/mm/init_64.c |4 +-
 arch/s390/mm/vmem.c   |5 ++-
 arch/sparc/mm/init_64.c   |   26 +++
 arch/x86/mm/init_64.c |3 +-
 include/linux/bootmem.h   |3 ++
 include/linux/mm.h|   15 +++--
 mm/memblock.c |   46 --
 mm/page_alloc.c   |3 ++
 mm/sparse-vmemmap.c   |   48 +---
 9 files changed, 103 insertions(+), 50 deletions(-)



[v2 5/5] mm: teach platforms not to zero struct pages memory

2017-03-24 Thread Pavel Tatashin
If we are using the deferred struct page initialization feature, most
"struct page"s are initialized after other CPUs are started, and hence we
benefit from doing this job in parallel. However, we still zero all the
memory that is allocated for "struct pages" using the boot CPU.  This patch
solves this problem by deferring the zeroing of "struct pages" until they
are initialized.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 arch/powerpc/mm/init_64.c |2 +-
 arch/s390/mm/vmem.c   |2 +-
 arch/sparc/mm/init_64.c   |2 +-
 arch/x86/mm/init_64.c |2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index eb4c270..24faf2d 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -181,7 +181,7 @@ int __meminit vmemmap_populate(unsigned long start, 
unsigned long end, int node)
if (vmemmap_populated(start, page_size))
continue;
 
-   p = vmemmap_alloc_block(page_size, node, true);
+   p = vmemmap_alloc_block(page_size, node, VMEMMAP_ZERO);
if (!p)
return -ENOMEM;
 
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 9c75214..ffe9ba1 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -252,7 +252,7 @@ int __meminit vmemmap_populate(unsigned long start, 
unsigned long end, int node)
void *new_page;
 
new_page = vmemmap_alloc_block(PMD_SIZE, node,
-  true);
+  VMEMMAP_ZERO);
if (!new_page)
goto out;
pmd_val(*pm_dir) = __pa(new_page) | sgt_prot;
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index d91e462..280834e 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2542,7 +2542,7 @@ int __meminit vmemmap_populate(unsigned long vstart, 
unsigned long vend,
pte = pmd_val(*pmd);
if (!(pte & _PAGE_VALID)) {
void *block = vmemmap_alloc_block(PMD_SIZE, node,
- true);
+ VMEMMAP_ZERO);
 
if (!block)
return -ENOMEM;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 46101b6..9d8c72c 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1177,7 +1177,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned 
long start,
void *p;
 
p = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap,
- true);
+ VMEMMAP_ZERO);
if (p) {
pte_t entry;
 
-- 
1.7.1



[v2 3/5] mm: add "zero" argument to vmemmap allocators

2017-03-24 Thread Pavel Tatashin
Allow clients to request non-zeroed memory from vmemmap allocator.
The following two public functions have a new boolean argument called zero:

__vmemmap_alloc_block_buf()
vmemmap_alloc_block()

When zero is true, memory that is allocated by the memblock allocator is
zeroed (the current behavior); when it is false, the memory is not zeroed.

This change allows for optimizations where the client knows when it is better
to zero the memory: maybe later, when other CPUs are started, or maybe the
client is going to set every byte of the allocated memory anyway, so there is
no need to zero it beforehand.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 arch/powerpc/mm/init_64.c |4 +-
 arch/s390/mm/vmem.c   |5 ++-
 arch/sparc/mm/init_64.c   |3 +-
 arch/x86/mm/init_64.c |3 +-
 include/linux/mm.h|6 ++--
 mm/sparse-vmemmap.c   |   48 +---
 6 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 9be9920..eb4c270 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -133,7 +133,7 @@ static int __meminit vmemmap_populated(unsigned long start, 
int page_size)
 
/* allocate a page when required and hand out chunks */
if (!num_left) {
-   next = vmemmap_alloc_block(PAGE_SIZE, node);
+   next = vmemmap_alloc_block(PAGE_SIZE, node, true);
if (unlikely(!next)) {
WARN_ON(1);
return NULL;
@@ -181,7 +181,7 @@ int __meminit vmemmap_populate(unsigned long start, 
unsigned long end, int node)
if (vmemmap_populated(start, page_size))
continue;
 
-   p = vmemmap_alloc_block(page_size, node);
+   p = vmemmap_alloc_block(page_size, node, true);
if (!p)
return -ENOMEM;
 
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 60d3899..9c75214 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -251,7 +251,8 @@ int __meminit vmemmap_populate(unsigned long start, 
unsigned long end, int node)
if (MACHINE_HAS_EDAT1) {
void *new_page;
 
-   new_page = vmemmap_alloc_block(PMD_SIZE, node);
+   new_page = vmemmap_alloc_block(PMD_SIZE, node,
+  true);
if (!new_page)
goto out;
pmd_val(*pm_dir) = __pa(new_page) | sgt_prot;
@@ -271,7 +272,7 @@ int __meminit vmemmap_populate(unsigned long start, 
unsigned long end, int node)
if (pte_none(*pt_dir)) {
void *new_page;
 
-   new_page = vmemmap_alloc_block(PAGE_SIZE, node);
+   new_page = vmemmap_alloc_block(PAGE_SIZE, node, true);
if (!new_page)
goto out;
pte_val(*pt_dir) = __pa(new_page) | pgt_prot;
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 01eccab..d91e462 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2541,7 +2541,8 @@ int __meminit vmemmap_populate(unsigned long vstart, 
unsigned long vend,
pmd = pmd_offset(pud, vstart);
pte = pmd_val(*pmd);
if (!(pte & _PAGE_VALID)) {
-   void *block = vmemmap_alloc_block(PMD_SIZE, node);
+   void *block = vmemmap_alloc_block(PMD_SIZE, node,
+ true);
 
if (!block)
return -ENOMEM;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 15173d3..46101b6 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1176,7 +1176,8 @@ static int __meminit vmemmap_populate_hugepages(unsigned 
long start,
if (pmd_none(*pmd)) {
void *p;
 
-   p = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
+   p = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap,
+ true);
if (p) {
pte_t entry;
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5f01c88..54df194 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2410,13 +2410,13 @@ void sparse_mem_maps_populate_node(struct page 
**map_map,
 pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
 pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
 pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node);
-void *vmemmap_alloc_block(unsigned lon

[v2 2/5] mm: defining memblock_virt_alloc_try_nid_raw

2017-03-24 Thread Pavel Tatashin
A new version of memblock_virt_alloc_* allocations:
- Does not zero the allocated memory
- Does not panic if request cannot be satisfied

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 include/linux/bootmem.h |3 +++
 mm/memblock.c   |   46 +++---
 2 files changed, 42 insertions(+), 7 deletions(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index dbaf312..b61ea10 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -160,6 +160,9 @@ extern int reserve_bootmem_node(pg_data_t *pgdat,
 #define BOOTMEM_ALLOC_ANYWHERE (~(phys_addr_t)0)
 
 /* FIXME: Move to memblock.h at a point where we remove nobootmem.c */
+void *memblock_virt_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr,
+ phys_addr_t max_addr, int nid);
 void *memblock_virt_alloc_try_nid_nopanic(phys_addr_t size,
phys_addr_t align, phys_addr_t min_addr,
phys_addr_t max_addr, int nid);
diff --git a/mm/memblock.c b/mm/memblock.c
index 696f06d..7fdc555 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1271,7 +1271,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t 
size, phys_addr_t align, i
 static void * __init memblock_virt_alloc_internal(
phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr, phys_addr_t max_addr,
-   int nid)
+   int nid, bool zero)
 {
phys_addr_t alloc;
void *ptr;
@@ -1322,7 +1322,8 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t 
size, phys_addr_t align, i
return NULL;
 done:
ptr = phys_to_virt(alloc);
-   memset(ptr, 0, size);
+   if (zero)
+   memset(ptr, 0, size);
 
/*
 * The min_count is set to 0 so that bootmem allocated blocks
@@ -1336,6 +1337,37 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t 
size, phys_addr_t align, i
 }
 
 /**
+ * memblock_virt_alloc_try_nid_raw - allocate boot memory block without zeroing
+ * memory and without panicking
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region from where the allocation
+ *   is preferred (phys address)
+ * @max_addr: the upper bound of the memory region from where the allocation
+ *   is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
+ *   allocate only from memory limited by memblock.current_limit value
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Public function, provides additional debug information (including caller
+ * info), if enabled. Does not zero allocated memory, does not panic if request
+ * cannot be satisfied.
+ *
+ * RETURNS:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+void * __init memblock_virt_alloc_try_nid_raw(
+   phys_addr_t size, phys_addr_t align,
+   phys_addr_t min_addr, phys_addr_t max_addr,
+   int nid)
+{
+   memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx 
max_addr=0x%llx %pF\n",
+__func__, (u64)size, (u64)align, nid, (u64)min_addr,
+(u64)max_addr, (void *)_RET_IP_);
+   return memblock_virt_alloc_internal(size, align,
+  min_addr, max_addr, nid, false);
+}
+
+/**
  * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block
  * @size: size of memory block to be allocated in bytes
  * @align: alignment of the region and block's size
@@ -1346,8 +1378,8 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t 
size, phys_addr_t align, i
  *   allocate only from memory limited by memblock.current_limit value
  * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
  *
- * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides
- * additional debug information (including caller info), if enabled.
+ * Public function, provides additional debug information (including caller
+ * info), if enabled. This function zeroes the allocated memory.
  *
  * RETURNS:
  * Virtual address of allocated memory block on success, NULL on failure.
@@ -1361,7 +1393,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t 
size, phys_addr_t align, i
 __func__, (u64)size, (u64)align, nid, (u64)min_addr,
 (u64)max_addr, (void *)_RET_IP_);
return memblock_virt_alloc_internal(size, align, min_addr,
-max_addr, nid);
+max_addr, nid, true);
 }
 
 /**
@@ -1375,7 +1407,7 @@ phys_addr_t __init memblock_alloc_try

[v2 1/5] sparc64: simplify vmemmap_populate

2017-03-24 Thread Pavel Tatashin
Remove duplicated code by using the common vmemmap_pud_populate() and
vmemmap_pgd_populate() functions.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 arch/sparc/mm/init_64.c |   23 ++-
 1 files changed, 6 insertions(+), 17 deletions(-)

diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 2c0cb2a..01eccab 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2526,30 +2526,19 @@ int __meminit vmemmap_populate(unsigned long vstart, 
unsigned long vend,
vstart = vstart & PMD_MASK;
vend = ALIGN(vend, PMD_SIZE);
for (; vstart < vend; vstart += PMD_SIZE) {
-   pgd_t *pgd = pgd_offset_k(vstart);
+   pgd_t *pgd = vmemmap_pgd_populate(vstart, node);
unsigned long pte;
pud_t *pud;
pmd_t *pmd;
 
-   if (pgd_none(*pgd)) {
-   pud_t *new = vmemmap_alloc_block(PAGE_SIZE, node);
+   if (!pgd)
+   return -ENOMEM;
 
-   if (!new)
-   return -ENOMEM;
-   pgd_populate(&init_mm, pgd, new);
-   }
-
-   pud = pud_offset(pgd, vstart);
-   if (pud_none(*pud)) {
-   pmd_t *new = vmemmap_alloc_block(PAGE_SIZE, node);
-
-   if (!new)
-   return -ENOMEM;
-   pud_populate(&init_mm, pud, new);
-   }
+   pud = vmemmap_pud_populate(pgd, vstart, node);
+   if (!pud)
+   return -ENOMEM;
 
pmd = pmd_offset(pud, vstart);
-
pte = pmd_val(*pmd);
if (!(pte & _PAGE_VALID)) {
void *block = vmemmap_alloc_block(PMD_SIZE, node);
-- 
1.7.1



[v1 6/9] x86/tsc: use cpuid to determine TSC frequency

2017-03-22 Thread Pavel Tatashin
One of the newer methods to determine the TSC frequency is to use the cpuid
extension that reports the TSC/Crystal ratio. This method is preferred on CPUs
that implement it. This patch adds a new function, calibrate_tsc_early(), that
can be called early in boot to determine the TSC frequency using this method.
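
As a worked example with made-up numbers: if CPUID 0x15 reports a denominator
(EAX) of 2 and a numerator (EBX) of 216, and the crystal runs at 24000 kHz,
the TSC frequency is 24000 * 216 / 2 = 2592000 kHz (about 2.59 GHz), i.e. the
"crystal_khz * ebx_numerator / eax_denominator" computation in the patch.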

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 arch/x86/kernel/tsc.c |   39 ++-
 1 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 5add503..1c9fc23 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -26,6 +26,9 @@
 #include 
 #include 
 
+/* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */
+#define CPUID_TSC_LEAF 0x15
+
 unsigned int __read_mostly cpu_khz;/* TSC clocks / usec, not used here */
 EXPORT_SYMBOL(cpu_khz);
 
@@ -671,24 +674,16 @@ static unsigned long quick_pit_calibrate(bool early_boot)
 }
 
 /**
- * native_calibrate_tsc
- * Determine TSC frequency via CPUID, else return 0.
+ * The caller already checked that TSC leaf capability can be read from cpuid
  */
-unsigned long native_calibrate_tsc(void)
+static unsigned long calibrate_tsc_early(int model)
 {
unsigned int eax_denominator, ebx_numerator, ecx_hz, edx;
unsigned int crystal_khz;
 
-   if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
-   return 0;
-
-   if (boot_cpu_data.cpuid_level < 0x15)
-   return 0;
-
eax_denominator = ebx_numerator = ecx_hz = edx = 0;
 
-   /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */
-   cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx);
+   cpuid(CPUID_TSC_LEAF, &eax_denominator, &ebx_numerator, &ecx_hz, &edx);
 
if (ebx_numerator == 0 || eax_denominator == 0)
return 0;
@@ -696,7 +691,7 @@ unsigned long native_calibrate_tsc(void)
crystal_khz = ecx_hz / 1000;
 
if (crystal_khz == 0) {
-   switch (boot_cpu_data.x86_model) {
+   switch (model) {
case INTEL_FAM6_SKYLAKE_MOBILE:
case INTEL_FAM6_SKYLAKE_DESKTOP:
case INTEL_FAM6_KABYLAKE_MOBILE:
@@ -713,6 +708,24 @@ unsigned long native_calibrate_tsc(void)
}
}
 
+   return crystal_khz * ebx_numerator / eax_denominator;
+}
+
+/**
+ * native_calibrate_tsc
+ * Determine TSC frequency via CPUID, else return 0.
+ */
+unsigned long native_calibrate_tsc(void)
+{
+   unsigned int tsc_khz;
+
+   if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+   return 0;
+
+   if (boot_cpu_data.cpuid_level < CPUID_TSC_LEAF)
+   return 0;
+
+   tsc_khz = calibrate_tsc_early(boot_cpu_data.x86_model);
/*
 * TSC frequency determined by CPUID is a "hardware reported"
 * frequency and is the most accurate one so far we have. This
@@ -727,7 +740,7 @@ unsigned long native_calibrate_tsc(void)
if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT)
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
 
-   return crystal_khz * ebx_numerator / eax_denominator;
+   return tsc_khz;
 }
 
 static unsigned long cpu_khz_from_cpuid(void)
-- 
1.7.1



[v1 8/9] x86/tsc: tsc early

2017-03-22 Thread Pavel Tatashin
tsc_early_init():
Use various methods to determine the availability of the TSC feature and its
frequency early in boot, and if that is possible, initialize the TSC and also
call sched_clock_early_init() to be able to get timestamps early in boot.

tsc_early_fini():
Implement the finish part of the early TSC feature: print a message about the
offset, which can be useful to find out how much time was spent in POST and
the boot manager, and also call sched_clock_early_fini() to let sched clock
know that the early clock is no longer available.

sched_clock_early():
A TSC-based implementation of the weak function that is defined in sched
clock.
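
For reference, the conversion in sched_clock_early() below is the usual
cyc2ns scaling: ns = (cycles * cyc2ns_mul) >> cyc2ns_shift, plus
cyc2ns_offset, where cyc2ns_mul / 2^cyc2ns_shift approximates 10^6 / tsc_khz
(nanoseconds per TSC cycle). With made-up numbers, for tsc_khz = 2592000 and
cyc2ns_shift = 10, cyc2ns_mul would be about 1024 * 10^6 / 2592000 ≈ 395.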

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 arch/x86/include/asm/tsc.h |4 ++
 arch/x86/kernel/tsc.c  |  107 
 2 files changed, 111 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 893de0c..a8c7f2e 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -50,11 +50,15 @@ static inline cycles_t get_cycles(void)
 extern void tsc_verify_tsc_adjust(bool resume);
 extern void check_tsc_sync_source(int cpu);
 extern void check_tsc_sync_target(void);
+void tsc_early_init(void);
+void tsc_early_fini(void);
 #else
 static inline bool tsc_store_and_check_tsc_adjust(bool bootcpu) { return 
false; }
 static inline void tsc_verify_tsc_adjust(bool resume) { }
 static inline void check_tsc_sync_source(int cpu) { }
 static inline void check_tsc_sync_target(void) { }
+static inline void tsc_early_init(void) { }
+static inline void tsc_early_fini(void) { }
 #endif
 
 extern int notsc_setup(char *);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 58bd575..67ecddf 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -25,10 +25,13 @@
 #include 
 #include 
 #include 
+#include 
 
 /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */
 #define CPUID_TSC_LEAF 0x15
 
+static struct cyc2ns_data __read_mostly cyc2ns_early;
+
 unsigned int __read_mostly cpu_khz;/* TSC clocks / usec, not used here */
 EXPORT_SYMBOL(cpu_khz);
 
@@ -290,6 +293,16 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
sched_clock_idle_wakeup_event(0);
local_irq_restore(flags);
 }
+
+u64 sched_clock_early(void)
+{
+   u64 ns;
+
+   ns = mul_u64_u32_shr(rdtsc(), cyc2ns_early.cyc2ns_mul,
+cyc2ns_early.cyc2ns_shift);
+   return ns + cyc2ns_early.cyc2ns_offset;
+}
+
 /*
  * Scheduler clock - returns current time in nanosec units.
  */
@@ -1365,6 +1378,100 @@ static int __init init_tsc_clocksource(void)
  */
 device_initcall(init_tsc_clocksource);
 
+#ifdef CONFIG_X86_TSC
+
+/* Determine if tsc is invariant early in boot */
+static bool __init tsc_invariant_early(void)
+{
+   unsigned int ext_cpuid_level, tsc_flag;
+
+   /* Get extended CPUID level */
+   ext_cpuid_level = cpuid_eax(0x80000000);
+   if (ext_cpuid_level < 0x80000007)
+   return false;
+
+   /* get field with invariant TSC flag */
+   tsc_flag = cpuid_edx(0x80000007);
+   if (!(tsc_flag & (1 << 8)))
+   return false;
+
+   return true;
+}
+
+/*
+ * Determine if we can use TSC early in boot. On larger machines early boot can
+ * take a significant amount of time, therefore, for observability reasons, and
+ * also to avoid regressions it is important to have timestamps during the 
whole
+ * boot process.
+ */
+void __init tsc_early_init(void)
+{
+   int vendor, model, family, cpuid_level;
+   unsigned int sig, khz;
+   u64 tsc_now;
+
+   /*
+* Should we disable early timestamps on platforms without invariant
+* TSC?
+*
+* On the one hand invariant TSC guarantees, that early timestamps run
+* only on the latest hardware (Nehalem and later), but on the other
+* hand accuracy wise, non-invariant timestamps should be OK,
+* because during early boot power management features are not used.
+* ---
+* For now we disable invariant TSC for early boot.
+*/
+   if (!tsc_invariant_early())
+   return;
+
+   cpuid_level = cpuid_eax(0);
+   sig = cpuid_eax(1);
+   model = x86_model(sig);
+   family = x86_family(sig);
+   vendor = get_x86_vendor_early();
+
+   /*
+* Try several methods to get the TSC frequency; if they all fail, bail
+* out. Otherwise set up the mult and shift values to convert ticks to
+* nanoseconds efficiently.
+*/
+   khz = 0;
+   if (vendor == X86_VENDOR_INTEL && cpuid_level >= CPUID_TSC_LEAF)
+   khz = calibrate_tsc_early(model);
+
+   if (khz == 0)
+   khz = cpu_khz_from_cpuid_early(vendor, cpuid_level);
+
+   if (khz == 0)
+   khz = cpu_khz_from_msr_early(vendor, family, model);
+
+   if (khz == 0)
+   khz = quick_pit_calibrate(true);
+
+   if (kh

[v1 7/9] x86/tsc: use cpuid to determine CPU frequency

2017-03-22 Thread Pavel Tatashin
Newer processors implement a cpuid extension that reports the CPU frequency.
This patch adds a function that can read it early in boot.
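
For reference, CPUID leaf 0x16 reports the processor base frequency in MHz in
EAX (with the maximum and bus/reference frequencies in EBX and ECX), which is
what cpu_khz_from_cpuid_early() below reads; presumably the base MHz value is
then multiplied by 1000 to get kHz.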

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 arch/x86/kernel/tsc.c |   10 ++
 1 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 1c9fc23..58bd575 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -743,14 +743,14 @@ unsigned long native_calibrate_tsc(void)
return tsc_khz;
 }
 
-static unsigned long cpu_khz_from_cpuid(void)
+static unsigned long cpu_khz_from_cpuid_early(int vendor, int cpuid_level)
 {
unsigned int eax_base_mhz, ebx_max_mhz, ecx_bus_mhz, edx;
 
-   if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+   if (vendor != X86_VENDOR_INTEL)
return 0;
 
-   if (boot_cpu_data.cpuid_level < 0x16)
+   if (cpuid_level < 0x16)
return 0;
 
eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0;
@@ -770,7 +770,9 @@ unsigned long native_calibrate_cpu(void)
unsigned long flags, latch, ms, fast_calibrate;
int hpet = is_hpet_enabled(), i, loopmin;
 
-   fast_calibrate = cpu_khz_from_cpuid();
+   fast_calibrate = cpu_khz_from_cpuid_early(boot_cpu_data.x86_vendor,
+ boot_cpu_data.cpuid_level);
+
if (fast_calibrate)
return fast_calibrate;
 
-- 
1.7.1



[v1 0/9] Early boot time stamps for x86

2017-03-22 Thread Pavel Tatashin
Last week I sent out patches to enable early boot time stamps on SPARC; that
work is now under review:
http://www.spinics.net/lists/sparclinux/msg17372.html

This is a continuation of that work, adding early boot time stamp support for
x86 machines.

Here is an example of how this information is useful:

Before:
https://hastebin.com/zofiqazuze.scala

After:
https://hastebin.com/otayoliruc.scala

If you take a look at the before case, it seems that this particular x86
machine took only 2 minutes to boot, which, while it can be improved, is not
too bad considering that this machine has 192 CPUs and 2.2T of memory.

But, as can be seen in the fixed case,
the time is much longer:
- early boot time stamps account for another 80s!
- the early tsc offset is 549s, so it took over 9 minutes to get through POST
  and GRUB before starting Linux

Now, the total boot time is 12m52s, which is really slow.

Pavel Tatashin (9):
  sched/clock: broken stable to unstable transfer
  sched/clock: interface to allow timestamps early in boot
  x86/cpu: determining x86 vendor early
  x86/tsc: early MSR-based CPU/TSC frequency discovery
  x86/tsc: disable early messages from quick_pit_calibrate
  x86/tsc: use cpuid to determine TSC frequency
  x86/tsc: use cpuid to determine CPU frequency
  x86/tsc: tsc early
  x86/tsc: use tsc early

 arch/x86/include/asm/processor.h |1 +
 arch/x86/include/asm/tsc.h   |5 +
 arch/x86/kernel/cpu/common.c |   36 
 arch/x86/kernel/head64.c |1 +
 arch/x86/kernel/time.c   |1 +
 arch/x86/kernel/tsc.c|  166 +-
 arch/x86/kernel/tsc_msr.c|   38 ++---
 include/linux/sched/clock.h  |4 +
 kernel/sched/clock.c |   70 +++-
 9 files changed, 284 insertions(+), 38 deletions(-)



[v1 3/9] x86/cpu: determining x86 vendor early

2017-03-22 Thread Pavel Tatashin
In order to support early time stamps, we must know the vendor id of the
chip early in boot. This patch implements it by getting the vendor string
from cpuid and comparing it against the x86 vendors known to Linux.
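
For readers unfamiliar with the register layout: CPUID leaf 0 returns the
vendor string in EBX, EDX, ECX order, which is why the code below scatters the
output across vendor_str as follows (example for "GenuineIntel"):

	/*   EBX -> vendor_str[0..3]  = "Genu"
	 *   EDX -> vendor_str[4..7]  = "ineI"
	 *   ECX -> vendor_str[8..11] = "ntel"
	 * EAX (the maximum leaf) lands in vendor_str[12..15] and is then
	 * overwritten by the terminating '\0'.
	 */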

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 arch/x86/include/asm/processor.h |1 +
 arch/x86/kernel/cpu/common.c |   36 
 2 files changed, 37 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f385eca..a4128d1 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -184,6 +184,7 @@ enum cpuid_regs_idx {
 
 extern void cpu_detect(struct cpuinfo_x86 *c);
 
+int get_x86_vendor_early(void);
 extern void early_cpu_init(void);
 extern void identify_boot_cpu(void);
 extern void identify_secondary_cpu(struct cpuinfo_x86 *);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 58094a1..9c02851 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -847,6 +847,42 @@ static void __init early_identify_cpu(struct cpuinfo_x86 
*c)
fpu__init_system(c);
 }
 
+/*
+ * Returns x86_vendor early, before other platform data structures being
+ * initialized.
+ */
+int get_x86_vendor_early(void)
+{
+   int x86_vendor = X86_VENDOR_UNKNOWN;
+   const struct cpu_dev *const *cdev;
+   char vendor_str[16];
+   int count = 0;
+
+   cpuid(0, (unsigned int *)&vendor_str[12],
+ (unsigned int *)&vendor_str[0],
+ (unsigned int *)&vendor_str[8],
+ (unsigned int *)&vendor_str[4]);
+   vendor_str[12] = '\0';
+
+   for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
+   const struct cpu_dev *cpudev = *cdev;
+   const char *str1 = cpudev->c_ident[0];
+   const char *str2 = cpudev->c_ident[1];
+
+   if (count == X86_VENDOR_NUM)
+   break;
+
+   if ((!strcmp(vendor_str, str1)) ||
+   (str2 && !strcmp(vendor_str, str2))) {
+   x86_vendor = cpudev->c_x86_vendor;
+   break;
+   }
+   count++;
+   }
+
+   return x86_vendor;
+}
+
 void __init early_cpu_init(void)
 {
const struct cpu_dev *const *cdev;
-- 
1.7.1



[v1 9/9] x86/tsc: use tsc early

2017-03-22 Thread Pavel Tatashin
Call tsc_early_init() to initialize the early boot time stamp functionality
on supported x86 platforms, and call tsc_early_fini() to finish this feature
after the permanent tsc has been initialized.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 arch/x86/kernel/head64.c |1 +
 arch/x86/kernel/time.c   |1 +
 2 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index b5785c1..1068a56 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -157,6 +157,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * 
real_mode_data)
clear_bss();
 
clear_page(init_level4_pgt);
+   tsc_early_init();
 
kasan_early_init();
 
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index d39c091..2d691eb 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -85,6 +85,7 @@ static __init void x86_late_time_init(void)
 {
x86_init.timers.timer_init();
tsc_init();
+   tsc_early_fini();
 }
 
 /*
-- 
1.7.1



[v1 1/9] sched/clock: broken stable to unstable transfer

2017-03-22 Thread Pavel Tatashin
When it is determined that the clock is actually unstable and we switch from
stable to unstable, the following function is eventually called:

__clear_sched_clock_stable()

In this function we set gtod_offset so the following holds true:

sched_clock() + raw_offset == ktime_get_ns() + gtod_offset

But instead of getting the latest timestamps, we use the last values
from scd, so instead of sched_clock() we use scd->tick_raw, and instead of
ktime_get_ns() we use scd->tick_gtod.

However, later, when gtod_offset is used in sched_clock_local(), we do not add
it to scd->tick_gtod when we determine the boundaries for the min/max clocks,
so the clock value is clamped against the wrong bounds.
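
In other words, the clamp should be computed around
gtod = scd->tick_gtod + gtod_offset:

	min_clock = max(gtod, old_clock)
	max_clock = max(old_clock, gtod + TICK_NSEC)

whereas the current code uses scd->tick_gtod without the offset for
min_clock/max_clock, which is what the hunk below corrects.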

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 kernel/sched/clock.c |9 +
 1 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index a08795e..2bc1090 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -214,7 +214,7 @@ static inline u64 wrap_max(u64 x, u64 y)
  */
 static u64 sched_clock_local(struct sched_clock_data *scd)
 {
-   u64 now, clock, old_clock, min_clock, max_clock;
+   u64 now, clock, old_clock, min_clock, max_clock, gtod;
s64 delta;
 
 again:
@@ -231,9 +231,10 @@ static u64 sched_clock_local(struct sched_clock_data *scd)
 *scd->tick_gtod + TICK_NSEC);
 */
 
-   clock = scd->tick_gtod + gtod_offset + delta;
-   min_clock = wrap_max(scd->tick_gtod, old_clock);
-   max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
+   gtod = scd->tick_gtod + gtod_offset;
+   clock = gtod + delta;
+   min_clock = wrap_max(gtod, old_clock);
+   max_clock = wrap_max(old_clock, gtod + TICK_NSEC);
 
clock = wrap_max(clock, min_clock);
clock = wrap_min(clock, max_clock);
-- 
1.7.1



[v1 2/9] sched/clock: interface to allow timestamps early in boot

2017-03-22 Thread Pavel Tatashin
In Linux printk() can output timestamps next to every line.  This is very
useful for tracking regressions, and finding places that can be optimized.
However, the timestamps become available only later in boot. On smaller
machines this is an insignificant amount of time, but on larger ones it can be
many seconds or even minutes into the boot process.

This patch adds an interface for platforms with an unstable sched clock to
show timestamps early in boot. In order to get this functionality a platform
must do the following (see the sketch after the list):

- Implement u64 sched_clock_early()
  Clock that returns monotonic time

- Call sched_clock_early_init()
  Tells sched clock that the early clock can be used

- Call sched_clock_early_fini()
  Tells sched clock that the early clock is finished and that it should hand
  over operation to the permanent clock.
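
A minimal sketch of how the three hooks fit together (hypothetical platform
code, not from this series; x86 wires them up in patches 8/9 and 9/9):

	/* hypothetical early counter; must return monotonic nanoseconds */
	u64 sched_clock_early(void)
	{
		return my_early_counter_ns();
	}

	void __init my_platform_early_time_init(void)
	{
		if (my_early_counter_usable())		/* hypothetical check */
			sched_clock_early_init();	/* early timestamps available */
	}

	void __init my_platform_late_time_init(void)
	{
		sched_clock_early_fini();	/* hand over to the permanent clock */
	}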

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 include/linux/sched/clock.h |4 +++
 kernel/sched/clock.c|   61 +++
 2 files changed, 65 insertions(+), 0 deletions(-)

diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
index 4a68c67..9b06b3a 100644
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -67,6 +67,10 @@ static inline u64 local_clock(void)
 extern void sched_clock_idle_sleep_event(void);
 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
 
+void sched_clock_early_init(void);
+void sched_clock_early_fini(void);
+u64 sched_clock_early(void);
+
 /*
  * As outlined in clock.c, provides a fast, high resolution, nanosecond
  * time source that is monotonic per cpu argument and has bounded drift
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 2bc1090..6a6d57b 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -79,6 +79,15 @@ unsigned long long __weak sched_clock(void)
 
 __read_mostly int sched_clock_running;
 
+/*
+ * We start with sched clock early static branch enabled, and global status
+ * disabled.  Early in boot it is decided whether to enable the global
+ * status as well (set sched_clock_early_running to true), and later, when
+ * early clock is no longer needed, the static branch is disabled.
+ */
+static DEFINE_STATIC_KEY_TRUE(__use_sched_clock_early);
+static bool __read_mostly sched_clock_early_running;
+
 void sched_clock_init(void)
 {
sched_clock_running = 1;
@@ -188,6 +197,12 @@ void sched_clock_init_late(void)
 */
smp_mb(); /* matches {set,clear}_sched_clock_stable() */
 
+   /*
+* It is guaranteed early clock is not running anymore. This function is
+* called from sched_init_smp(), and early clock must finish before smp.
+*/
+   static_branch_disable(&__use_sched_clock_early);
+
if (__sched_clock_stable_early)
__set_sched_clock_stable();
 }
@@ -320,6 +335,11 @@ u64 sched_clock_cpu(int cpu)
if (sched_clock_stable())
return sched_clock() + raw_offset;
 
+   if (static_branch_unlikely(&__use_sched_clock_early)) {
+   if (sched_clock_early_running)
+   return sched_clock_early();
+   }
+
if (unlikely(!sched_clock_running))
return 0ull;
 
@@ -379,6 +399,47 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
+u64 __weak sched_clock_early(void)
+{
+   return 0;
+}
+
+/*
+ * Called when sched_clock_early() is about to be finished; notifies sched
+ * clock that after this call sched_clock_early() can't be used.
+ *
+ * Must be called before smp_init() as sched_clock_early() is supposed to be
+ * used only during early boot, and is not guaranteed to handle a multi-CPU
+ * environment properly.
+ */
+void __init sched_clock_early_fini(void)
+{
+   struct sched_clock_data *scd = this_scd();
+   u64 now_early = sched_clock_early();
+   u64 now_sched = sched_clock();
+
+   /*
+* Set both: gtod_offset and raw_offset because we could switch to
+* either unstable clock or stable clock.
+*/
+   gtod_offset = now_early - scd->tick_gtod;
+   raw_offset = now_early - now_sched;
+
+   sched_clock_early_running = false;
+}
+
+/*
+ * Notifies sched clock that an early boot clocksource is available, which
+ * means that the current platform has implemented sched_clock_early().
+ *
+ * The early clock is running until we switch to a stable clock, or when we
+ * learn that the stable clock is not available.
+ */
+void __init sched_clock_early_init(void)
+{
+   sched_clock_early_running = true;
+}
+
 #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
 
 u64 sched_clock_cpu(int cpu)
-- 
1.7.1



[v1 4/9] x86/tsc: early MSR-based CPU/TSC frequency discovery

2017-03-22 Thread Pavel Tatashin
Allow discovering MSR-based CPU/TSC frequency early in boot. This method
works only for some Intel CPUs.
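
As a worked example with made-up numbers: if the MSR reports a maximum bus
ratio of 16 and the resolved frequency id maps to 100000 kHz, the reported TSC
frequency is 100000 * 16 = 1600000 kHz (1.6 GHz), i.e. the "freq * ratio"
computation in the hunk below.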

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 arch/x86/include/asm/tsc.h |1 +
 arch/x86/kernel/tsc_msr.c  |   38 +-
 2 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index f5e6f1c..893de0c 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -61,6 +61,7 @@ static inline void check_tsc_sync_target(void) { }
 extern void tsc_save_sched_clock_state(void);
 extern void tsc_restore_sched_clock_state(void);
 
+unsigned long cpu_khz_from_msr_early(int vendor, int family, int model);
 unsigned long cpu_khz_from_msr(void);
 
 #endif /* _ASM_X86_TSC_H */
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
index 19afdbd..6f5f617 100644
--- a/arch/x86/kernel/tsc_msr.c
+++ b/arch/x86/kernel/tsc_msr.c
@@ -62,22 +62,20 @@ static int match_cpu(u8 family, u8 model)
 #define id_to_freq(cpu_index, freq_id) \
(freq_desc_tables[cpu_index].freqs[freq_id])
 
-/*
- * MSR-based CPU/TSC frequency discovery for certain CPUs.
- *
- * Set global "lapic_timer_frequency" to bus_clock_cycles/jiffy
- * Return processor base frequency in KHz, or 0 on failure.
- */
-unsigned long cpu_khz_from_msr(void)
+ /*
+  * Get CPU/TSC frequency early in boot.
+  * Set global "lapic_timer_frequency" to bus_clock_cycles/jiffy
+  * Return processor base frequency in KHz, or 0 on failure.
+  */
+unsigned long cpu_khz_from_msr_early(int vendor, int family, int model)
 {
u32 lo, hi, ratio, freq_id, freq;
-   unsigned long res;
int cpu_index;
 
-   if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+   if (vendor != X86_VENDOR_INTEL)
return 0;
 
-   cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model);
+   cpu_index = match_cpu(family, model);
if (cpu_index < 0)
return 0;
 
@@ -94,13 +92,27 @@ unsigned long cpu_khz_from_msr(void)
freq_id = lo & 0x7;
freq = id_to_freq(cpu_index, freq_id);
 
-   /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */
-   res = freq * ratio;
-
 #ifdef CONFIG_X86_LOCAL_APIC
lapic_timer_frequency = (freq * 1000) / HZ;
 #endif
 
+   /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */
+   return freq * ratio;
+}
+
+/*
+ * MSR-based CPU/TSC frequency discovery for certain CPUs.
+ */
+unsigned long cpu_khz_from_msr(void)
+{
+   unsigned long res;
+
+   res = cpu_khz_from_msr_early(boot_cpu_data.x86_vendor,
+boot_cpu_data.x86,
+boot_cpu_data.x86_model);
+   if (res ==  0)
+   return 0;
+
/*
 * TSC frequency determined by MSR is always considered "known"
 * because it is reported by HW.
-- 
1.7.1



[v1 5/9] x86/tsc: disable early messages from quick_pit_calibrate

2017-03-22 Thread Pavel Tatashin
quick_pit_calibrate() is another method that can determine the frequency of
the TSC.  However, by default this function outputs some informational
messages. Allow these messages to be disabled when the function is called
early, so that early time stamps can be initialized before anything is
printed.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 arch/x86/kernel/tsc.c |   10 ++
 1 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index c73a7f9..5add503 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -580,7 +580,7 @@ static inline int pit_expect_msb(unsigned char val, u64 
*tscp, unsigned long *de
 #define MAX_QUICK_PIT_MS 50
 #define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 
256)
 
-static unsigned long quick_pit_calibrate(void)
+static unsigned long quick_pit_calibrate(bool early_boot)
 {
int i;
u64 tsc, delta;
@@ -645,7 +645,8 @@ static unsigned long quick_pit_calibrate(void)
goto success;
}
}
-   pr_info("Fast TSC calibration failed\n");
+   if (!early_boot)
+   pr_info("Fast TSC calibration failed\n");
return 0;
 
 success:
@@ -664,7 +665,8 @@ static unsigned long quick_pit_calibrate(void)
 */
delta *= PIT_TICK_RATE;
do_div(delta, i*256*1000);
-   pr_info("Fast TSC calibration using PIT\n");
+   if (!early_boot)
+   pr_info("Fast TSC calibration using PIT\n");
return delta;
 }
 
@@ -764,7 +766,7 @@ unsigned long native_calibrate_cpu(void)
return fast_calibrate;
 
local_irq_save(flags);
-   fast_calibrate = quick_pit_calibrate();
+   fast_calibrate = quick_pit_calibrate(false);
local_irq_restore(flags);
if (fast_calibrate)
return fast_calibrate;
-- 
1.7.1



[v2 6/9] x86/tsc: use cpuid to determine TSC frequency

2017-03-24 Thread Pavel Tatashin
One of the newer methods to determine the TSC frequency is to use the cpuid
extension that reports the TSC/Crystal ratio. This method is preferred on CPUs
that implement it. This patch adds a new function, calibrate_tsc_early(), that
can be called early in boot to determine the TSC frequency using this method.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 arch/x86/kernel/tsc.c | 39 ++-
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 5add503..1c9fc23 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -26,6 +26,9 @@
 #include 
 #include 
 
+/* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */
+#define CPUID_TSC_LEAF 0x15
+
 unsigned int __read_mostly cpu_khz;/* TSC clocks / usec, not used here */
 EXPORT_SYMBOL(cpu_khz);
 
@@ -671,24 +674,16 @@ static unsigned long quick_pit_calibrate(bool early_boot)
 }
 
 /**
- * native_calibrate_tsc
- * Determine TSC frequency via CPUID, else return 0.
+ * The caller already checked that TSC leaf capability can be read from cpuid
  */
-unsigned long native_calibrate_tsc(void)
+static unsigned long calibrate_tsc_early(int model)
 {
unsigned int eax_denominator, ebx_numerator, ecx_hz, edx;
unsigned int crystal_khz;
 
-   if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
-   return 0;
-
-   if (boot_cpu_data.cpuid_level < 0x15)
-   return 0;
-
eax_denominator = ebx_numerator = ecx_hz = edx = 0;
 
-   /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */
-   cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx);
+   cpuid(CPUID_TSC_LEAF, &eax_denominator, &ebx_numerator, &ecx_hz, &edx);
 
if (ebx_numerator == 0 || eax_denominator == 0)
return 0;
@@ -696,7 +691,7 @@ unsigned long native_calibrate_tsc(void)
crystal_khz = ecx_hz / 1000;
 
if (crystal_khz == 0) {
-   switch (boot_cpu_data.x86_model) {
+   switch (model) {
case INTEL_FAM6_SKYLAKE_MOBILE:
case INTEL_FAM6_SKYLAKE_DESKTOP:
case INTEL_FAM6_KABYLAKE_MOBILE:
@@ -713,6 +708,24 @@ unsigned long native_calibrate_tsc(void)
}
}
 
+   return crystal_khz * ebx_numerator / eax_denominator;
+}
+
+/**
+ * native_calibrate_tsc
+ * Determine TSC frequency via CPUID, else return 0.
+ */
+unsigned long native_calibrate_tsc(void)
+{
+   unsigned int tsc_khz;
+
+   if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+   return 0;
+
+   if (boot_cpu_data.cpuid_level < CPUID_TSC_LEAF)
+   return 0;
+
+   tsc_khz = calibrate_tsc_early(boot_cpu_data.x86_model);
/*
 * TSC frequency determined by CPUID is a "hardware reported"
 * frequency and is the most accurate one so far we have. This
@@ -727,7 +740,7 @@ unsigned long native_calibrate_tsc(void)
if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT)
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
 
-   return crystal_khz * ebx_numerator / eax_denominator;
+   return tsc_khz;
 }
 
 static unsigned long cpu_khz_from_cpuid(void)
-- 
1.8.3.1



[v2 5/9] x86/tsc: disable early messages from quick_pit_calibrate

2017-03-24 Thread Pavel Tatashin
quick_pit_calibrate() is another method that can determine the TSC
frequency.  However, by default this function outputs some informational
messages. Allow these messages to be disabled when the function is called
early, so that early time stamps can be initialized before anything is
printed.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 arch/x86/kernel/tsc.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index c73a7f9..5add503 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -580,7 +580,7 @@ static inline int pit_expect_msb(unsigned char val, u64 
*tscp, unsigned long *de
 #define MAX_QUICK_PIT_MS 50
 #define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 
256)
 
-static unsigned long quick_pit_calibrate(void)
+static unsigned long quick_pit_calibrate(bool early_boot)
 {
int i;
u64 tsc, delta;
@@ -645,7 +645,8 @@ static unsigned long quick_pit_calibrate(void)
goto success;
}
}
-   pr_info("Fast TSC calibration failed\n");
+   if (!early_boot)
+   pr_info("Fast TSC calibration failed\n");
return 0;
 
 success:
@@ -664,7 +665,8 @@ static unsigned long quick_pit_calibrate(void)
 */
delta *= PIT_TICK_RATE;
do_div(delta, i*256*1000);
-   pr_info("Fast TSC calibration using PIT\n");
+   if (!early_boot)
+   pr_info("Fast TSC calibration using PIT\n");
return delta;
 }
 
@@ -764,7 +766,7 @@ unsigned long native_calibrate_cpu(void)
return fast_calibrate;
 
local_irq_save(flags);
-   fast_calibrate = quick_pit_calibrate();
+   fast_calibrate = quick_pit_calibrate(false);
local_irq_restore(flags);
if (fast_calibrate)
return fast_calibrate;
-- 
1.8.3.1



[v2 1/9] sched/clock: broken stable to unstable transfer

2017-03-24 Thread Pavel Tatashin
When it is determined that clock is actually unstable, and we switch from
stable to unstable the following function is eventually called:

__clear_sched_clock_stable()

In this function we set gtod_offset so the following holds true:

sched_clock() + raw_offset == ktime_get_ns() + gtod_offset

But instead of getting the latest timestamps, we use the last values
from scd, so instead of sched_clock() we use scd->tick_raw, and instead of
ktime_get_ns() we use scd->tick_gtod.

However, later, when we use gtod_offset sched_clock_local() we do not add
it to scd->tick_gtod to calculate the correct clock value when we determine
the boundaries for min/max clocks.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 kernel/sched/clock.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index a08795e..2bc1090 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -214,7 +214,7 @@ static inline u64 wrap_max(u64 x, u64 y)
  */
 static u64 sched_clock_local(struct sched_clock_data *scd)
 {
-   u64 now, clock, old_clock, min_clock, max_clock;
+   u64 now, clock, old_clock, min_clock, max_clock, gtod;
s64 delta;
 
 again:
@@ -231,9 +231,10 @@ static u64 sched_clock_local(struct sched_clock_data *scd)
 *scd->tick_gtod + TICK_NSEC);
 */
 
-   clock = scd->tick_gtod + gtod_offset + delta;
-   min_clock = wrap_max(scd->tick_gtod, old_clock);
-   max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
+   gtod = scd->tick_gtod + gtod_offset;
+   clock = gtod + delta;
+   min_clock = wrap_max(gtod, old_clock);
+   max_clock = wrap_max(old_clock, gtod + TICK_NSEC);
 
clock = wrap_max(clock, min_clock);
clock = wrap_min(clock, max_clock);
-- 
1.8.3.1



[v2 0/9] Early boot time stamps for x86

2017-03-24 Thread Pavel Tatashin
changelog
-
v1 - v2
In patch "x86/tsc: tsc early":
- added tsc_adjusted_early()
- fixed 32-bit compile error by using do_div()

Adding early boot time stamps support for x86 machines.
SPARC patch see here:
http://www.spinics.net/lists/sparclinux/msg17372.html

sample output
-
Before:
https://hastebin.com/zofiqazuze.scala

After:
https://hastebin.com/otayoliruc.scala

example why early time is crucial
-
Here is boot log on x86 machine with 8 E7-8895 CPUs, and 1T of memory:
https://hastebin.com/ebidosafic.pas

We want to improve boot time, by deferring vmemmap initialization to when
the secondary CPUs are online:
https://hastebin.com/migajadiju.pas

We see in the log that struct pages are initialized in parallel:
[   19.446609] node 0 initialised, 31980145 pages in 530ms
...

But, we have no idea whether we saved overall boot time by enabling this
feature, because the total boot time, as currently shown, has not changed
much and actually increased from 18.99s to 19.52s.

Now, with this feature we would see right away that the time is actually
decreased:
Before: https://hastebin.com/ivobehahud.pas
After:  https://hastebin.com/uvifasohon.pas

Boot time is reduced from 40.68s to 31.41s, a saving of about 23%, by
enabling the deferred page initialization feature.

However, now, we have early boot time stamps, and we can analyze even
further, and see that we still spend a lot of time in this interval:
[0.009071] kexec_core: crashkernel: memory value expected
[   11.255710] Zone ranges:

And this is where this fix comes in handy:
https://lkml.org/lkml/2017/3/23/864

The final boot time on this machine is this:
https://hastebin.com/anodiqaguj.pas

So, 20.7 seconds vs. 40.68s before; we can show this with the early boot
time stamps, and avoid future regressions by having this data always
available to us.

Pavel Tatashin (9):
  sched/clock: broken stable to unstable transfer
  sched/clock: interface to allow timestamps early in boot
  x86/cpu: determining x86 vendor early
  x86/tsc: early MSR-based CPU/TSC frequency discovery
  x86/tsc: disable early messages from quick_pit_calibrate
  x86/tsc: use cpuid to determine TSC frequency
  x86/tsc: use cpuid to determine CPU frequency
  x86/tsc: tsc early
  x86/tsc: use tsc early

 arch/x86/include/asm/processor.h |   1 +
 arch/x86/include/asm/tsc.h   |   5 ++
 arch/x86/kernel/cpu/common.c |  36 
 arch/x86/kernel/head64.c |   1 +
 arch/x86/kernel/time.c   |   1 +
 arch/x86/kernel/tsc.c| 186 ++-
 arch/x86/kernel/tsc_msr.c|  38 +---
 include/linux/sched/clock.h  |   4 +
 kernel/sched/clock.c |  70 ++-
 9 files changed, 304 insertions(+), 38 deletions(-)

-- 
1.8.3.1



[v2 4/9] x86/tsc: early MSR-based CPU/TSC frequency discovery

2017-03-24 Thread Pavel Tatashin
Allow discovering MSR-based CPU/TSC frequency early in boot. This method
works only for some Intel CPUs.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 arch/x86/include/asm/tsc.h |  1 +
 arch/x86/kernel/tsc_msr.c  | 38 +-
 2 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index f5e6f1c..893de0c 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -61,6 +61,7 @@ static inline void check_tsc_sync_target(void) { }
 extern void tsc_save_sched_clock_state(void);
 extern void tsc_restore_sched_clock_state(void);
 
+unsigned long cpu_khz_from_msr_early(int vendor, int family, int model);
 unsigned long cpu_khz_from_msr(void);
 
 #endif /* _ASM_X86_TSC_H */
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
index 19afdbd..6f5f617 100644
--- a/arch/x86/kernel/tsc_msr.c
+++ b/arch/x86/kernel/tsc_msr.c
@@ -62,22 +62,20 @@ static int match_cpu(u8 family, u8 model)
 #define id_to_freq(cpu_index, freq_id) \
(freq_desc_tables[cpu_index].freqs[freq_id])
 
-/*
- * MSR-based CPU/TSC frequency discovery for certain CPUs.
- *
- * Set global "lapic_timer_frequency" to bus_clock_cycles/jiffy
- * Return processor base frequency in KHz, or 0 on failure.
- */
-unsigned long cpu_khz_from_msr(void)
+ /*
+  * Get CPU/TSC frequency early in boot.
+  * Set global "lapic_timer_frequency" to bus_clock_cycles/jiffy
+  * Return processor base frequency in KHz, or 0 on failure.
+  */
+unsigned long cpu_khz_from_msr_early(int vendor, int family, int model)
 {
u32 lo, hi, ratio, freq_id, freq;
-   unsigned long res;
int cpu_index;
 
-   if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+   if (vendor != X86_VENDOR_INTEL)
return 0;
 
-   cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model);
+   cpu_index = match_cpu(family, model);
if (cpu_index < 0)
return 0;
 
@@ -94,13 +92,27 @@ unsigned long cpu_khz_from_msr(void)
freq_id = lo & 0x7;
freq = id_to_freq(cpu_index, freq_id);
 
-   /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */
-   res = freq * ratio;
-
 #ifdef CONFIG_X86_LOCAL_APIC
lapic_timer_frequency = (freq * 1000) / HZ;
 #endif
 
+   /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */
+   return freq * ratio;
+}
+
+/*
+ * MSR-based CPU/TSC frequency discovery for certain CPUs.
+ */
+unsigned long cpu_khz_from_msr(void)
+{
+   unsigned long res;
+
+   res = cpu_khz_from_msr_early(boot_cpu_data.x86_vendor,
+boot_cpu_data.x86,
+boot_cpu_data.x86_model);
+   if (res ==  0)
+   return 0;
+
/*
 * TSC frequency determined by MSR is always considered "known"
 * because it is reported by HW.
-- 
1.8.3.1



[v2 3/9] x86/cpu: determining x86 vendor early

2017-03-24 Thread Pavel Tatashin
In order to support early time stamps we must know the vendor id of the
chip early in boot. This patch implements it by getting the vendor string
from cpuid and comparing it against the x86 vendors known to Linux.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 arch/x86/include/asm/processor.h |  1 +
 arch/x86/kernel/cpu/common.c | 36 
 2 files changed, 37 insertions(+)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f385eca..a4128d1 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -184,6 +184,7 @@ enum cpuid_regs_idx {
 
 extern void cpu_detect(struct cpuinfo_x86 *c);
 
+int get_x86_vendor_early(void);
 extern void early_cpu_init(void);
 extern void identify_boot_cpu(void);
 extern void identify_secondary_cpu(struct cpuinfo_x86 *);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 58094a1..9c02851 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -847,6 +847,42 @@ static void __init early_identify_cpu(struct cpuinfo_x86 
*c)
fpu__init_system(c);
 }
 
+/*
+ * Returns x86_vendor early, before other platform data structures are
+ * initialized.
+ */
+int get_x86_vendor_early(void)
+{
+   int x86_vendor = X86_VENDOR_UNKNOWN;
+   const struct cpu_dev *const *cdev;
+   char vendor_str[16];
+   int count = 0;
+
+   cpuid(0, (unsigned int *)&vendor_str[12],
+ (unsigned int *)&vendor_str[0],
+ (unsigned int *)&vendor_str[8],
+ (unsigned int *)&vendor_str[4]);
+   vendor_str[12] = '\0';
+
+   for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
+   const struct cpu_dev *cpudev = *cdev;
+   const char *str1 = cpudev->c_ident[0];
+   const char *str2 = cpudev->c_ident[1];
+
+   if (count == X86_VENDOR_NUM)
+   break;
+
+   if ((!strcmp(vendor_str, str1)) ||
+   (str2 && !strcmp(vendor_str, str2))) {
+   x86_vendor = cpudev->c_x86_vendor;
+   break;
+   }
+   count++;
+   }
+
+   return x86_vendor;
+}
+
 void __init early_cpu_init(void)
 {
const struct cpu_dev *const *cdev;
-- 
1.8.3.1



[v2 2/9] sched/clock: interface to allow timestamps early in boot

2017-03-24 Thread Pavel Tatashin
In Linux, printk() can output timestamps next to every line.  This is very
useful for tracking regressions and finding places that can be optimized.
However, the timestamps are available only later in boot. On smaller
machines this is an insignificant amount of time, but on larger ones it can
be many seconds or even minutes into the boot process.

This patch adds an interface for platforms with an unstable sched clock to
show timestamps early in boot. In order to get this functionality a
platform must do the following (a short sketch follows the list):

- Implement u64 sched_clock_early()
  A clock that returns monotonic time

- Call sched_clock_early_init()
  Tells sched clock that the early clock can be used

- Call sched_clock_early_fini()
  Tells sched clock that the early clock is finished, and sched clock
  should hand over the operation to the permanent clock.
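
For illustration only, here is a minimal sketch of how a platform might wire
this up; the platform name and the counter function my_plat_read_ns() below
are hypothetical, only the three sched_clock_early_*() entry points come from
this patch:

#include <linux/init.h>
#include <linux/sched/clock.h>

/* Assumed platform counter returning monotonic nanoseconds very early. */
extern u64 my_plat_read_ns(void);

/* Strong definition overriding the weak sched_clock_early() in clock.c */
u64 sched_clock_early(void)
{
	return my_plat_read_ns();
}

void __init my_plat_early_time_init(void)
{
	/* The early clock may be used from now on. */
	sched_clock_early_init();
}

void __init my_plat_permanent_clock_ready(void)
{
	/* Hand over to the permanent clock; must happen before smp_init(). */
	sched_clock_early_fini();
}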

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 include/linux/sched/clock.h |  4 +++
 kernel/sched/clock.c| 61 +
 2 files changed, 65 insertions(+)

diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
index 4a68c67..9b06b3a 100644
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -67,6 +67,10 @@ static inline u64 local_clock(void)
 extern void sched_clock_idle_sleep_event(void);
 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
 
+void sched_clock_early_init(void);
+void sched_clock_early_fini(void);
+u64 sched_clock_early(void);
+
 /*
  * As outlined in clock.c, provides a fast, high resolution, nanosecond
  * time source that is monotonic per cpu argument and has bounded drift
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 2bc1090..6a6d57b 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -79,6 +79,15 @@ unsigned long long __weak sched_clock(void)
 
 __read_mostly int sched_clock_running;
 
+/*
+ * We start with sched clock early static branch enabled, and global status
+ * disabled.  Early in boot it is decided whether to enable the global
+ * status as well (set sched_clock_early_running to true), and later, when
+ * early clock is no longer needed, the static branch is disabled.
+ */
+static DEFINE_STATIC_KEY_TRUE(__use_sched_clock_early);
+static bool __read_mostly sched_clock_early_running;
+
 void sched_clock_init(void)
 {
sched_clock_running = 1;
@@ -188,6 +197,12 @@ void sched_clock_init_late(void)
 */
smp_mb(); /* matches {set,clear}_sched_clock_stable() */
 
+   /*
+* It is guaranteed early clock is not running anymore. This function is
+* called from sched_init_smp(), and early clock must finish before smp.
+*/
+   static_branch_disable(&__use_sched_clock_early);
+
if (__sched_clock_stable_early)
__set_sched_clock_stable();
 }
@@ -320,6 +335,11 @@ u64 sched_clock_cpu(int cpu)
if (sched_clock_stable())
return sched_clock() + raw_offset;
 
+   if (static_branch_unlikely(&__use_sched_clock_early)) {
+   if (sched_clock_early_running)
+   return sched_clock_early();
+   }
+
if (unlikely(!sched_clock_running))
return 0ull;
 
@@ -379,6 +399,47 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
+u64 __weak sched_clock_early(void)
+{
+   return 0;
+}
+
+/*
+ * Is called when sched_clock_early() is about to be finished, notifies sched
+ * clock that after this call sched_clock_early() can't be used.
+ *
+ * Must be called before smp_init() as sched_clock_early() is supposed to be
+ * used only during early boot, and is not guaranteed to handle a multi-CPU
+ * environment properly.
+ */
+void __init sched_clock_early_fini(void)
+{
+   struct sched_clock_data *scd = this_scd();
+   u64 now_early = sched_clock_early();
+   u64 now_sched = sched_clock();
+
+   /*
+* Set both: gtod_offset and raw_offset because we could switch to
+* either unstable clock or stable clock.
+*/
+   gtod_offset = now_early - scd->tick_gtod;
+   raw_offset = now_early - now_sched;
+
+   sched_clock_early_running = false;
+}
+
+/*
+ * Notifies sched clock that an early boot clocksource is available, meaning
+ * that the current platform has implemented sched_clock_early().
+ *
+ * The early clock is running until we switch to a stable clock, or when we
+ * learn that the stable clock is not available.
+ */
+void __init sched_clock_early_init(void)
+{
+   sched_clock_early_running = true;
+}
+
 #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
 
 u64 sched_clock_cpu(int cpu)
-- 
1.8.3.1



[v2 8/9] x86/tsc: tsc early

2017-03-24 Thread Pavel Tatashin
tsc_early_init():
Use various methods to determine the availability of the TSC feature and its
frequency early in boot, and if that is possible initialize the TSC and also
call sched_clock_early_init() to be able to get timestamps early in boot.

tsc_early_fini():
Implement the finish part of the early tsc feature: print a message about the
offset, which can be useful to find out how much time was spent in POST and
the boot manager, and also call sched_clock_early_fini() to let sched clock
know that the early clock is finished.

sched_clock_early():
TSC-based implementation of the weak function that is defined in sched clock.
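
As a hedged sketch of the conversion that tsc_early_init() has to set up (the
corresponding hunk is truncated below), the mult/shift pair consumed by
sched_clock_early() could be derived from the discovered frequency with the
existing clocks_calc_mult_shift() helper. The helper name
tsc_early_setup_cyc2ns() and the omission of the offset handling are
assumptions of this sketch, not part of the patch; cyc2ns_early is the
structure added by this patch:

#include <linux/clocksource.h>
#include <linux/time64.h>

/*
 * Sketch: fill cyc2ns_early so that
 *   ns = (tsc_cycles * cyc2ns_mul) >> cyc2ns_shift
 * holds for a TSC running at "khz" kHz. Offset handling is elided.
 */
static void __init tsc_early_setup_cyc2ns(unsigned long khz)
{
	clocks_calc_mult_shift(&cyc2ns_early.cyc2ns_mul,
			       &cyc2ns_early.cyc2ns_shift,
			       khz, NSEC_PER_MSEC, 0);
}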

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 arch/x86/include/asm/tsc.h |   4 ++
 arch/x86/kernel/tsc.c  | 127 +
 2 files changed, 131 insertions(+)

diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 893de0c..a8c7f2e 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -50,11 +50,15 @@ static inline cycles_t get_cycles(void)
 extern void tsc_verify_tsc_adjust(bool resume);
 extern void check_tsc_sync_source(int cpu);
 extern void check_tsc_sync_target(void);
+void tsc_early_init(void);
+void tsc_early_fini(void);
 #else
 static inline bool tsc_store_and_check_tsc_adjust(bool bootcpu) { return 
false; }
 static inline void tsc_verify_tsc_adjust(bool resume) { }
 static inline void check_tsc_sync_source(int cpu) { }
 static inline void check_tsc_sync_target(void) { }
+static inline void tsc_early_init(void) { }
+static inline void tsc_early_fini(void) { }
 #endif
 
 extern int notsc_setup(char *);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 58bd575..8782af5 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -25,10 +25,13 @@
 #include 
 #include 
 #include 
+#include 
 
 /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */
 #define CPUID_TSC_LEAF 0x15
 
+static struct cyc2ns_data __read_mostly cyc2ns_early;
+
 unsigned int __read_mostly cpu_khz;/* TSC clocks / usec, not used here */
 EXPORT_SYMBOL(cpu_khz);
 
@@ -290,6 +293,16 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
sched_clock_idle_wakeup_event(0);
local_irq_restore(flags);
 }
+
+u64 sched_clock_early(void)
+{
+   u64 ns;
+
+   ns = mul_u64_u32_shr(rdtsc(), cyc2ns_early.cyc2ns_mul,
+cyc2ns_early.cyc2ns_shift);
+   return ns + cyc2ns_early.cyc2ns_offset;
+}
+
 /*
  * Scheduler clock - returns current time in nanosec units.
  */
@@ -1365,6 +1378,120 @@ static int __init init_tsc_clocksource(void)
  */
 device_initcall(init_tsc_clocksource);
 
+#ifdef CONFIG_X86_TSC
+
+/* Determine if tsc is invariant early in boot */
+static bool __init tsc_invariant_early(void)
+{
+   unsigned int ext_cpuid_level, tsc_flag;
+
+   /* Get extended CPUID level */
+   ext_cpuid_level = cpuid_eax(0x80000000);
+   if (ext_cpuid_level < 0x80000007)
+   return false;
+
+   /* get field with invariant TSC flag */
+   tsc_flag = cpuid_edx(0x80000007);
+   if (!(tsc_flag & (1 << 8)))
+   return false;
+
+   return true;
+}
+
+/* Returns true if TSC was adjusted by BIOS. */
+static __init bool
+tsc_adjusted_early(int cpuid_level)
+{
+   u64 adjusted = 0;
+
+   /*  IA32_TSC_ADJUST MSR is bit 1 in ebx register of level 7 cpuid */
+   if (cpuid_level > 6 && (cpuid_ebx(7) & (1 << 1)))
+   rdmsrl(MSR_IA32_TSC_ADJUST, adjusted);
+
+   return (adjusted != 0);
+}
+
+/*
+ * Determine if we can use TSC early in boot. On larger machines early boot can
+ * take a significant amount of time, therefore, for observability reasons, and
+ * also to avoid regressions it is important to have timestamps during the 
whole
+ * boot process.
+ */
+void __init tsc_early_init(void)
+{
+   int vendor, model, family, cpuid_level;
+   unsigned int sig, khz;
+   u64 tsc_now;
+
+   /*
+* Should we disable early timestamps on platforms without invariant
+* TSC?
+*
+* On the one hand invariant TSC guarantees, that early timestamps run
+* only on the latest hardware (Nehalem and later), but on the other
+* hand accuracy wise, non-invariant timestamps should be OK,
+* because during early boot power management features are not used.
+* ---
+* For now we require invariant TSC to enable timestamps early in boot.
+*/
+   if (!tsc_invariant_early())
+   return;
+
+   cpuid_level = cpuid_eax(0);
+   sig = cpuid_eax(1);
+   model = x86_model(sig);
+   family = x86_family(sig);
+   vendor = get_x86_vendor_early();
+
+   if (tsc_adjusted_early(cpuid_level))
+   return;
+
+   /*
+* Try several methods to get the TSC frequency; if they all fail,
+* return. Otherwise set up the mult and shift values to convert ticks
+* to nanoseconds efficiently.
+  

[v2 7/9] x86/tsc: use cpuid to determine CPU frequency

2017-03-24 Thread Pavel Tatashin
Newer processors implement a cpuid extension that reports the CPU frequency.
This patch adds a function that can read it early in boot.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 arch/x86/kernel/tsc.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 1c9fc23..58bd575 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -743,14 +743,14 @@ unsigned long native_calibrate_tsc(void)
return tsc_khz;
 }
 
-static unsigned long cpu_khz_from_cpuid(void)
+static unsigned long cpu_khz_from_cpuid_early(int vendor, int cpuid_level)
 {
unsigned int eax_base_mhz, ebx_max_mhz, ecx_bus_mhz, edx;
 
-   if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+   if (vendor != X86_VENDOR_INTEL)
return 0;
 
-   if (boot_cpu_data.cpuid_level < 0x16)
+   if (cpuid_level < 0x16)
return 0;
 
eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0;
@@ -770,7 +770,9 @@ unsigned long native_calibrate_cpu(void)
unsigned long flags, latch, ms, fast_calibrate;
int hpet = is_hpet_enabled(), i, loopmin;
 
-   fast_calibrate = cpu_khz_from_cpuid();
+   fast_calibrate = cpu_khz_from_cpuid_early(boot_cpu_data.x86_vendor,
+ boot_cpu_data.cpuid_level);
+
if (fast_calibrate)
return fast_calibrate;
 
-- 
1.8.3.1



[v2 9/9] x86/tsc: use tsc early

2017-03-24 Thread Pavel Tatashin
Call tsc_early_init() to initialize the early boot time stamp functionality
on supported x86 platforms, and call tsc_early_fini() to finish this feature
after the permanent tsc has been initialized.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 arch/x86/kernel/head64.c | 1 +
 arch/x86/kernel/time.c   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index b5785c1..1068a56 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -157,6 +157,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * 
real_mode_data)
clear_bss();
 
clear_page(init_level4_pgt);
+   tsc_early_init();
 
kasan_early_init();
 
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index d39c091..2d691eb3 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -85,6 +85,7 @@ static __init void x86_late_time_init(void)
 {
x86_init.timers.timer_init();
tsc_init();
+   tsc_early_fini();
 }
 
 /*
-- 
1.8.3.1



[v4 00/15] complete deferred page initialization

2017-08-02 Thread Pavel Tatashin
Changelog:
v3 - v4
- Rewrote code to zero struct pages in __init_single_page() as
  suggested by Michal Hocko
- Added code to handle issues related to accessing struct page
  memory before it is initialized.

v2 - v3
- Addressed David Miller comments about one change per patch:
* Splited changes to platforms into 4 patches
* Made "do not zero vmemmap_buf" as a separate patch

v1 - v2
- Per request, added s390 to deferred "struct page" zeroing
- Collected performance data on x86 which proves the importance of
  keeping memset() as a prefetch (see below).

SMP machines can benefit from the DEFERRED_STRUCT_PAGE_INIT config option,
which defers initializing struct pages until all cpus have been started so
it can be done in parallel.

However, this feature is sub-optimal, because the deferred page
initialization code expects that the struct pages have already been zeroed,
and the zeroing is done early in boot with a single thread only.  Also, we
access that memory and set flags before struct pages are initialized. All
of this is fixed in this patchset.

In this work we do the following:
- Never read access struct page until it was initialized
- Never set any fields in struct pages before they are initialized
- Zero struct page at the beginning of struct page initialization

Performance improvements on x86 machine with 8 nodes:
Intel(R) Xeon(R) CPU E7-8895 v3 @ 2.60GHz

Single threaded struct page init: 7.6s/T improvement
Deferred struct page init: 10.2s/T improvement

Pavel Tatashin (15):
  x86/mm: reserve only existing low pages
  x86/mm: setting fields in deferred pages
  sparc64/mm: setting fields in deferred pages
  mm: discard memblock data later
  mm: don't access uninitialized struct pages
  sparc64: simplify vmemmap_populate
  mm: defining memblock_virt_alloc_try_nid_raw
  mm: zero struct pages during initialization
  sparc64: optimized struct page zeroing
  x86/kasan: explicitly zero kasan shadow memory
  arm64/kasan: explicitly zero kasan shadow memory
  mm: explicitly zero pagetable memory
  mm: stop zeroing memory during allocation in vmemmap
  mm: optimize early system hash allocations
  mm: debug for raw allocator

 arch/arm64/mm/kasan_init.c  |  32 
 arch/sparc/include/asm/pgtable_64.h |  18 +
 arch/sparc/mm/init_64.c |  31 +++-
 arch/x86/kernel/setup.c |   5 +-
 arch/x86/mm/init_64.c   |   9 ++-
 arch/x86/mm/kasan_init_64.c |  29 +++
 include/linux/bootmem.h |  11 +++
 include/linux/memblock.h|  10 ++-
 include/linux/mm.h  |   9 +++
 mm/memblock.c   | 152 
 mm/nobootmem.c  |  16 
 mm/page_alloc.c |  29 ---
 mm/sparse-vmemmap.c |  10 ++-
 mm/sparse.c |   6 +-
 14 files changed, 279 insertions(+), 88 deletions(-)

--
2.13.3



[v4 09/15] sparc64: optimized struct page zeroing

2017-08-02 Thread Pavel Tatashin
Add an optimized mm_zero_struct_page(), so struct pages are zeroed without
calling memset(). We do eight regular stores and thus avoid the cost of a membar.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 arch/sparc/include/asm/pgtable_64.h | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/arch/sparc/include/asm/pgtable_64.h 
b/arch/sparc/include/asm/pgtable_64.h
index 6fbd931f0570..23ad51ea5340 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -230,6 +230,24 @@ extern unsigned long _PAGE_ALL_SZ_BITS;
 extern struct page *mem_map_zero;
 #define ZERO_PAGE(vaddr)   (mem_map_zero)
 
+/* This macro must be updated when the size of struct page changes,
+ * so use static assert to enforce the assumed size.
+ */
+#define mm_zero_struct_page(pp) \
+   do {\
+   unsigned long *_pp = (void *)(pp);  \
+   \
+   BUILD_BUG_ON(sizeof(struct page) != 64);\
+   _pp[0] = 0; \
+   _pp[1] = 0; \
+   _pp[2] = 0; \
+   _pp[3] = 0; \
+   _pp[4] = 0; \
+   _pp[5] = 0; \
+   _pp[6] = 0; \
+   _pp[7] = 0; \
+   } while (0)
+
 /* PFNs are real physical page numbers.  However, mem_map only begins to record
  * per-page information starting at pfn_base.  This is to handle systems where
  * the first physical page in the machine is at some huge physical address,
-- 
2.13.3



[v4 03/15] sparc64/mm: setting fields in deferred pages

2017-08-02 Thread Pavel Tatashin
Without the deferred struct page feature (CONFIG_DEFERRED_STRUCT_PAGE_INIT),
flags and other fields in struct pages are never changed prior to first
initializing them by going through __init_single_page().

With the deferred struct page feature enabled, there is a case where we set
some fields prior to initialization:

 mem_init() {
 register_page_bootmem_info();
 free_all_bootmem();
 ...
 }

When register_page_bootmem_info() is called, only non-deferred struct pages
are initialized. But this function goes through some reserved pages which
might be part of the deferred range, and thus are not yet initialized.

mem_init
register_page_bootmem_info
 register_page_bootmem_info_node
  get_page_bootmem
   .. setting fields here ..
   such as: page->freelist = (void *)type;

We end up with a similar issue as in the previous patch: currently we do not
observe a problem because the memory is zeroed. But if flag asserts are
changed, we can start hitting issues.

Also, because in this patch series we will stop zeroing struct page memory
during allocation, we must make sure that struct pages are properly
initialized prior to using them.

The deferred-reserved pages are initialized in free_all_bootmem().
Therefore, the fix is to switch the above calls.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 arch/sparc/mm/init_64.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 3c40ebd50f92..ba957b763c07 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2464,9 +2464,15 @@ void __init mem_init(void)
 {
high_memory = __va(last_valid_pfn << PAGE_SHIFT);
 
-   register_page_bootmem_info();
free_all_bootmem();
 
+   /* Must be done after boot memory is put on freelist, because here we
+* might set fields in deferred struct pages that have not yet been
+* initialized, and free_all_bootmem() initializes all the reserved
+* deferred pages for us.
+*/
+   register_page_bootmem_info();
+
/*
 * Set up the zero page, mark it reserved, so that page count
 * is not manipulated when freeing the page from user ptes.
-- 
2.13.3



[v4 06/15] sparc64: simplify vmemmap_populate

2017-08-02 Thread Pavel Tatashin
Remove duplicated code by using the common functions
vmemmap_pud_populate() and vmemmap_pgd_populate().

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 arch/sparc/mm/init_64.c | 23 ++-
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index ba957b763c07..e18947e9ab6c 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2567,30 +2567,19 @@ int __meminit vmemmap_populate(unsigned long vstart, 
unsigned long vend,
vstart = vstart & PMD_MASK;
vend = ALIGN(vend, PMD_SIZE);
for (; vstart < vend; vstart += PMD_SIZE) {
-   pgd_t *pgd = pgd_offset_k(vstart);
+   pgd_t *pgd = vmemmap_pgd_populate(vstart, node);
unsigned long pte;
pud_t *pud;
pmd_t *pmd;
 
-   if (pgd_none(*pgd)) {
-   pud_t *new = vmemmap_alloc_block(PAGE_SIZE, node);
+   if (!pgd)
+   return -ENOMEM;
 
-   if (!new)
-   return -ENOMEM;
-   pgd_populate(&init_mm, pgd, new);
-   }
-
-   pud = pud_offset(pgd, vstart);
-   if (pud_none(*pud)) {
-   pmd_t *new = vmemmap_alloc_block(PAGE_SIZE, node);
-
-   if (!new)
-   return -ENOMEM;
-   pud_populate(&init_mm, pud, new);
-   }
+   pud = vmemmap_pud_populate(pgd, vstart, node);
+   if (!pud)
+   return -ENOMEM;
 
pmd = pmd_offset(pud, vstart);
-
pte = pmd_val(*pmd);
if (!(pte & _PAGE_VALID)) {
void *block = vmemmap_alloc_block(PMD_SIZE, node);
-- 
2.13.3



[v4 05/15] mm: don't access uninitialized struct pages

2017-08-02 Thread Pavel Tatashin
In deferred_init_memmap() where all deferred struct pages are initialized
we have a check like this:

if (page->flags) {
VM_BUG_ON(page_zone(page) != zone);
goto free_range;
}

This way we are checking whether the current deferred page has already been
initialized. It works because the memory for struct pages has been zeroed, and
the only way flags can be non-zero is if the page went through
__init_single_page() before.  But once we change the current behavior and no
longer zero the memory in the memblock allocator, we cannot trust anything
inside struct pages until they are initialized. This patch fixes this.

This patch defines a new accessor, memblock_get_reserved_pfn_range(),
which returns successive ranges of reserved PFNs.  deferred_init_memmap()
calls it to determine whether a PFN and its struct page have already been
initialized.
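
The deferred_init_memmap() side of this change is cut off in this archive, so
the following is only an illustrative sketch of the intended usage; the helper
name deferred_pfn_reserved() is hypothetical, and real code would cache the
returned range instead of searching for every PFN:

#include <linux/memblock.h>

/*
 * Sketch: a PFN whose struct page was already initialized by
 * free_all_bootmem() is one that falls inside a reserved memblock range.
 */
static bool __init deferred_pfn_reserved(unsigned long pfn)
{
	unsigned long start_pfn, end_pfn;

	memblock_get_reserved_pfn_range(pfn, &start_pfn, &end_pfn);

	return pfn >= start_pfn && pfn < end_pfn;
}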

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 include/linux/memblock.h |  3 +++
 mm/memblock.c| 54 ++--
 mm/page_alloc.c  | 11 +-
 3 files changed, 61 insertions(+), 7 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index c89d16c88512..9d8dabedf5ba 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -321,6 +321,9 @@ int memblock_is_map_memory(phys_addr_t addr);
 int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
 bool memblock_is_reserved(phys_addr_t addr);
 bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
+void memblock_get_reserved_pfn_range(unsigned long pfn,
+unsigned long *pfn_start,
+unsigned long *pfn_end);
 
 extern void __memblock_dump_all(void);
 
diff --git a/mm/memblock.c b/mm/memblock.c
index 3a2707914064..e6df054e3180 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1580,7 +1580,13 @@ void __init memblock_mem_limit_remove_map(phys_addr_t 
limit)
memblock_cap_memory_range(0, max_addr);
 }
 
-static int __init_memblock memblock_search(struct memblock_type *type, 
phys_addr_t addr)
+/**
+ * Return index in regions array if addr is within the region. Otherwise
+ * return -1. If -1 is returned and next_idx is not %NULL, *next_idx is set
+ * to the next region index, or -1 if there is none.
+ */
+static int __init_memblock memblock_search(struct memblock_type *type,
+  phys_addr_t addr, int *next_idx)
 {
unsigned int left = 0, right = type->cnt;
 
@@ -1595,22 +1601,26 @@ static int __init_memblock memblock_search(struct 
memblock_type *type, phys_addr
else
return mid;
} while (left < right);
+
+   if (next_idx)
+   *next_idx = (right == type->cnt) ? -1 : right;
+
return -1;
 }
 
 bool __init memblock_is_reserved(phys_addr_t addr)
 {
-   return memblock_search(&memblock.reserved, addr) != -1;
+   return memblock_search(&memblock.reserved, addr, NULL) != -1;
 }
 
 bool __init_memblock memblock_is_memory(phys_addr_t addr)
 {
-   return memblock_search(&memblock.memory, addr) != -1;
+   return memblock_search(&memblock.memory, addr, NULL) != -1;
 }
 
 int __init_memblock memblock_is_map_memory(phys_addr_t addr)
 {
-   int i = memblock_search(&memblock.memory, addr);
+   int i = memblock_search(&memblock.memory, addr, NULL);
 
if (i == -1)
return false;
@@ -1622,7 +1632,7 @@ int __init_memblock memblock_search_pfn_nid(unsigned long 
pfn,
 unsigned long *start_pfn, unsigned long *end_pfn)
 {
struct memblock_type *type = &memblock.memory;
-   int mid = memblock_search(type, PFN_PHYS(pfn));
+   int mid = memblock_search(type, PFN_PHYS(pfn), NULL);
 
if (mid == -1)
return -1;
@@ -1646,7 +1656,7 @@ int __init_memblock memblock_search_pfn_nid(unsigned long 
pfn,
  */
 int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t 
size)
 {
-   int idx = memblock_search(&memblock.memory, base);
+   int idx = memblock_search(&memblock.memory, base, NULL);
phys_addr_t end = base + memblock_cap_size(base, &size);
 
if (idx == -1)
@@ -1656,6 +1666,38 @@ int __init_memblock 
memblock_is_region_memory(phys_addr_t base, phys_addr_t size
 }
 
 /**
+ * memblock_get_reserved_pfn_range - search for the next reserved region
+ *
+ * @pfn: start searching from this pfn.
+ *
+ * RETURNS:
+ * [start_pfn, end_pfn), where start_pfn >= pfn. If none is found,
+ * start_pfn and end_pfn are both set to ULONG_MAX.
+ */
+void __init_memblock memblock_get_reserved_pfn_range(unsigned long pfn,
+unsigned long *start_pfn,
+unsigned long *end_pfn)
+{
+   struct memblock_type *type = &memblock.reserved;
+   int next_idx, idx;
+
+   idx = memblock_s

[v4 01/15] x86/mm: reserve only existing low pages

2017-08-02 Thread Pavel Tatashin
Struct pages are initialized by going through __init_single_page(). Since
the existing physical memory in memblock is represented in memblock.memory
list, struct page for every page from this list goes through
__init_single_page().

The second memblock list: memblock.reserved, manages the allocated memory.
The memory that won't be available to kernel allocator. So, every page from
this list goes through reserve_bootmem_region(), where certain struct page
fields are set, the assumption being that the struct pages have been
initialized beforehand.

In trim_low_memory_range() we unconditionally reserve memory from PFN 0, but
memblock.memory might start at a later PFN. For example, in QEMU,
e820__memblock_setup() can use PFN 1 as the first PFN in memblock.memory,
so PFN 0 is not on memblock.memory (and hence isn't initialized via
__init_single_page) but is on memblock.reserved (and hence we set fields in
the uninitialized struct page).

Currently, the struct page memory is always zeroed during allocation,
which prevents this problem from being detected. But, if some asserts
provided by CONFIG_DEBUG_VM_PGFLAGS are tightened, this problem may become
visible in existing kernels.

In this patchset we will stop zeroing struct page memory during allocation.
Therefore, this bug must be fixed in order to avoid random assert failures
caused by CONFIG_DEBUG_VM_PGFLAGS triggers.

The fix is to reserve memory from the first existing PFN.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 arch/x86/kernel/setup.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3486d0498800..489cdc141bcb 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -790,7 +790,10 @@ early_param("reservelow", parse_reservelow);
 
 static void __init trim_low_memory_range(void)
 {
-   memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
+   unsigned long min_pfn = find_min_pfn_with_active_regions();
+   phys_addr_t base = min_pfn << PAGE_SHIFT;
+
+   memblock_reserve(base, ALIGN(reserve_low, PAGE_SIZE));
 }

 /*
-- 
2.13.3



[v4 11/15] arm64/kasan: explicitly zero kasan shadow memory

2017-08-02 Thread Pavel Tatashin
To optimize the performance of struct page initialization,
vmemmap_populate() will no longer zero memory.

We must explicitly zero the memory that is allocated by vmemmap_populate()
for kasan, as this memory does not go through struct page initialization
path.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 arch/arm64/mm/kasan_init.c | 32 
 1 file changed, 32 insertions(+)

diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index 81f03959a4ab..a57104bc54b8 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -135,6 +135,31 @@ static void __init clear_pgds(unsigned long start,
set_pgd(pgd_offset_k(start), __pgd(0));
 }
 
+/*
+ * Memory that was allocated by vmemmap_populate is not zeroed, so we must
+ * zero it here explicitly.
+ */
+static void
+zero_vemmap_populated_memory(void)
+{
+   struct memblock_region *reg;
+   u64 start, end;
+
+   for_each_memblock(memory, reg) {
+   start = __phys_to_virt(reg->base);
+   end = __phys_to_virt(reg->base + reg->size);
+
+   if (start >= end)
+   break;
+
+   memset((void *)start, 0, end - start);
+   }
+
+   start = (u64)kasan_mem_to_shadow(_stext);
+   end = (u64)kasan_mem_to_shadow(_end);
+   memset((void *)start, 0, end - start);
+}
+
 void __init kasan_init(void)
 {
u64 kimg_shadow_start, kimg_shadow_end;
@@ -205,6 +230,13 @@ void __init kasan_init(void)
pfn_pte(sym_to_pfn(kasan_zero_page), PAGE_KERNEL_RO));
 
memset(kasan_zero_page, 0, PAGE_SIZE);
+
+   /*
+* vmemmap_populate does not zero the memory, so we need to zero it
+* explicitly
+*/
+   zero_vemmap_populated_memory();
+
cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
 
/* At this point kasan is fully initialized. Enable error messages */
-- 
2.13.3



[v4 15/15] mm: debug for raw allocator

2017-08-02 Thread Pavel Tatashin
When CONFIG_DEBUG_VM is enabled, this patch sets all the memory that is
returned by memblock_virt_alloc_try_nid_raw() to ones, to ensure that no
callers expect zeroed memory.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 mm/memblock.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/mm/memblock.c b/mm/memblock.c
index bdf31f207fa4..b6f90e75946c 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1363,12 +1363,19 @@ void * __init memblock_virt_alloc_try_nid_raw(
phys_addr_t min_addr, phys_addr_t max_addr,
int nid)
 {
+   void *ptr;
+
memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx 
max_addr=0x%llx %pF\n",
 __func__, (u64)size, (u64)align, nid, (u64)min_addr,
 (u64)max_addr, (void *)_RET_IP_);
 
-   return memblock_virt_alloc_internal(size, align,
-   min_addr, max_addr, nid);
+   ptr = memblock_virt_alloc_internal(size, align,
+  min_addr, max_addr, nid);
+#ifdef CONFIG_DEBUG_VM
+   if (ptr && size > 0)
+   memset(ptr, 0xff, size);
+#endif
+   return ptr;
 }
 
 /**
-- 
2.13.3



[v4 10/15] x86/kasan: explicitly zero kasan shadow memory

2017-08-02 Thread Pavel Tatashin
To optimize the performance of struct page initialization,
vmemmap_populate() will no longer zero memory.

We must explicitly zero the memory that is allocated by vmemmap_populate()
for kasan, as this memory does not go through struct page initialization
path.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 arch/x86/mm/kasan_init_64.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 02c9d7553409..7d06cf0b0b6e 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -84,6 +84,28 @@ static struct notifier_block kasan_die_notifier = {
 };
 #endif
 
+/*
+ * Memory that was allocated by vmemmap_populate is not zeroed, so we must
+ * zero it here explicitly.
+ */
+static void
+zero_vemmap_populated_memory(void)
+{
+   u64 i, start, end;
+
+   for (i = 0; i < E820_MAX_ENTRIES && pfn_mapped[i].end; i++) {
+   void *kaddr_start = pfn_to_kaddr(pfn_mapped[i].start);
+   void *kaddr_end = pfn_to_kaddr(pfn_mapped[i].end);
+
+   start = (u64)kasan_mem_to_shadow(kaddr_start);
+   end = (u64)kasan_mem_to_shadow(kaddr_end);
+   memset((void *)start, 0, end - start);
+   }
+   start = (u64)kasan_mem_to_shadow(_stext);
+   end = (u64)kasan_mem_to_shadow(_end);
+   memset((void *)start, 0, end - start);
+}
+
 void __init kasan_early_init(void)
 {
int i;
@@ -156,6 +178,13 @@ void __init kasan_init(void)
pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO);
set_pte(&kasan_zero_pte[i], pte);
}
+
+   /*
+* vmemmap_populate does not zero the memory, so we need to zero it
+* explicitly
+*/
+   zero_vemmap_populated_memory();
+
/* Flush TLBs again to be sure that write protection applied. */
__flush_tlb_all();
 
-- 
2.13.3



[v4 02/15] x86/mm: setting fields in deferred pages

2017-08-02 Thread Pavel Tatashin
Without the deferred struct page feature (CONFIG_DEFERRED_STRUCT_PAGE_INIT),
flags and other fields in struct pages are never changed prior to first
initializing them by going through __init_single_page().

With the deferred struct page feature enabled, there is a case where we set
some fields prior to initialization:

mem_init() {
register_page_bootmem_info();
free_all_bootmem();
...
}

When register_page_bootmem_info() is called, only non-deferred struct pages
are initialized. But this function goes through some reserved pages which
might be part of the deferred range, and thus are not yet initialized.

  mem_init
   register_page_bootmem_info
register_page_bootmem_info_node
 get_page_bootmem
  .. setting fields here ..
  such as: page->freelist = (void *)type;

We end up with a similar issue as in the previous patch: currently we do not
observe a problem because the memory is zeroed. But if flag asserts are
changed, we can start hitting issues.

Also, because in this patch series we will stop zeroing struct page memory
during allocation, we must make sure that struct pages are properly
initialized prior to using them.

The deferred-reserved pages are initialized in free_all_bootmem().
Therefore, the fix is to switch the above calls.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 arch/x86/mm/init_64.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 136422d7d539..1e863baec847 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1165,12 +1165,17 @@ void __init mem_init(void)
 
/* clear_bss() already clear the empty_zero_page */
 
-   register_page_bootmem_info();
-
/* this will put all memory onto the freelists */
free_all_bootmem();
after_bootmem = 1;
 
+   /* Must be done after boot memory is put on freelist, because here we
+* might set fields in deferred struct pages that have not yet been
+* initialized, and free_all_bootmem() initializes all the reserved
+* deferred pages for us.
+*/
+   register_page_bootmem_info();
+
/* Register memory areas for /proc/kcore */
kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR,
 PAGE_SIZE, KCORE_OTHER);
-- 
2.13.3



[v4 07/15] mm: defining memblock_virt_alloc_try_nid_raw

2017-08-02 Thread Pavel Tatashin
A new variant of memblock_virt_alloc_* allocations:
memblock_virt_alloc_try_nid_raw()
- Does not zero the allocated memory
- Does not panic if request cannot be satisfied

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 include/linux/bootmem.h | 11 ++
 mm/memblock.c   | 53 ++---
 2 files changed, 57 insertions(+), 7 deletions(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index e223d91b6439..0a0a37f3e292 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -160,6 +160,9 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
 #define BOOTMEM_ALLOC_ANYWHERE (~(phys_addr_t)0)
 
 /* FIXME: Move to memblock.h at a point where we remove nobootmem.c */
+void *memblock_virt_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr,
+ phys_addr_t max_addr, int nid);
 void *memblock_virt_alloc_try_nid_nopanic(phys_addr_t size,
phys_addr_t align, phys_addr_t min_addr,
phys_addr_t max_addr, int nid);
@@ -176,6 +179,14 @@ static inline void * __init memblock_virt_alloc(
NUMA_NO_NODE);
 }
 
+static inline void * __init memblock_virt_alloc_raw(
+   phys_addr_t size,  phys_addr_t align)
+{
+   return memblock_virt_alloc_try_nid_raw(size, align, BOOTMEM_LOW_LIMIT,
+   BOOTMEM_ALLOC_ACCESSIBLE,
+   NUMA_NO_NODE);
+}
+
 static inline void * __init memblock_virt_alloc_nopanic(
phys_addr_t size, phys_addr_t align)
 {
diff --git a/mm/memblock.c b/mm/memblock.c
index e6df054e3180..bdf31f207fa4 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1327,7 +1327,6 @@ static void * __init memblock_virt_alloc_internal(
return NULL;
 done:
ptr = phys_to_virt(alloc);
-   memset(ptr, 0, size);
 
/*
 * The min_count is set to 0 so that bootmem allocated blocks
@@ -1341,6 +1340,38 @@ static void * __init memblock_virt_alloc_internal(
 }
 
 /**
+ * memblock_virt_alloc_try_nid_raw - allocate boot memory block without zeroing
+ * memory and without panicking
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region from where the allocation
+ *   is preferred (phys address)
+ * @max_addr: the upper bound of the memory region from where the allocation
+ *   is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
+ *   allocate only from memory limited by memblock.current_limit value
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Public function, provides additional debug information (including caller
+ * info), if enabled. Does not zero allocated memory, does not panic if request
+ * cannot be satisfied.
+ *
+ * RETURNS:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+void * __init memblock_virt_alloc_try_nid_raw(
+   phys_addr_t size, phys_addr_t align,
+   phys_addr_t min_addr, phys_addr_t max_addr,
+   int nid)
+{
+   memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx 
max_addr=0x%llx %pF\n",
+__func__, (u64)size, (u64)align, nid, (u64)min_addr,
+(u64)max_addr, (void *)_RET_IP_);
+
+   return memblock_virt_alloc_internal(size, align,
+   min_addr, max_addr, nid);
+}
+
+/**
  * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block
  * @size: size of memory block to be allocated in bytes
  * @align: alignment of the region and block's size
@@ -1351,8 +1382,8 @@ static void * __init memblock_virt_alloc_internal(
  *   allocate only from memory limited by memblock.current_limit value
  * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
  *
- * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides
- * additional debug information (including caller info), if enabled.
+ * Public function, provides additional debug information (including caller
+ * info), if enabled. This function zeroes the allocated memory.
  *
  * RETURNS:
  * Virtual address of allocated memory block on success, NULL on failure.
@@ -1362,11 +1393,17 @@ void * __init memblock_virt_alloc_try_nid_nopanic(
phys_addr_t min_addr, phys_addr_t max_addr,
int nid)
 {
+   void *ptr;
+
memblock_dbg("%s: %llu bytes align=0x%llx ni

[v4 04/15] mm: discard memblock data later

2017-08-02 Thread Pavel Tatashin
There is an existing use-after-free bug when deferred struct pages are
enabled:

memblock_add() allocates memory for the memory array if more than
128 entries are needed.  See the comment in e820__memblock_setup():

  * The bootstrap memblock region count maximum is 128 entries
  * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries
  * than that - so allow memblock resizing.

This memblock memory is freed here:
free_low_memory_core_early()

We access the freed memblock.memory later in boot when deferred pages are
initialized in this path:

deferred_init_memmap()
for_each_mem_pfn_range()
  __next_mem_pfn_range()
type = &memblock.memory

One possible explanation for why this use-after-free hasn't been hit
before is that the limit of INIT_MEMBLOCK_REGIONS has never been exceeded,
at least on systems where deferred struct pages were enabled.

Another reason why we want this problem fixed in this patch series is,
in the next patch, we will need to access memblock.reserved from
deferred_init_memmap().
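
The mm/page_alloc.c hunk (the caller of memblock_discard()) is not visible in
this archive; as a sketch of the intent, the discard has to happen only after
deferred struct page initialization is done walking the memblock arrays, for
example from page_alloc_init_late(). The exact call site is an assumption of
this sketch; the no-op stub for !CONFIG_ARCH_DISCARD_MEMBLOCK comes from the
header hunk below, so no #ifdef is needed at the caller:

#include <linux/memblock.h>

/* Sketch: free the dynamically allocated memblock arrays only after
 * deferred struct page initialization has finished using them.
 */
void __init page_alloc_init_late(void)
{
	/* ... wait for all pgdatinit threads to complete ... */

	/* Discard memblock private memory */
	memblock_discard();

	/* ... */
}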

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 include/linux/memblock.h |  7 +--
 mm/memblock.c| 38 +-
 mm/nobootmem.c   | 16 
 mm/page_alloc.c  |  2 ++
 4 files changed, 24 insertions(+), 39 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 77d427974f57..c89d16c88512 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -61,9 +61,11 @@ extern int memblock_debug;
 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
 #define __init_memblock __meminit
 #define __initdata_memblock __meminitdata
+void memblock_discard(void);
 #else
 #define __init_memblock
 #define __initdata_memblock
+#define memblock_discard()
 #endif
 
 #define memblock_dbg(fmt, ...) \
@@ -74,8 +76,6 @@ phys_addr_t memblock_find_in_range_node(phys_addr_t size, 
phys_addr_t align,
int nid, ulong flags);
 phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
   phys_addr_t size, phys_addr_t align);
-phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr);
-phys_addr_t get_allocated_memblock_memory_regions_info(phys_addr_t *addr);
 void memblock_allow_resize(void);
 int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
 int memblock_add(phys_addr_t base, phys_addr_t size);
@@ -110,6 +110,9 @@ void __next_mem_range_rev(u64 *idx, int nid, ulong flags,
 void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start,
phys_addr_t *out_end);
 
+void __memblock_free_early(phys_addr_t base, phys_addr_t size);
+void __memblock_free_late(phys_addr_t base, phys_addr_t size);
+
 /**
  * for_each_mem_range - iterate through memblock areas from type_a and not
  * included in type_b. Or just type_a if type_b is NULL.
diff --git a/mm/memblock.c b/mm/memblock.c
index 2cb25fe4452c..3a2707914064 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -285,31 +285,27 @@ static void __init_memblock memblock_remove_region(struct 
memblock_type *type, u
 }
 
 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
-
-phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
-   phys_addr_t *addr)
-{
-   if (memblock.reserved.regions == memblock_reserved_init_regions)
-   return 0;
-
-   *addr = __pa(memblock.reserved.regions);
-
-   return PAGE_ALIGN(sizeof(struct memblock_region) *
- memblock.reserved.max);
-}
-
-phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info(
-   phys_addr_t *addr)
+/**
+ * Discard memory and reserved arrays if they were allocated
+ */
+void __init_memblock memblock_discard(void)
 {
-   if (memblock.memory.regions == memblock_memory_init_regions)
-   return 0;
+   phys_addr_t addr, size;
 
-   *addr = __pa(memblock.memory.regions);
+   if (memblock.reserved.regions != memblock_reserved_init_regions) {
+   addr = __pa(memblock.reserved.regions);
+   size = PAGE_ALIGN(sizeof(struct memblock_region) *
+ memblock.reserved.max);
+   __memblock_free_late(addr, size);
+   }
 
-   return PAGE_ALIGN(sizeof(struct memblock_region) *
- memblock.memory.max);
+   if (memblock.memory.regions != memblock_memory_init_regions) {
+   addr = __pa(memblock.memory.regions);
+   size = PAGE_ALIGN(sizeof(struct memblock_region) *
+ memblock.memory.max);
+   __memblock_free_late(addr, size);
+   }
 }
-
 #endi

[v4 14/15] mm: optimize early system hash allocations

2017-08-02 Thread Pavel Tatashin
Clients can call alloc_large_system_hash() with the HASH_ZERO flag to specify
that the memory allocated for the system hash needs to be zeroed; otherwise
the memory does not need to be zeroed, and the client will initialize it.

If the memory does not need to be zeroed, call the new
memblock_virt_alloc_raw() interface, and thus improve boot performance.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 mm/page_alloc.c | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index debea7c0febb..623e2f7634e7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7350,18 +7350,17 @@ void *__init alloc_large_system_hash(const char 
*tablename,
 
log2qty = ilog2(numentries);
 
-   /*
-* memblock allocator returns zeroed memory already, so HASH_ZERO is
-* currently not used when HASH_EARLY is specified.
-*/
gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
do {
size = bucketsize << log2qty;
-   if (flags & HASH_EARLY)
-   table = memblock_virt_alloc_nopanic(size, 0);
-   else if (hashdist)
+   if (flags & HASH_EARLY) {
+   if (flags & HASH_ZERO)
+   table = memblock_virt_alloc_nopanic(size, 0);
+   else
+   table = memblock_virt_alloc_raw(size, 0);
+   } else if (hashdist) {
table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
-   else {
+   } else {
/*
 * If bucketsize is not a power-of-two, we may free
 * some pages at the end of hash table which
-- 
2.13.3



[v4 13/15] mm: stop zeroing memory during allocation in vmemmap

2017-08-02 Thread Pavel Tatashin
Replace the allocators in sparse-vmemmap with the non-zeroing version. The
memory is then zeroed in parallel, as part of struct page initialization,
which is where the performance improvement comes from.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 mm/sparse-vmemmap.c | 6 +++---
 mm/sparse.c | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index d40c721ab19f..3b646b5ce1b6 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -41,7 +41,7 @@ static void * __ref __earlyonly_bootmem_alloc(int node,
unsigned long align,
unsigned long goal)
 {
-   return memblock_virt_alloc_try_nid(size, align, goal,
+   return memblock_virt_alloc_try_nid_raw(size, align, goal,
BOOTMEM_ALLOC_ACCESSIBLE, node);
 }
 
@@ -56,11 +56,11 @@ void * __meminit vmemmap_alloc_block(unsigned long size, 
int node)
 
if (node_state(node, N_HIGH_MEMORY))
page = alloc_pages_node(
-   node, GFP_KERNEL | __GFP_ZERO | 
__GFP_RETRY_MAYFAIL,
+   node, GFP_KERNEL | __GFP_RETRY_MAYFAIL,
get_order(size));
else
page = alloc_pages(
-   GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL,
+   GFP_KERNEL | __GFP_RETRY_MAYFAIL,
get_order(size));
if (page)
return page_address(page);
diff --git a/mm/sparse.c b/mm/sparse.c
index 7b4be3fd5cac..0e315766ad11 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -441,9 +441,9 @@ void __init sparse_mem_maps_populate_node(struct page 
**map_map,
}
 
size = PAGE_ALIGN(size);
-   map = memblock_virt_alloc_try_nid(size * map_count,
- PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
- BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
+   map = memblock_virt_alloc_try_nid_raw(size * map_count,
+ PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
if (map) {
for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
if (!present_section_nr(pnum))
-- 
2.13.3



[v4 08/15] mm: zero struct pages during initialization

2017-08-02 Thread Pavel Tatashin
Add struct page zeroing as a part of initialization of other fields in
__init_single_page().
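
For reference, an architecture that finds memset() expensive for an object
this small can override the generic macro; a hypothetical override, assuming
a 64-byte struct page, might look like the sketch below (patch 09 of this
series adds the real sparc64 version):

/* Hypothetical arch override; assumes sizeof(struct page) == 64 */
#define mm_zero_struct_page(pp) do {                            \
        unsigned long *_p = (unsigned long *)(pp);              \
                                                                \
        BUILD_BUG_ON(sizeof(struct page) != 64);                \
        _p[0] = 0; _p[1] = 0; _p[2] = 0; _p[3] = 0;             \
        _p[4] = 0; _p[5] = 0; _p[6] = 0; _p[7] = 0;             \
} while (0)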

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 include/linux/mm.h | 9 +
 mm/page_alloc.c| 1 +
 2 files changed, 10 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 46b9ac5e8569..183ac5e733db 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -94,6 +94,15 @@ extern int mmap_rnd_compat_bits __read_mostly;
 #endif
 
 /*
+ * On some architectures it is expensive to call memset() for small sizes.
+ * Those architectures should provide their own implementation of "struct page"
+ * zeroing by defining this macro in <asm/pgtable.h>.
+ */
+#ifndef mm_zero_struct_page
+#define mm_zero_struct_page(pp)  ((void)memset((pp), 0, sizeof(struct page)))
+#endif
+
+/*
  * Default maximum number of active map areas, this limits the number of vmas
  * per mm struct. Users can overwrite this number by sysctl but there is a
  * problem.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 99b9e2e06319..debea7c0febb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1168,6 +1168,7 @@ static void free_one_page(struct zone *zone,
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
unsigned long zone, int nid)
 {
+   mm_zero_struct_page(page);
set_page_links(page, zone, nid, pfn);
init_page_count(page);
page_mapcount_reset(page);
-- 
2.13.3



[v4 12/15] mm: explicitly zero pagetable memory

2017-08-02 Thread Pavel Tatashin
Soon vmemmap_alloc_block() will no longer zero the block, so zero memory
at its call sites for everything except struct pages.  Struct page memory
is zero'd by struct page initialization.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 mm/sparse-vmemmap.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index c50b1a14d55e..d40c721ab19f 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -191,6 +191,7 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned 
long addr, int node)
void *p = vmemmap_alloc_block(PAGE_SIZE, node);
if (!p)
return NULL;
+   memset(p, 0, PAGE_SIZE);
pmd_populate_kernel(&init_mm, pmd, p);
}
return pmd;
@@ -203,6 +204,7 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned 
long addr, int node)
void *p = vmemmap_alloc_block(PAGE_SIZE, node);
if (!p)
return NULL;
+   memset(p, 0, PAGE_SIZE);
pud_populate(&init_mm, pud, p);
}
return pud;
@@ -215,6 +217,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned 
long addr, int node)
void *p = vmemmap_alloc_block(PAGE_SIZE, node);
if (!p)
return NULL;
+   memset(p, 0, PAGE_SIZE);
p4d_populate(&init_mm, p4d, p);
}
return p4d;
@@ -227,6 +230,7 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, 
int node)
void *p = vmemmap_alloc_block(PAGE_SIZE, node);
if (!p)
return NULL;
+   memset(p, 0, PAGE_SIZE);
pgd_populate(&init_mm, pgd, p);
}
return pgd;
-- 
2.13.3



[v5 08/15] mm: zero struct pages during initialization

2017-08-03 Thread Pavel Tatashin
Add struct page zeroing as a part of initialization of other fields in
__init_single_page().

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 include/linux/mm.h | 9 +
 mm/page_alloc.c| 1 +
 2 files changed, 10 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 46b9ac5e8569..183ac5e733db 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -94,6 +94,15 @@ extern int mmap_rnd_compat_bits __read_mostly;
 #endif
 
 /*
+ * On some architectures it is expensive to call memset() for small sizes.
+ * Those architectures should provide their own implementation of "struct page"
+ * zeroing by defining this macro in <asm/pgtable.h>.
+ */
+#ifndef mm_zero_struct_page
+#define mm_zero_struct_page(pp)  ((void)memset((pp), 0, sizeof(struct page)))
+#endif
+
+/*
  * Default maximum number of active map areas, this limits the number of vmas
  * per mm struct. Users can overwrite this number by sysctl but there is a
  * problem.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 05110ae50aca..cbcdd53ad9f8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1168,6 +1168,7 @@ static void free_one_page(struct zone *zone,
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
unsigned long zone, int nid)
 {
+   mm_zero_struct_page(page);
set_page_links(page, zone, nid, pfn);
init_page_count(page);
page_mapcount_reset(page);
-- 
2.13.4



[v5 04/15] mm: discard memblock data later

2017-08-03 Thread Pavel Tatashin
There is an existing use-after-free bug when deferred struct pages are
enabled:

The memblock_add() allocates memory for the memory array if more than
128 entries are needed.  See comment in e820__memblock_setup():

  * The bootstrap memblock region count maximum is 128 entries
  * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries
  * than that - so allow memblock resizing.

This memblock memory is freed here:
free_low_memory_core_early()

We access the freed memblock.memory later in boot when deferred pages are
initialized in this path:

deferred_init_memmap()
for_each_mem_pfn_range()
  __next_mem_pfn_range()
type = &memblock.memory

One possible explanation for why this use-after-free hasn't been hit
before is that the limit of INIT_MEMBLOCK_REGIONS has never been exceeded
at least on systems where deferred struct pages were enabled.

Another reason why we want this problem fixed in this patch series is that,
in the next patch, we will need to access memblock.reserved from
deferred_init_memmap().
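
The mm/nobootmem.c and mm/page_alloc.c hunks are cut off in this archive;
roughly, free_low_memory_core_early() stops freeing the region arrays itself,
and they are discarded only after deferred struct page initialization has
finished, along the lines of:

void __init page_alloc_init_late(void)
{
        /* ... deferred struct page initialization completes first ... */

#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
        /* Discard memblock private memory */
        memblock_discard();
#endif
}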

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 include/linux/memblock.h |  6 --
 mm/memblock.c| 38 +-
 mm/nobootmem.c   | 16 
 mm/page_alloc.c  |  4 
 4 files changed, 25 insertions(+), 39 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 77d427974f57..bae11c7e7bf3 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -61,6 +61,7 @@ extern int memblock_debug;
 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
 #define __init_memblock __meminit
 #define __initdata_memblock __meminitdata
+void memblock_discard(void);
 #else
 #define __init_memblock
 #define __initdata_memblock
@@ -74,8 +75,6 @@ phys_addr_t memblock_find_in_range_node(phys_addr_t size, 
phys_addr_t align,
int nid, ulong flags);
 phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
   phys_addr_t size, phys_addr_t align);
-phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr);
-phys_addr_t get_allocated_memblock_memory_regions_info(phys_addr_t *addr);
 void memblock_allow_resize(void);
 int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
 int memblock_add(phys_addr_t base, phys_addr_t size);
@@ -110,6 +109,9 @@ void __next_mem_range_rev(u64 *idx, int nid, ulong flags,
 void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start,
phys_addr_t *out_end);
 
+void __memblock_free_early(phys_addr_t base, phys_addr_t size);
+void __memblock_free_late(phys_addr_t base, phys_addr_t size);
+
 /**
  * for_each_mem_range - iterate through memblock areas from type_a and not
  * included in type_b. Or just type_a if type_b is NULL.
diff --git a/mm/memblock.c b/mm/memblock.c
index 2cb25fe4452c..3a2707914064 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -285,31 +285,27 @@ static void __init_memblock memblock_remove_region(struct 
memblock_type *type, u
 }
 
 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
-
-phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
-   phys_addr_t *addr)
-{
-   if (memblock.reserved.regions == memblock_reserved_init_regions)
-   return 0;
-
-   *addr = __pa(memblock.reserved.regions);
-
-   return PAGE_ALIGN(sizeof(struct memblock_region) *
- memblock.reserved.max);
-}
-
-phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info(
-   phys_addr_t *addr)
+/**
+ * Discard memory and reserved arrays if they were allocated
+ */
+void __init_memblock memblock_discard(void)
 {
-   if (memblock.memory.regions == memblock_memory_init_regions)
-   return 0;
+   phys_addr_t addr, size;
 
-   *addr = __pa(memblock.memory.regions);
+   if (memblock.reserved.regions != memblock_reserved_init_regions) {
+   addr = __pa(memblock.reserved.regions);
+   size = PAGE_ALIGN(sizeof(struct memblock_region) *
+ memblock.reserved.max);
+   __memblock_free_late(addr, size);
+   }
 
-   return PAGE_ALIGN(sizeof(struct memblock_region) *
- memblock.memory.max);
+   if (memblock.memory.regions != memblock_memory_init_regions) {
+   addr = __pa(memblock.memory.regions);
+   size = PAGE_ALIGN(sizeof(struct memblock_region) *
+ memblock.memory.max);
+   __memblock_free_late(addr, size);
+   }
 }
-
 #endif
 
 /**
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 36454d0f96ee..3637

[v5 07/15] mm: defining memblock_virt_alloc_try_nid_raw

2017-08-03 Thread Pavel Tatashin
A new variant of memblock_virt_alloc_* allocations:
memblock_virt_alloc_try_nid_raw()
- Does not zero the allocated memory
- Does not panic if the request cannot be satisfied (see the usage sketch below)
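
A minimal usage sketch (illustrative caller, not part of the patch): the
caller must tolerate a NULL return and must initialize every byte it later
reads.

static void __init example_raw_alloc(int nid, phys_addr_t size)
{
        void *buf;

        buf = memblock_virt_alloc_try_nid_raw(size, SMP_CACHE_BYTES,
                                              __pa(MAX_DMA_ADDRESS),
                                              BOOTMEM_ALLOC_ACCESSIBLE, nid);
        if (!buf)               /* the _raw variant does not panic */
                return;

        /* The memory is NOT zeroed: initialize it before use. */
        memset(buf, 0, size);
}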

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 include/linux/bootmem.h | 27 +
 mm/memblock.c   | 53 ++---
 2 files changed, 73 insertions(+), 7 deletions(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index e223d91b6439..ea30b3987282 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -160,6 +160,9 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
 #define BOOTMEM_ALLOC_ANYWHERE (~(phys_addr_t)0)
 
 /* FIXME: Move to memblock.h at a point where we remove nobootmem.c */
+void *memblock_virt_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr,
+ phys_addr_t max_addr, int nid);
 void *memblock_virt_alloc_try_nid_nopanic(phys_addr_t size,
phys_addr_t align, phys_addr_t min_addr,
phys_addr_t max_addr, int nid);
@@ -176,6 +179,14 @@ static inline void * __init memblock_virt_alloc(
NUMA_NO_NODE);
 }
 
+static inline void * __init memblock_virt_alloc_raw(
+   phys_addr_t size,  phys_addr_t align)
+{
+   return memblock_virt_alloc_try_nid_raw(size, align, BOOTMEM_LOW_LIMIT,
+   BOOTMEM_ALLOC_ACCESSIBLE,
+   NUMA_NO_NODE);
+}
+
 static inline void * __init memblock_virt_alloc_nopanic(
phys_addr_t size, phys_addr_t align)
 {
@@ -257,6 +268,14 @@ static inline void * __init memblock_virt_alloc(
return __alloc_bootmem(size, align, BOOTMEM_LOW_LIMIT);
 }
 
+static inline void * __init memblock_virt_alloc_raw(
+   phys_addr_t size,  phys_addr_t align)
+{
+   if (!align)
+   align = SMP_CACHE_BYTES;
+   return __alloc_bootmem_nopanic(size, align, BOOTMEM_LOW_LIMIT);
+}
+
 static inline void * __init memblock_virt_alloc_nopanic(
phys_addr_t size, phys_addr_t align)
 {
@@ -309,6 +328,14 @@ static inline void * __init 
memblock_virt_alloc_try_nid(phys_addr_t size,
  min_addr);
 }
 
+static inline void * __init memblock_virt_alloc_try_nid_raw(
+   phys_addr_t size, phys_addr_t align,
+   phys_addr_t min_addr, phys_addr_t max_addr, int nid)
+{
+   return ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, align,
+   min_addr, max_addr);
+}
+
 static inline void * __init memblock_virt_alloc_try_nid_nopanic(
phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr, phys_addr_t max_addr, int nid)
diff --git a/mm/memblock.c b/mm/memblock.c
index e6df054e3180..bdf31f207fa4 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1327,7 +1327,6 @@ static void * __init memblock_virt_alloc_internal(
return NULL;
 done:
ptr = phys_to_virt(alloc);
-   memset(ptr, 0, size);
 
/*
 * The min_count is set to 0 so that bootmem allocated blocks
@@ -1341,6 +1340,38 @@ static void * __init memblock_virt_alloc_internal(
 }
 
 /**
+ * memblock_virt_alloc_try_nid_raw - allocate boot memory block without zeroing
+ * memory and without panicking
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region from where the allocation
+ *   is preferred (phys address)
+ * @max_addr: the upper bound of the memory region from where the allocation
+ *   is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
+ *   allocate only from memory limited by memblock.current_limit value
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Public function, provides additional debug information (including caller
+ * info), if enabled. Does not zero allocated memory, does not panic if request
+ * cannot be satisfied.
+ *
+ * RETURNS:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+void * __init memblock_virt_alloc_try_nid_raw(
+   phys_addr_t size, phys_addr_t align,
+   phys_addr_t min_addr, phys_addr_t max_addr,
+   int nid)
+{
+   memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx 
max_addr=0x%llx %pF\n",
+__func__, (u64)size, (u64)align, nid, (u64)min_addr,
+  

[v5 09/15] sparc64: optimized struct page zeroing

2017-08-03 Thread Pavel Tatashin
Add an optimized mm_zero_struct_page(), so struct pages are zeroed without
calling memset(). We do eight regular stores, thus avoiding the cost of a
membar.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 arch/sparc/include/asm/pgtable_64.h | 32 
 1 file changed, 32 insertions(+)

diff --git a/arch/sparc/include/asm/pgtable_64.h 
b/arch/sparc/include/asm/pgtable_64.h
index 6fbd931f0570..be47537e84c5 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -230,6 +230,38 @@ extern unsigned long _PAGE_ALL_SZ_BITS;
 extern struct page *mem_map_zero;
 #define ZERO_PAGE(vaddr)   (mem_map_zero)
 
+/* This macro must be updated when the size of struct page grows above 80
+ * or reduces below 64.
+ * The idea is that the compiler optimizes out the switch() statement, and only
+ * leaves clrx instructions or a memset() call.
+ */
+#define mm_zero_struct_page(pp) do {				\
+   unsigned long *_pp = (void *)(pp);  \
+   \
+   /* Check that struct page is 8-byte aligned */  \
+   BUILD_BUG_ON(sizeof(struct page) & 7);  \
+   \
+   switch (sizeof(struct page)) {  \
+   case 80:\
+   _pp[9] = 0; /* fallthrough */   \
+   case 72:\
+   _pp[8] = 0; /* fallthrough */   \
+   case 64:\
+   _pp[7] = 0; \
+   _pp[6] = 0; \
+   _pp[5] = 0; \
+   _pp[4] = 0; \
+   _pp[3] = 0; \
+   _pp[2] = 0; \
+   _pp[1] = 0; \
+   _pp[0] = 0; \
+   break;  /* no fallthrough */\
+   default:\
+   pr_warn_once("suboptimal mm_zero_struct_page"); \
+   memset(_pp, 0, sizeof(struct page));\
+   }   \
+} while (0)
+
 /* PFNs are real physical page numbers.  However, mem_map only begins to record
  * per-page information starting at pfn_base.  This is to handle systems where
  * the first physical page in the machine is at some huge physical address,
-- 
2.13.4



[v5 11/15] arm64/kasan: explicitly zero kasan shadow memory

2017-08-03 Thread Pavel Tatashin
To optimize the performance of struct page initialization,
vmemmap_populate() will no longer zero memory.

We must explicitly zero the memory that is allocated by vmemmap_populate()
for kasan, as this memory does not go through struct page initialization
path.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 arch/arm64/mm/kasan_init.c | 32 
 1 file changed, 32 insertions(+)

diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index 81f03959a4ab..a57104bc54b8 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -135,6 +135,31 @@ static void __init clear_pgds(unsigned long start,
set_pgd(pgd_offset_k(start), __pgd(0));
 }
 
+/*
+ * Memory that was allocated by vmemmap_populate is not zeroed, so we must
+ * zero it here explicitly.
+ */
+static void
+zero_vemmap_populated_memory(void)
+{
+   struct memblock_region *reg;
+   u64 start, end;
+
+   for_each_memblock(memory, reg) {
+   start = __phys_to_virt(reg->base);
+   end = __phys_to_virt(reg->base + reg->size);
+
+   if (start >= end)
+   break;
+
+   memset((void *)start, 0, end - start);
+   }
+
+   start = (u64)kasan_mem_to_shadow(_stext);
+   end = (u64)kasan_mem_to_shadow(_end);
+   memset((void *)start, 0, end - start);
+}
+
 void __init kasan_init(void)
 {
u64 kimg_shadow_start, kimg_shadow_end;
@@ -205,6 +230,13 @@ void __init kasan_init(void)
pfn_pte(sym_to_pfn(kasan_zero_page), PAGE_KERNEL_RO));
 
memset(kasan_zero_page, 0, PAGE_SIZE);
+
+   /*
+* vmemmap_populate does not zero the memory, so we need to zero it
+* explicitly
+*/
+   zero_vemmap_populated_memory();
+
cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
 
/* At this point kasan is fully initialized. Enable error messages */
-- 
2.13.4



[v5 14/15] mm: optimize early system hash allocations

2017-08-03 Thread Pavel Tatashin
Clients can call alloc_large_system_hash() with the HASH_ZERO flag to specify
that the memory allocated for the system hash must be zeroed; otherwise the
memory does not need to be zeroed, because the client will initialize it.

If the memory does not need to be zeroed, call the new
memblock_virt_alloc_raw() interface, and thus improve boot performance.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 mm/page_alloc.c | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cbcdd53ad9f8..53d16590dc01 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7352,18 +7352,17 @@ void *__init alloc_large_system_hash(const char 
*tablename,
 
log2qty = ilog2(numentries);
 
-   /*
-* memblock allocator returns zeroed memory already, so HASH_ZERO is
-* currently not used when HASH_EARLY is specified.
-*/
gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
do {
size = bucketsize << log2qty;
-   if (flags & HASH_EARLY)
-   table = memblock_virt_alloc_nopanic(size, 0);
-   else if (hashdist)
+   if (flags & HASH_EARLY) {
+   if (flags & HASH_ZERO)
+   table = memblock_virt_alloc_nopanic(size, 0);
+   else
+   table = memblock_virt_alloc_raw(size, 0);
+   } else if (hashdist) {
table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
-   else {
+   } else {
/*
 * If bucketsize is not a power-of-two, we may free
 * some pages at the end of hash table which
-- 
2.13.4



[v5 13/15] mm: stop zeroing memory during allocation in vmemmap

2017-08-03 Thread Pavel Tatashin
Replace the allocators in sparse-vmemmap with the non-zeroing version. The
memory is then zeroed in parallel, as part of struct page initialization,
which is where the performance improvement comes from.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 mm/sparse-vmemmap.c | 6 +++---
 mm/sparse.c | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index d40c721ab19f..3b646b5ce1b6 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -41,7 +41,7 @@ static void * __ref __earlyonly_bootmem_alloc(int node,
unsigned long align,
unsigned long goal)
 {
-   return memblock_virt_alloc_try_nid(size, align, goal,
+   return memblock_virt_alloc_try_nid_raw(size, align, goal,
BOOTMEM_ALLOC_ACCESSIBLE, node);
 }
 
@@ -56,11 +56,11 @@ void * __meminit vmemmap_alloc_block(unsigned long size, 
int node)
 
if (node_state(node, N_HIGH_MEMORY))
page = alloc_pages_node(
-   node, GFP_KERNEL | __GFP_ZERO | 
__GFP_RETRY_MAYFAIL,
+   node, GFP_KERNEL | __GFP_RETRY_MAYFAIL,
get_order(size));
else
page = alloc_pages(
-   GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL,
+   GFP_KERNEL | __GFP_RETRY_MAYFAIL,
get_order(size));
if (page)
return page_address(page);
diff --git a/mm/sparse.c b/mm/sparse.c
index 7b4be3fd5cac..0e315766ad11 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -441,9 +441,9 @@ void __init sparse_mem_maps_populate_node(struct page 
**map_map,
}
 
size = PAGE_ALIGN(size);
-   map = memblock_virt_alloc_try_nid(size * map_count,
- PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
- BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
+   map = memblock_virt_alloc_try_nid_raw(size * map_count,
+ PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
if (map) {
for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
if (!present_section_nr(pnum))
-- 
2.13.4



[v5 00/15] complete deferred page initialization

2017-08-03 Thread Pavel Tatashin
Changelog:
v5 - v4
- Fixed build issues reported by kbuild on various configurations

v4 - v3
- Rewrote code to zero struct pages in __init_single_page() as
  suggested by Michal Hocko
- Added code to handle issues related to accessing struct page
  memory before they are initialized.

v3 - v2
- Addressed David Miller's comments about one change per patch:
* Split changes to platforms into 4 patches
* Made "do not zero vmemmap_buf" as a separate patch

v2 - v1
- Per request, added s390 to deferred "struct page" zeroing
- Collected performance data on x86 which proves the importance of
  keeping memset() as a prefetch (see below).

SMP machines can benefit from the DEFERRED_STRUCT_PAGE_INIT config option,
which defers initializing struct pages until all cpus have been started so
it can be done in parallel.

However, this feature is sub-optimal, because the deferred page
initialization code expects that the struct pages have already been zeroed,
and the zeroing is done early in boot with a single thread only.  Also, we
access that memory and set flags before struct pages are initialized. All
of this is fixed in this patchset.

In this work we do the following:
- Never read a struct page until it has been initialized
- Never set any fields in struct pages before they are initialized
- Zero struct page at the beginning of struct page initialization

Performance improvements on x86 machine with 8 nodes:
Intel(R) Xeon(R) CPU E7-8895 v3 @ 2.60GHz

Single threaded struct page init: 7.6s/T improvement
Deferred struct page init: 10.2s/T improvement

Pavel Tatashin (15):
  x86/mm: reserve only existing low pages
  x86/mm: setting fields in deferred pages
  sparc64/mm: setting fields in deferred pages
  mm: discard memblock data later
  mm: don't access uninitialized struct pages
  sparc64: simplify vmemmap_populate
  mm: defining memblock_virt_alloc_try_nid_raw
  mm: zero struct pages during initialization
  sparc64: optimized struct page zeroing
  x86/kasan: explicitly zero kasan shadow memory
  arm64/kasan: explicitly zero kasan shadow memory
  mm: explicitly zero pagetable memory
  mm: stop zeroing memory during allocation in vmemmap
  mm: optimize early system hash allocations
  mm: debug for raw allocator

 arch/arm64/mm/kasan_init.c  |  32 
 arch/sparc/include/asm/pgtable_64.h |  32 
 arch/sparc/mm/init_64.c |  31 +++-
 arch/x86/kernel/setup.c |   5 +-
 arch/x86/mm/init_64.c   |   9 ++-
 arch/x86/mm/kasan_init_64.c |  29 +++
 include/linux/bootmem.h |  27 +++
 include/linux/memblock.h|   9 ++-
 include/linux/mm.h  |   9 +++
 mm/memblock.c   | 152 
 mm/nobootmem.c  |  16 
 mm/page_alloc.c |  31 +---
 mm/sparse-vmemmap.c |  10 ++-
 mm/sparse.c |   6 +-
 14 files changed, 310 insertions(+), 88 deletions(-)

-- 
2.13.4



[v5 03/15] sparc64/mm: setting fields in deferred pages

2017-08-03 Thread Pavel Tatashin
Without the deferred struct page feature (CONFIG_DEFERRED_STRUCT_PAGE_INIT),
flags and other fields in struct pages are never changed prior to first
initializing the struct pages by going through __init_single_page().

With deferred struct page feature enabled there is a case where we set some
fields prior to initializing:

 mem_init() {
 register_page_bootmem_info();
 free_all_bootmem();
 ...
 }

When register_page_bootmem_info() is called, only non-deferred struct pages
are initialized. But this function goes through some reserved pages which
might be part of the deferred range, and thus are not yet initialized.

mem_init
register_page_bootmem_info
 register_page_bootmem_info_node
  get_page_bootmem
   .. setting fields here ..
   such as: page->freelist = (void *)type;
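
For context, get_page_bootmem() at the time of this series looks roughly as
follows, which is why writing these fields into a not-yet-initialized struct
page is only harmless while the backing memory happens to be zeroed:

/* mm/memory_hotplug.c, roughly as of this series */
void get_page_bootmem(unsigned long info, struct page *page,
                      unsigned long type)
{
        page->freelist = (void *)type;
        SetPagePrivate(page);
        set_page_private(page, info);
        page_ref_inc(page);
}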

We end up with a similar issue as in the previous patch, where currently we
do not observe a problem because the memory is zeroed. But if the flag
asserts are changed, we can start hitting issues.

Also, because in this patch series we will stop zeroing struct page memory
during allocation, we must make sure that struct pages are properly
initialized prior to using them.

The deferred-reserved pages are initialized in free_all_bootmem().
Therefore, the fix is to switch the above calls.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 arch/sparc/mm/init_64.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 3c40ebd50f92..ba957b763c07 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2464,9 +2464,15 @@ void __init mem_init(void)
 {
high_memory = __va(last_valid_pfn << PAGE_SHIFT);
 
-   register_page_bootmem_info();
free_all_bootmem();
 
+   /* Must be done after boot memory is put on freelist, because here we
+* might set fields in deferred struct pages that have not yet been
+* initialized, and free_all_bootmem() initializes all the reserved
+* deferred pages for us.
+*/
+   register_page_bootmem_info();
+
/*
 * Set up the zero page, mark it reserved, so that page count
 * is not manipulated when freeing the page from user ptes.
-- 
2.13.4



[v5 15/15] mm: debug for raw allocator

2017-08-03 Thread Pavel Tatashin
When CONFIG_DEBUG_VM is enabled, this patch sets all the memory that is
returned by memblock_virt_alloc_try_nid_raw() to all ones, to ensure that
no caller expects zeroed memory.
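
As an illustration (not from the patch), the 0xff poisoning turns a silent
dependency on zeroed memory into a failure that shows up at boot:

static void __init example_poison_effect(int nid)
{
        struct page *map;

        map = memblock_virt_alloc_try_nid_raw(PAGE_SIZE, PAGE_SIZE,
                                              __pa(MAX_DMA_ADDRESS),
                                              BOOTMEM_ALLOC_ACCESSIBLE, nid);
        if (!map)
                return;

        /*
         * A caller that skips __init_single_page() used to see flags == 0
         * by luck; with CONFIG_DEBUG_VM the memory is 0xff-filled, so a
         * check such as "if (map->flags)" now misbehaves visibly instead
         * of silently.
         */
}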

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 mm/memblock.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/mm/memblock.c b/mm/memblock.c
index bdf31f207fa4..b6f90e75946c 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1363,12 +1363,19 @@ void * __init memblock_virt_alloc_try_nid_raw(
phys_addr_t min_addr, phys_addr_t max_addr,
int nid)
 {
+   void *ptr;
+
memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx 
max_addr=0x%llx %pF\n",
 __func__, (u64)size, (u64)align, nid, (u64)min_addr,
 (u64)max_addr, (void *)_RET_IP_);
 
-   return memblock_virt_alloc_internal(size, align,
-   min_addr, max_addr, nid);
+   ptr = memblock_virt_alloc_internal(size, align,
+  min_addr, max_addr, nid);
+#ifdef CONFIG_DEBUG_VM
+   if (ptr && size > 0)
+   memset(ptr, 0xff, size);
+#endif
+   return ptr;
 }
 
 /**
-- 
2.13.4



[v5 02/15] x86/mm: setting fields in deferred pages

2017-08-03 Thread Pavel Tatashin
Without the deferred struct page feature (CONFIG_DEFERRED_STRUCT_PAGE_INIT),
flags and other fields in struct pages are never changed prior to first
initializing the struct pages by going through __init_single_page().

With deferred struct page feature enabled there is a case where we set some
fields prior to initializing:

mem_init() {
register_page_bootmem_info();
free_all_bootmem();
...
}

When register_page_bootmem_info() is called, only non-deferred struct pages
are initialized. But this function goes through some reserved pages which
might be part of the deferred range, and thus are not yet initialized.

  mem_init
   register_page_bootmem_info
register_page_bootmem_info_node
 get_page_bootmem
  .. setting fields here ..
  such as: page->freelist = (void *)type;

We end up with a similar issue as in the previous patch, where currently we
do not observe a problem because the memory is zeroed. But if the flag
asserts are changed, we can start hitting issues.

Also, because in this patch series we will stop zeroing struct page memory
during allocation, we must make sure that struct pages are properly
initialized prior to using them.

The deferred-reserved pages are initialized in free_all_bootmem().
Therefore, the fix is to switch the above calls.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 arch/x86/mm/init_64.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 136422d7d539..1e863baec847 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1165,12 +1165,17 @@ void __init mem_init(void)
 
/* clear_bss() already clear the empty_zero_page */
 
-   register_page_bootmem_info();
-
/* this will put all memory onto the freelists */
free_all_bootmem();
after_bootmem = 1;
 
+   /* Must be done after boot memory is put on freelist, because here we
+* might set fields in deferred struct pages that have not yet been
+* initialized, and free_all_bootmem() initializes all the reserved
+* deferred pages for us.
+*/
+   register_page_bootmem_info();
+
/* Register memory areas for /proc/kcore */
kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR,
 PAGE_SIZE, KCORE_OTHER);
-- 
2.13.4



[v5 01/15] x86/mm: reserve only existing low pages

2017-08-03 Thread Pavel Tatashin
Struct pages are initialized by going through __init_single_page(). Since
the existing physical memory in memblock is represented in memblock.memory
list, struct page for every page from this list goes through
__init_single_page().

The second memblock list, memblock.reserved, manages allocated memory: the
memory that won't be available to the kernel allocator. Every page from this
list goes through reserve_bootmem_region(), where certain struct page fields
are set, the assumption being that the struct pages have been initialized
beforehand.

In trim_low_memory_range() we unconditionally reserve memory from PFN 0, but
memblock.memory might start at a later PFN. For example, in QEMU,
e820__memblock_setup() can use PFN 1 as the first PFN in memblock.memory,
so PFN 0 is not on memblock.memory (and hence isn't initialized via
__init_single_page) but is on memblock.reserved (and hence we set fields in
the uninitialized struct page).

Currently, the struct page memory is always zeroed during allocation,
which prevents this problem from being detected. But, if some asserts
provided by CONFIG_DEBUG_VM_PGFLAGS are tightened, this problem may become
visible in existing kernels.

In this patchset we will stop zeroing struct page memory during allocation.
Therefore, this bug must be fixed in order to avoid random assert failures
caused by CONFIG_DEBUG_VM_PGFLAGS triggers.

The fix is to reserve memory from the first existing PFN.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 arch/x86/kernel/setup.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3486d0498800..489cdc141bcb 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -790,7 +790,10 @@ early_param("reservelow", parse_reservelow);
 
 static void __init trim_low_memory_range(void)
 {
-   memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
+   unsigned long min_pfn = find_min_pfn_with_active_regions();
+   phys_addr_t base = min_pfn << PAGE_SHIFT;
+
+   memblock_reserve(base, ALIGN(reserve_low, PAGE_SIZE));
 }

 /*
-- 
2.13.4



[v5 05/15] mm: don't access uninitialized struct pages

2017-08-03 Thread Pavel Tatashin
In deferred_init_memmap() where all deferred struct pages are initialized
we have a check like this:

if (page->flags) {
VM_BUG_ON(page_zone(page) != zone);
goto free_range;
}

This way we are checking if the current deferred page has already been
initialized. It works because the memory for struct pages has been zeroed,
and the only way the flags can be non-zero is if the page already went
through __init_single_page(). But once we change the current behavior and no
longer zero the memory in the memblock allocator, we cannot trust anything
inside a struct page until it is initialized. This patch fixes this.

This patch defines a new accessor memblock_get_reserved_pfn_range()
which returns successive ranges of reserved PFNs.  deferred_init_memmap()
calls it to determine if a PFN and its struct page has already been
initialized.
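
The mm/page_alloc.c part of this patch is cut off in this archive; below is a
sketch of how deferred_init_memmap() is intended to use the accessor (pfn,
end_pfn, zid and nid stand in for the surrounding loop variables and are
assumptions here):

static void __init example_deferred_init_range(unsigned long pfn,
                                               unsigned long end_pfn,
                                               int zid, int nid)
{
        unsigned long resv_start = 0, resv_end = 0;

        for (; pfn < end_pfn; pfn++) {
                if (pfn >= resv_end)
                        memblock_get_reserved_pfn_range(pfn, &resv_start,
                                                        &resv_end);

                /* Reserved pages were initialized earlier in boot: skip them
                 * without reading the (possibly unzeroed) struct page. */
                if (pfn >= resv_start && pfn < resv_end)
                        continue;

                __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
        }
}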

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 include/linux/memblock.h |  3 +++
 mm/memblock.c| 54 ++--
 mm/page_alloc.c  | 11 +-
 3 files changed, 61 insertions(+), 7 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index bae11c7e7bf3..b6a2a610f5e1 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -320,6 +320,9 @@ int memblock_is_map_memory(phys_addr_t addr);
 int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
 bool memblock_is_reserved(phys_addr_t addr);
 bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
+void memblock_get_reserved_pfn_range(unsigned long pfn,
+unsigned long *pfn_start,
+unsigned long *pfn_end);
 
 extern void __memblock_dump_all(void);
 
diff --git a/mm/memblock.c b/mm/memblock.c
index 3a2707914064..e6df054e3180 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1580,7 +1580,13 @@ void __init memblock_mem_limit_remove_map(phys_addr_t 
limit)
memblock_cap_memory_range(0, max_addr);
 }
 
-static int __init_memblock memblock_search(struct memblock_type *type, 
phys_addr_t addr)
+/**
+ * Return index in regions array if addr is within the region. Otherwise
+ * return -1. If -1 is returned and *next_idx is not %NULL, sets it to the
+ * next region index or -1 if there is none.
+ */
+static int __init_memblock memblock_search(struct memblock_type *type,
+  phys_addr_t addr, int *next_idx)
 {
unsigned int left = 0, right = type->cnt;
 
@@ -1595,22 +1601,26 @@ static int __init_memblock memblock_search(struct 
memblock_type *type, phys_addr
else
return mid;
} while (left < right);
+
+   if (next_idx)
+   *next_idx = (right == type->cnt) ? -1 : right;
+
return -1;
 }
 
 bool __init memblock_is_reserved(phys_addr_t addr)
 {
-   return memblock_search(, addr) != -1;
+   return memblock_search(, addr, NULL) != -1;
 }
 
 bool __init_memblock memblock_is_memory(phys_addr_t addr)
 {
-   return memblock_search(, addr) != -1;
+   return memblock_search(, addr, NULL) != -1;
 }
 
 int __init_memblock memblock_is_map_memory(phys_addr_t addr)
 {
-   int i = memblock_search(, addr);
+   int i = memblock_search(, addr, NULL);
 
if (i == -1)
return false;
@@ -1622,7 +1632,7 @@ int __init_memblock memblock_search_pfn_nid(unsigned long 
pfn,
 unsigned long *start_pfn, unsigned long *end_pfn)
 {
struct memblock_type *type = &memblock.memory;
-   int mid = memblock_search(type, PFN_PHYS(pfn));
+   int mid = memblock_search(type, PFN_PHYS(pfn), NULL);
 
if (mid == -1)
return -1;
@@ -1646,7 +1656,7 @@ int __init_memblock memblock_search_pfn_nid(unsigned long 
pfn,
  */
 int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t 
size)
 {
-   int idx = memblock_search(, base);
+   int idx = memblock_search(, base, NULL);
phys_addr_t end = base + memblock_cap_size(base, &size);
 
if (idx == -1)
@@ -1656,6 +1666,38 @@ int __init_memblock 
memblock_is_region_memory(phys_addr_t base, phys_addr_t size
 }
 
 /**
+ * memblock_get_reserved_pfn_range - search for the next reserved region
+ *
+ * @pfn: start searching from this pfn.
+ *
+ * RETURNS:
+ * [start_pfn, end_pfn), where start_pfn >= pfn. If none is found
+ * start_pfn, and end_pfn are both set to ULONG_MAX.
+ */
+void __init_memblock memblock_get_reserved_pfn_range(unsigned long pfn,
+unsigned long *start_pfn,
+unsigned long *end_pfn)
+{
+   struct memblock_type *type = &memblock.reserved;
+   int next_idx, idx;
+
+   idx = memblock_s

[v5 12/15] mm: explicitly zero pagetable memory

2017-08-03 Thread Pavel Tatashin
Soon vmemmap_alloc_block() will no longer zero the block, so zero memory
at its call sites for everything except struct pages.  Struct page memory
is zero'd by struct page initialization.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 mm/sparse-vmemmap.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index c50b1a14d55e..d40c721ab19f 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -191,6 +191,7 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned 
long addr, int node)
void *p = vmemmap_alloc_block(PAGE_SIZE, node);
if (!p)
return NULL;
+   memset(p, 0, PAGE_SIZE);
pmd_populate_kernel(&init_mm, pmd, p);
}
return pmd;
@@ -203,6 +204,7 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned 
long addr, int node)
void *p = vmemmap_alloc_block(PAGE_SIZE, node);
if (!p)
return NULL;
+   memset(p, 0, PAGE_SIZE);
pud_populate(&init_mm, pud, p);
}
return pud;
@@ -215,6 +217,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned 
long addr, int node)
void *p = vmemmap_alloc_block(PAGE_SIZE, node);
if (!p)
return NULL;
+   memset(p, 0, PAGE_SIZE);
p4d_populate(&init_mm, p4d, p);
}
return p4d;
@@ -227,6 +230,7 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, 
int node)
void *p = vmemmap_alloc_block(PAGE_SIZE, node);
if (!p)
return NULL;
+   memset(p, 0, PAGE_SIZE);
pgd_populate(&init_mm, pgd, p);
}
return pgd;
-- 
2.13.4



[v5 10/15] x86/kasan: explicitly zero kasan shadow memory

2017-08-03 Thread Pavel Tatashin
To optimize the performance of struct page initialization,
vmemmap_populate() will no longer zero memory.

We must explicitly zero the memory that is allocated by vmemmap_populate()
for kasan, as this memory does not go through struct page initialization
path.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 arch/x86/mm/kasan_init_64.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 02c9d7553409..7d06cf0b0b6e 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -84,6 +84,28 @@ static struct notifier_block kasan_die_notifier = {
 };
 #endif
 
+/*
+ * Memory that was allocated by vmemmap_populate is not zeroed, so we must
+ * zero it here explicitly.
+ */
+static void
+zero_vemmap_populated_memory(void)
+{
+   u64 i, start, end;
+
+   for (i = 0; i < E820_MAX_ENTRIES && pfn_mapped[i].end; i++) {
+   void *kaddr_start = pfn_to_kaddr(pfn_mapped[i].start);
+   void *kaddr_end = pfn_to_kaddr(pfn_mapped[i].end);
+
+   start = (u64)kasan_mem_to_shadow(kaddr_start);
+   end = (u64)kasan_mem_to_shadow(kaddr_end);
+   memset((void *)start, 0, end - start);
+   }
+   start = (u64)kasan_mem_to_shadow(_stext);
+   end = (u64)kasan_mem_to_shadow(_end);
+   memset((void *)start, 0, end - start);
+}
+
 void __init kasan_early_init(void)
 {
int i;
@@ -156,6 +178,13 @@ void __init kasan_init(void)
pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO);
set_pte(&kasan_zero_pte[i], pte);
}
+
+   /*
+* vmemmap_populate does not zero the memory, so we need to zero it
+* explicitly
+*/
+   zero_vemmap_populated_memory();
+
/* Flush TLBs again to be sure that write protection applied. */
__flush_tlb_all();
 
-- 
2.13.4



[v5 06/15] sparc64: simplify vmemmap_populate

2017-08-03 Thread Pavel Tatashin
Remove duplicated code by using the common functions
vmemmap_pud_populate and vmemmap_pgd_populate.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 arch/sparc/mm/init_64.c | 23 ++-
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index ba957b763c07..e18947e9ab6c 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2567,30 +2567,19 @@ int __meminit vmemmap_populate(unsigned long vstart, 
unsigned long vend,
vstart = vstart & PMD_MASK;
vend = ALIGN(vend, PMD_SIZE);
for (; vstart < vend; vstart += PMD_SIZE) {
-   pgd_t *pgd = pgd_offset_k(vstart);
+   pgd_t *pgd = vmemmap_pgd_populate(vstart, node);
unsigned long pte;
pud_t *pud;
pmd_t *pmd;
 
-   if (pgd_none(*pgd)) {
-   pud_t *new = vmemmap_alloc_block(PAGE_SIZE, node);
+   if (!pgd)
+   return -ENOMEM;
 
-   if (!new)
-   return -ENOMEM;
-   pgd_populate(&init_mm, pgd, new);
-   }
-
-   pud = pud_offset(pgd, vstart);
-   if (pud_none(*pud)) {
-   pmd_t *new = vmemmap_alloc_block(PAGE_SIZE, node);
-
-   if (!new)
-   return -ENOMEM;
-   pud_populate(&init_mm, pud, new);
-   }
+   pud = vmemmap_pud_populate(pgd, vstart, node);
+   if (!pud)
+   return -ENOMEM;
 
pmd = pmd_offset(pud, vstart);
-
pte = pmd_val(*pmd);
if (!(pte & _PAGE_VALID)) {
void *block = vmemmap_alloc_block(PMD_SIZE, node);
-- 
2.13.4



[v3 2/2] x86/tsc: use tsc early

2017-08-11 Thread Pavel Tatashin
tsc_early_init():
Use various methods to determine the availability of the TSC feature and its
frequency early in boot, and if that is possible, initialize the TSC and also
call sched_clock_early_init() to be able to get timestamps early in boot.

tsc_early_fini()
Implement the finish part of the early tsc feature: print a message about the
offset, which can be useful to find out how much time was spent in POST and
the boot manager, and also call sched_clock_early_fini() to let sched clock
know that the early clock is finished.

sched_clock_early():
TSC-based implementation of the weak function that is defined in sched clock.

Call tsc_early_init() to initialize early boot time stamps functionality on
the supported x86 platforms, and call tsc_early_fini() to finish this
feature after permanent tsc has been initialized.
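
For reference, the cycles-to-nanoseconds conversion that sched_clock_early()
relies on is the usual fixed-point multiply; a standalone sketch (the values
are illustrative, not from the patch):

static u64 example_cycles_to_ns(u64 cycles, unsigned int tsc_khz)
{
        u32 mul, shift;

        clocks_calc_mult_shift(&mul, &shift, tsc_khz, NSEC_PER_MSEC, 0);
        /* e.g. tsc_khz = 2500000 (2.5 GHz): 1000000 cycles -> ~400000 ns */
        return mul_u64_u32_shr(cycles, mul, shift);
}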

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 arch/x86/include/asm/tsc.h |  4 
 arch/x86/kernel/setup.c| 10 --
 arch/x86/kernel/tsc.c  | 47 ++
 3 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index f5e6f1c417df..6dc9618b24e3 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -50,11 +50,15 @@ extern bool tsc_store_and_check_tsc_adjust(bool bootcpu);
 extern void tsc_verify_tsc_adjust(bool resume);
 extern void check_tsc_sync_source(int cpu);
 extern void check_tsc_sync_target(void);
+void tsc_early_init(unsigned int khz);
+void tsc_early_fini(void);
 #else
 static inline bool tsc_store_and_check_tsc_adjust(bool bootcpu) { return 
false; }
 static inline void tsc_verify_tsc_adjust(bool resume) { }
 static inline void check_tsc_sync_source(int cpu) { }
 static inline void check_tsc_sync_target(void) { }
+static inline void tsc_early_init(unsigned int khz) { }
+static inline void tsc_early_fini(void) { }
 #endif
 
 extern int notsc_setup(char *);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3486d0498800..413434d98a23 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -812,7 +812,11 @@ dump_kernel_offset(struct notifier_block *self, unsigned 
long v, void *p)
return 0;
 }
 
-static void __init simple_udelay_calibration(void)
+/*
+ * Initialize early tsc to show early boot timestamps, and also loops_per_jiffy
+ * for udelay
+ */
+static void __init early_clock_calibration(void)
 {
unsigned int tsc_khz, cpu_khz;
unsigned long lpj;
@@ -827,6 +831,8 @@ static void __init simple_udelay_calibration(void)
if (!tsc_khz)
return;
 
+   tsc_early_init(tsc_khz);
+
lpj = tsc_khz * 1000;
do_div(lpj, HZ);
loops_per_jiffy = lpj;
@@ -1039,7 +1045,7 @@ void __init setup_arch(char **cmdline_p)
 */
init_hypervisor_platform();
 
-   simple_udelay_calibration();
+   early_clock_calibration();
 
x86_init.resources.probe_roms();
 
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 796d96bb0821..bd44c2dd4235 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1263,6 +1263,53 @@ static int __init init_tsc_clocksource(void)
  */
 device_initcall(init_tsc_clocksource);
 
+#ifdef CONFIG_X86_TSC
+
+static struct cyc2ns_data  cyc2ns_early;
+static bool sched_clock_early_enabled;
+
+u64 sched_clock_early(void)
+{
+   u64 ns;
+
+   if (!sched_clock_early_enabled)
+   return 0;
+   ns = mul_u64_u32_shr(rdtsc(), cyc2ns_early.cyc2ns_mul,
+cyc2ns_early.cyc2ns_shift);
+   return ns + cyc2ns_early.cyc2ns_offset;
+}
+
+/*
+ * Initialize clock for early time stamps
+ */
+void __init tsc_early_init(unsigned int khz)
+{
+   sched_clock_early_enabled = true;
+   clocks_calc_mult_shift(&cyc2ns_early.cyc2ns_mul,
+  &cyc2ns_early.cyc2ns_shift,
+  khz, NSEC_PER_MSEC, 0);
+   cyc2ns_early.cyc2ns_offset = -sched_clock_early();
+   sched_clock_early_init();
+}
+
+void __init tsc_early_fini(void)
+{
+   unsigned long long t;
+   unsigned long r;
+
+   /* We did not have early sched clock if multiplier is 0 */
+   if (cyc2ns_early.cyc2ns_mul == 0)
+   return;
+
+   t = -cyc2ns_early.cyc2ns_offset;
+   r = do_div(t, NSEC_PER_SEC);
+
+   sched_clock_early_fini();
+   pr_info("sched clock early is finished, offset [%lld.%09lds]\n", t, r);
+   sched_clock_early_enabled = false;
+}
+#endif /* CONFIG_X86_TSC */
+
 void __init tsc_init(void)
 {
u64 lpj, cyc;
-- 
2.14.0



[v3 1/2] sched/clock: interface to allow timestamps early in boot

2017-08-11 Thread Pavel Tatashin
In Linux printk() can output timestamps next to every line.  This is very
useful for tracking regressions, and finding places that can be optimized.
However, the timestamps are available only later in boot. On smaller
machines it is an insignificant amount of time, but on larger ones it can be
many seconds or even minutes into the boot process.

This patch adds an interface for platforms with unstable sched clock to
show timestamps early in boot. In order to get this functionality a
platform must do:

- Implement u64 sched_clock_early()
  Clock that returns monotonic time

- Call sched_clock_early_init()
  Tells sched clock that the early clock can be used

- Call sched_clock_early_fini()
  Tells sched clock that the early clock is finished, and sched clock
  should hand over the operation to permanent clock.

- Use weak sched_clock_early() interface to determine time from boot in
  arch specific read_boot_clock64() (a minimal sketch of these hooks follows)
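
A minimal hypothetical platform sketch of these hooks; plat_read_cycles() and
the fixed 1 GHz counter are assumptions for illustration, not part of the
patch:

extern u64 plat_read_cycles(void);      /* hypothetical counter read */

u64 sched_clock_early(void)             /* overrides the weak default */
{
        /* assume a fixed 1 GHz counter, so 1 cycle == 1 ns */
        return plat_read_cycles();
}

static void __init plat_early_time_init(void)
{
        sched_clock_early_init();       /* early timestamps usable from here */
}

static void __init plat_late_time_init(void)
{
        /* permanent clocksource is up; hand back to regular sched_clock() */
        sched_clock_early_fini();
}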

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 arch/x86/kernel/time.c  | 22 
 include/linux/sched/clock.h |  4 +++
 kernel/sched/clock.c| 61 -
 3 files changed, 86 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index e0754cdbad37..6ede0da7041a 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -85,6 +86,7 @@ static __init void x86_late_time_init(void)
 {
x86_init.timers.timer_init();
tsc_init();
+   tsc_early_fini();
 }
 
 /*
@@ -95,3 +97,23 @@ void __init time_init(void)
 {
late_time_init = x86_late_time_init;
 }
+
+/*
+ * Called once during boot to initialize boot time.
+ */
+void read_boot_clock64(struct timespec64 *ts)
+{
+   u64 ns_boot = sched_clock_early(); /* nsec from boot */
+   struct timespec64 ts_now;
+   bool valid_clock;
+
+   /* Time from epoch */
+   read_persistent_clock64(&ts_now);
+   valid_clock = ns_boot && timespec64_valid_strict(&ts_now) &&
+   (ts_now.tv_sec || ts_now.tv_nsec);
+
+   if (!valid_clock)
+   *ts = (struct timespec64){0, 0};
+   else
+   *ts = ns_to_timespec64(timespec64_to_ns(&ts_now) - ns_boot);
+}
diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
index a55600ffdf4b..f8291fa28c0c 100644
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -63,6 +63,10 @@ extern void sched_clock_tick_stable(void);
 extern void sched_clock_idle_sleep_event(void);
 extern void sched_clock_idle_wakeup_event(void);
 
+void sched_clock_early_init(void);
+void sched_clock_early_fini(void);
+u64 sched_clock_early(void);
+
 /*
  * As outlined in clock.c, provides a fast, high resolution, nanosecond
  * time source that is monotonic per cpu argument and has bounded drift
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index ca0f8fc945c6..be5b60af4ca9 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -80,9 +80,24 @@ EXPORT_SYMBOL_GPL(sched_clock);
 
 __read_mostly int sched_clock_running;
 
+/*
+ * We start with sched clock early static branch enabled, and global status
+ * disabled.  Early in boot it is decided whether to enable the global
+ * status as well (set sched_clock_early_running to true), and later, when
+ * early clock is no longer needed, the static branch is disabled.
+ */
+static DEFINE_STATIC_KEY_TRUE(__use_sched_clock_early);
+static bool __read_mostly sched_clock_early_running;
+
 void sched_clock_init(void)
 {
-   sched_clock_running = 1;
+   /*
+* We start clock only once early clock is finished, or if early clock
+* was not running.
+*/
+   if (!sched_clock_early_running)
+   sched_clock_running = 1;
+
 }
 
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
@@ -362,6 +377,11 @@ u64 sched_clock_cpu(int cpu)
if (sched_clock_stable())
return sched_clock() + __sched_clock_offset;
 
+   if (static_branch_unlikely(&__use_sched_clock_early)) {
+   if (sched_clock_early_running)
+   return sched_clock_early();
+   }
+
if (unlikely(!sched_clock_running))
return 0ull;
 
@@ -444,6 +464,45 @@ void sched_clock_idle_wakeup_event(void)
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
+u64 __weak sched_clock_early(void)
+{
+   return 0;
+}
+
+/*
+ * Is called when sched_clock_early() is about to be finished, notifies sched
+ * clock that after this call sched_clock_early() can't be used.
+ */
+void __init sched_clock_early_fini(void)
+{
+   struct sched_clock_data *scd = this_scd();
+   u64 now_early, now_sched;
+
+   now_early = sched_clock_early();
+   now_sched = sched_clock();
+
+   __gtod_offset = now_early - scd->tick_gtod;
+   __sched_clock_offset = now_early - now_sched;
+
+   sche

[v3 0/2] Early boot time stamps for x86

2017-08-11 Thread Pavel Tatashin
Resending, fixed the subject.

changelog
-
v2 - v3
- Addressed comment from Thomas Gleixner
- Timestamps are available a little later in boot but still much
  earlier than in mainline. This significantly simplified this
  work.
v1 - v2
In patch "x86/tsc: tsc early":
- added tsc_adjusted_early()
- fixed 32-bit compile error by using do_div()

Adding early boot time stamps support for x86 machines.
SPARC patches for early boot time stamps are already integrated into
mainline linux.

Sample output
-
Before:
https://hastebin.com/jadaqukubu.scala

After:
https://hastebin.com/nubipozacu.scala

As seen above, currently timestamps are available from around the time when
"Security Framework" is initialized. But, 26s already passed until we
reached to this point.

Pavel Tatashin (2):
  sched/clock: interface to allow timestamps early in boot
  x86/tsc: use tsc early

 arch/x86/include/asm/tsc.h  |  4 +++
 arch/x86/kernel/setup.c | 10 ++--
 arch/x86/kernel/time.c  | 22 
 arch/x86/kernel/tsc.c   | 47 ++
 include/linux/sched/clock.h |  4 +++
 kernel/sched/clock.c| 61 -
 6 files changed, 145 insertions(+), 3 deletions(-)

-- 
2.14.0



[PATCH v4 1/2] sched/clock: interface to allow timestamps early in boot

2017-08-14 Thread Pavel Tatashin
In Linux printk() can output timestamps next to every line.  This is very
useful for tracking regressions, and finding places that can be optimized.
However, the timestamps are available only later in boot. On smaller
machines it is an insignificant amount of time, but on larger ones it can be
many seconds or even minutes into the boot process.

This patch adds an interface for platforms with unstable sched clock to
show timestamps early in boot. In order to get this functionality a
platform must do:

- Implement u64 sched_clock_early()
  Clock that returns monotonic time

- Call sched_clock_early_init()
  Tells sched clock that the early clock can be used

- Call sched_clock_early_fini()
  Tells sched clock that the early clock is finished, and sched clock
  should hand over the operation to permanent clock.

- Use the weak sched_clock_early() interface to determine time from boot in
  the arch-specific read_boot_clock64() (see the sketch below)

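To make this contract concrete, here is a minimal, hypothetical sketch of
what a platform might provide. The counter read (read_platform_counter()),
the conversion factor, and the two platform_*_time_init() hooks are
illustrative assumptions only; they are not part of this patch.

#include <linux/types.h>
#include <linux/sched/clock.h>

/* Stand-in for a platform's raw, monotonic early counter. */
static u64 read_platform_counter(void)
{
	return 0;	/* placeholder for a real hardware counter read */
}

/* Hypothetical: nanoseconds per counter tick, chosen by the platform. */
static u64 early_clock_mult_ns = 1;

u64 sched_clock_early(void)
{
	return read_platform_counter() * early_clock_mult_ns;
}

static void __init platform_early_time_init(void)
{
	sched_clock_early_init();	/* early timestamps usable from here */
}

static void __init platform_late_time_init(void)
{
	sched_clock_early_fini();	/* hand over to the permanent clock */
}
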
Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 arch/x86/kernel/time.c  | 23 +
 include/linux/sched/clock.h |  4 +++
 kernel/sched/clock.c| 63 -
 3 files changed, 89 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index e0754cdbad37..be458ea979e7 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include <linux/sched/clock.h>
 
 #include 
 #include 
@@ -95,3 +96,25 @@ void __init time_init(void)
 {
late_time_init = x86_late_time_init;
 }
+
+/*
+ * Called once during boot to initialize boot time.
+ */
+void read_boot_clock64(struct timespec64 *ts)
+{
+   u64 ns_boot = sched_clock_early(); /* nsec from boot */
+   struct timespec64 ts_now;
+   bool valid_clock;
+   u64 ns_now;
+
+   /* Time from epoch */
+   read_persistent_clock64(&ts_now);
+   ns_now = timespec64_to_ns(&ts_now);
+   valid_clock = ns_boot && timespec64_valid_strict(&ts_now) &&
+   (ns_now > ns_boot);
+
+   if (!valid_clock)
+   *ts = (struct timespec64){0, 0};
+   else
+   *ts = ns_to_timespec64(ns_now - ns_boot);
+}
diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
index a55600ffdf4b..f8291fa28c0c 100644
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -63,6 +63,10 @@ extern void sched_clock_tick_stable(void);
 extern void sched_clock_idle_sleep_event(void);
 extern void sched_clock_idle_wakeup_event(void);
 
+void sched_clock_early_init(void);
+void sched_clock_early_fini(void);
+u64 sched_clock_early(void);
+
 /*
  * As outlined in clock.c, provides a fast, high resolution, nanosecond
  * time source that is monotonic per cpu argument and has bounded drift
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index ca0f8fc945c6..72beb1ba3ddb 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -80,9 +80,26 @@ EXPORT_SYMBOL_GPL(sched_clock);
 
 __read_mostly int sched_clock_running;
 
+/*
+ * Because static branches cannot be altered before jump_label_init() is
+ * called, and early time stamps may be initialized before that, we start
+ * with sched clock early static branch enabled, and global status disabled.
+ * Early in boot it is decided whether to enable the global status as well
+ * (set sched_clock_early_running to true), and later, when early clock is
+ * no longer needed, the static branch is disabled to keep hot-path fast.
+ */
+static DEFINE_STATIC_KEY_TRUE(__use_sched_clock_early);
+static bool __read_mostly sched_clock_early_running;
+
 void sched_clock_init(void)
 {
-   sched_clock_running = 1;
+   /*
+* We start clock only once early clock is finished, or if early clock
+* was not running.
+*/
+   if (!sched_clock_early_running)
+   sched_clock_running = 1;
+
 }
 
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
@@ -362,6 +379,11 @@ u64 sched_clock_cpu(int cpu)
if (sched_clock_stable())
return sched_clock() + __sched_clock_offset;
 
+   if (static_branch_unlikely(&__use_sched_clock_early)) {
+   if (sched_clock_early_running)
+   return sched_clock_early();
+   }
+
if (unlikely(!sched_clock_running))
return 0ull;
 
@@ -444,6 +466,45 @@ void sched_clock_idle_wakeup_event(void)
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
+u64 __weak sched_clock_early(void)
+{
+   return 0;
+}
+
+/*
+ * Is called when sched_clock_early() is about to be finished, notifies sched
+ * clock that after this call sched_clock_early() can't be used.
+ */
+void __init sched_clock_early_fini(void)
+{
+   struct sched_clock_data *scd = this_scd();
+   u64 now_early, now_sched;
+
+   now_early = sched_clock_early();
+   now_sched = sched_clock();
+
+   __gtod_offset = now_early - scd->tick_gtod;
+   __sched_cloc

[PATCH v4 2/2] x86/tsc: use tsc early

2017-08-14 Thread Pavel Tatashin
tsc_early_init():
Use various methods to determine the availability of the TSC feature and its
frequency early in boot, and if that is possible, initialize the TSC and also
call sched_clock_early_init() to be able to get timestamps early in boot.

tsc_early_fini():
Implement the finish part of the early tsc feature: print a message about the
offset, which can be useful to find out how much time was spent in POST and
the boot manager, and also call sched_clock_early_fini() to let sched clock
know that the early clock is finished.

sched_clock_early():
TSC-based implementation of the weak function that is defined in sched clock.

Call tsc_early_init() to initialize early boot time stamps functionality on
the supported x86 platforms, and call tsc_early_fini() to finish this
feature after the permanent tsc has been initialized.

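For reference, the cycles-to-nanoseconds math used by sched_clock_early()
below is a plain multiply-and-shift; the helper here only illustrates that
arithmetic (mul_u64_u32_shr() in the patch does the same 64x32-bit multiply
without intermediate overflow) and is not part of the patch.

/*
 * Illustration only: scale TSC cycles to nanoseconds with a (mult, shift)
 * pair such as the one clocks_calc_mult_shift() derives from tsc_khz,
 * i.e. ns ~= cycles * 1000000 / tsc_khz.
 */
static inline unsigned long long cycles_to_ns(unsigned long long cycles,
					      unsigned int mult,
					      unsigned int shift)
{
	/* widen to 128 bits so cycles * mult cannot overflow before shifting */
	return (unsigned long long)(((unsigned __int128)cycles * mult) >> shift);
}
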
Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 arch/x86/include/asm/tsc.h |  4 
 arch/x86/kernel/setup.c| 10 --
 arch/x86/kernel/time.c |  1 +
 arch/x86/kernel/tsc.c  | 47 ++
 4 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index f5e6f1c417df..6dc9618b24e3 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -50,11 +50,15 @@ extern bool tsc_store_and_check_tsc_adjust(bool bootcpu);
 extern void tsc_verify_tsc_adjust(bool resume);
 extern void check_tsc_sync_source(int cpu);
 extern void check_tsc_sync_target(void);
+void tsc_early_init(unsigned int khz);
+void tsc_early_fini(void);
 #else
 static inline bool tsc_store_and_check_tsc_adjust(bool bootcpu) { return 
false; }
 static inline void tsc_verify_tsc_adjust(bool resume) { }
 static inline void check_tsc_sync_source(int cpu) { }
 static inline void check_tsc_sync_target(void) { }
+static inline void tsc_early_init(unsigned int khz) { }
+static inline void tsc_early_fini(void) { }
 #endif
 
 extern int notsc_setup(char *);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3486d0498800..413434d98a23 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -812,7 +812,11 @@ dump_kernel_offset(struct notifier_block *self, unsigned 
long v, void *p)
return 0;
 }
 
-static void __init simple_udelay_calibration(void)
+/*
+ * Initialize early tsc to show early boot timestamps, and also loops_per_jiffy
+ * for udelay
+ */
+static void __init early_clock_calibration(void)
 {
unsigned int tsc_khz, cpu_khz;
unsigned long lpj;
@@ -827,6 +831,8 @@ static void __init simple_udelay_calibration(void)
if (!tsc_khz)
return;
 
+   tsc_early_init(tsc_khz);
+
lpj = tsc_khz * 1000;
do_div(lpj, HZ);
loops_per_jiffy = lpj;
@@ -1039,7 +1045,7 @@ void __init setup_arch(char **cmdline_p)
 */
init_hypervisor_platform();
 
-   simple_udelay_calibration();
+   early_clock_calibration();
 
x86_init.resources.probe_roms();
 
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index be458ea979e7..2c82c7e0f747 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -86,6 +86,7 @@ static __init void x86_late_time_init(void)
 {
x86_init.timers.timer_init();
tsc_init();
+   tsc_early_fini();
 }
 
 /*
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 796d96bb0821..bd44c2dd4235 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1263,6 +1263,53 @@ static int __init init_tsc_clocksource(void)
  */
 device_initcall(init_tsc_clocksource);
 
+#ifdef CONFIG_X86_TSC
+
+static struct cyc2ns_data  cyc2ns_early;
+static bool sched_clock_early_enabled;
+
+u64 sched_clock_early(void)
+{
+   u64 ns;
+
+   if (!sched_clock_early_enabled)
+   return 0;
+   ns = mul_u64_u32_shr(rdtsc(), cyc2ns_early.cyc2ns_mul,
+cyc2ns_early.cyc2ns_shift);
+   return ns + cyc2ns_early.cyc2ns_offset;
+}
+
+/*
+ * Initialize clock for early time stamps
+ */
+void __init tsc_early_init(unsigned int khz)
+{
+   sched_clock_early_enabled = true;
+   clocks_calc_mult_shift(&cyc2ns_early.cyc2ns_mul,
+  &cyc2ns_early.cyc2ns_shift,
+  khz, NSEC_PER_MSEC, 0);
+   cyc2ns_early.cyc2ns_offset = -sched_clock_early();
+   sched_clock_early_init();
+}
+
+void __init tsc_early_fini(void)
+{
+   unsigned long long t;
+   unsigned long r;
+
+   /* We did not have early sched clock if multiplier is 0 */
+   if (cyc2ns_early.cyc2ns_mul == 0)
+   return;
+
+   t = -cyc2ns_early.cyc2ns_offset;
+   r = do_div(t, NSEC_PER_SEC);
+
+   sched_clock_early_fini();
+   pr_info("sched clock early is finished, offset [%lld.%09lds]\n", t, r);
+   sched_clock_early_enabled = false;
+}
+#endif /* CONFIG_X86_TSC */
+
 void __init tsc_init(void)
 {
u64 lpj, cyc;
-- 
2.14.1



[PATCH v4 0/2] Early boot time stamps for x86

2017-08-14 Thread Pavel Tatashin
changelog
-
v3 - v4
- Fixed tsc_early_fini() call to be in the 2nd patch as reported
  by Dou Liyang
- Improved comment before __use_sched_clock_early to explain why
  we need both booleans.
- Simplified valid_clock logic in read_boot_clock64().

v2 - v3
- Addressed comment from Thomas Gleixner
- Timestamps are available a little later in boot but still much
  earlier than in mainline. This significantly simplified this
  work.
v1 - v2
In patch "x86/tsc: tsc early":
- added tsc_adjusted_early()
- fixed 32-bit compile error by using do_div()

Adding early boot time stamps support for x86 machines.
SPARC patches for early boot time stamps are already integrated into
mainline linux.

Sample output
-
Before:
https://hastebin.com/jadaqukubu.scala

After:
https://hastebin.com/nubipozacu.scala

As seen above, timestamps currently become available around the time the
"Security Framework" is initialized, but 26s have already passed by the time
we reach this point.

Pavel Tatashin (2):
  sched/clock: interface to allow timestamps early in boot
  x86/tsc: use tsc early

 arch/x86/include/asm/tsc.h  |  4 +++
 arch/x86/kernel/setup.c | 10 +--
 arch/x86/kernel/time.c  | 24 +
 arch/x86/kernel/tsc.c   | 47 +
 include/linux/sched/clock.h |  4 +++
 kernel/sched/clock.c| 63 -
 6 files changed, 149 insertions(+), 3 deletions(-)

-- 
2.14.1



[v3 2/2] x86/tsc: use tsc early

2017-08-10 Thread Pavel Tatashin
tsc_early_init():
Use various methods to determine the availability of the TSC feature and its
frequency early in boot, and if that is possible, initialize the TSC and also
call sched_clock_early_init() to be able to get timestamps early in boot.

tsc_early_fini():
Implement the finish part of the early tsc feature: print a message about the
offset, which can be useful to find out how much time was spent in POST and
the boot manager, and also call sched_clock_early_fini() to let sched clock
know that the early clock is finished.

sched_clock_early():
TSC-based implementation of the weak function that is defined in sched clock.

Call tsc_early_init() to initialize early boot time stamps functionality on
the supported x86 platforms, and call tsc_early_fini() to finish this
feature after the permanent tsc has been initialized.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 arch/x86/include/asm/tsc.h |  4 
 arch/x86/kernel/setup.c| 10 --
 arch/x86/kernel/tsc.c  | 47 ++
 3 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index f5e6f1c417df..6dc9618b24e3 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -50,11 +50,15 @@ extern bool tsc_store_and_check_tsc_adjust(bool bootcpu);
 extern void tsc_verify_tsc_adjust(bool resume);
 extern void check_tsc_sync_source(int cpu);
 extern void check_tsc_sync_target(void);
+void tsc_early_init(unsigned int khz);
+void tsc_early_fini(void);
 #else
 static inline bool tsc_store_and_check_tsc_adjust(bool bootcpu) { return 
false; }
 static inline void tsc_verify_tsc_adjust(bool resume) { }
 static inline void check_tsc_sync_source(int cpu) { }
 static inline void check_tsc_sync_target(void) { }
+static inline void tsc_early_init(unsigned int khz) { }
+static inline void tsc_early_fini(void) { }
 #endif
 
 extern int notsc_setup(char *);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3486d0498800..413434d98a23 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -812,7 +812,11 @@ dump_kernel_offset(struct notifier_block *self, unsigned 
long v, void *p)
return 0;
 }
 
-static void __init simple_udelay_calibration(void)
+/*
+ * Initialize early tsc to show early boot timestamps, and also loops_per_jiffy
+ * for udelay
+ */
+static void __init early_clock_calibration(void)
 {
unsigned int tsc_khz, cpu_khz;
unsigned long lpj;
@@ -827,6 +831,8 @@ static void __init simple_udelay_calibration(void)
if (!tsc_khz)
return;
 
+   tsc_early_init(tsc_khz);
+
lpj = tsc_khz * 1000;
do_div(lpj, HZ);
loops_per_jiffy = lpj;
@@ -1039,7 +1045,7 @@ void __init setup_arch(char **cmdline_p)
 */
init_hypervisor_platform();
 
-   simple_udelay_calibration();
+   early_clock_calibration();
 
x86_init.resources.probe_roms();
 
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 796d96bb0821..bd44c2dd4235 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1263,6 +1263,53 @@ static int __init init_tsc_clocksource(void)
  */
 device_initcall(init_tsc_clocksource);
 
+#ifdef CONFIG_X86_TSC
+
+static struct cyc2ns_data  cyc2ns_early;
+static bool sched_clock_early_enabled;
+
+u64 sched_clock_early(void)
+{
+   u64 ns;
+
+   if (!sched_clock_early_enabled)
+   return 0;
+   ns = mul_u64_u32_shr(rdtsc(), cyc2ns_early.cyc2ns_mul,
+cyc2ns_early.cyc2ns_shift);
+   return ns + cyc2ns_early.cyc2ns_offset;
+}
+
+/*
+ * Initialize clock for early time stamps
+ */
+void __init tsc_early_init(unsigned int khz)
+{
+   sched_clock_early_enabled = true;
+   clocks_calc_mult_shift(&cyc2ns_early.cyc2ns_mul,
+  &cyc2ns_early.cyc2ns_shift,
+  khz, NSEC_PER_MSEC, 0);
+   cyc2ns_early.cyc2ns_offset = -sched_clock_early();
+   sched_clock_early_init();
+}
+
+void __init tsc_early_fini(void)
+{
+   unsigned long long t;
+   unsigned long r;
+
+   /* We did not have early sched clock if multiplier is 0 */
+   if (cyc2ns_early.cyc2ns_mul == 0)
+   return;
+
+   t = -cyc2ns_early.cyc2ns_offset;
+   r = do_div(t, NSEC_PER_SEC);
+
+   sched_clock_early_fini();
+   pr_info("sched clock early is finished, offset [%lld.%09lds]\n", t, r);
+   sched_clock_early_enabled = false;
+}
+#endif /* CONFIG_X86_TSC */
+
 void __init tsc_init(void)
 {
u64 lpj, cyc;
-- 
2.14.0



[v3 0/2] *** SUBJECT HERE ***

2017-08-10 Thread Pavel Tatashin
changelog
-
v2 - v3
- Addressed comment from Thomas Gleixner
- Timestamps are available a little later in boot but still much
  earlier than in mainline. This significantly simplified this
  work.
v1 - v2
In patch "x86/tsc: tsc early":
- added tsc_adjusted_early()
- fixed 32-bit compile error by using do_div()

Adding early boot time stamps support for x86 machines.
SPARC patches for early boot time stamps are already integrated into
mainline linux.

Sample output
-
Before:
https://hastebin.com/jadaqukubu.scala

After:
https://hastebin.com/nubipozacu.scala

As seen above, timestamps currently become available around the time the
"Security Framework" is initialized, but 26s have already passed by the time
we reach this point.

Pavel Tatashin (2):
  sched/clock: interface to allow timestamps early in boot
  x86/tsc: use tsc early

 arch/x86/include/asm/tsc.h  |  4 +++
 arch/x86/kernel/setup.c | 10 ++--
 arch/x86/kernel/time.c  | 22 
 arch/x86/kernel/tsc.c   | 47 ++
 include/linux/sched/clock.h |  4 +++
 kernel/sched/clock.c| 61 -
 6 files changed, 145 insertions(+), 3 deletions(-)

-- 
2.14.0



[v3 1/2] sched/clock: interface to allow timestamps early in boot

2017-08-10 Thread Pavel Tatashin
In Linux, printk() can output timestamps next to every line.  This is very
useful for tracking regressions and finding places that can be optimized.
However, the timestamps become available only later in boot. On smaller
machines this is an insignificant amount of time, but on larger ones it can
be many seconds or even minutes into the boot process.

This patch adds an interface for platforms with unstable sched clock to
show timestamps early in boot. In order to get this functionality a
platform must do:

- Implement u64 sched_clock_early()
  Clock that returns monotonic time

- Call sched_clock_early_init()
  Tells sched clock that the early clock can be used

- Call sched_clock_early_fini()
  Tells sched clock that the early clock is finished, and sched clock
  should hand over the operation to permanent clock.

- Use the weak sched_clock_early() interface to determine time from boot in
  the arch-specific read_boot_clock64()

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 arch/x86/kernel/time.c  | 22 
 include/linux/sched/clock.h |  4 +++
 kernel/sched/clock.c| 61 -
 3 files changed, 86 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index e0754cdbad37..6ede0da7041a 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include <linux/sched/clock.h>
 
 #include 
 #include 
@@ -85,6 +86,7 @@ static __init void x86_late_time_init(void)
 {
x86_init.timers.timer_init();
tsc_init();
+   tsc_early_fini();
 }
 
 /*
@@ -95,3 +97,23 @@ void __init time_init(void)
 {
late_time_init = x86_late_time_init;
 }
+
+/*
+ * Called once during boot to initialize boot time.
+ */
+void read_boot_clock64(struct timespec64 *ts)
+{
+   u64 ns_boot = sched_clock_early(); /* nsec from boot */
+   struct timespec64 ts_now;
+   bool valid_clock;
+
+   /* Time from epoch */
+   read_persistent_clock64(&ts_now);
+   valid_clock = ns_boot && timespec64_valid_strict(&ts_now) &&
+   (ts_now.tv_sec || ts_now.tv_nsec);
+
+   if (!valid_clock)
+   *ts = (struct timespec64){0, 0};
+   else
+   *ts = ns_to_timespec64(timespec64_to_ns(&ts_now) - ns_boot);
+}
diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
index a55600ffdf4b..f8291fa28c0c 100644
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -63,6 +63,10 @@ extern void sched_clock_tick_stable(void);
 extern void sched_clock_idle_sleep_event(void);
 extern void sched_clock_idle_wakeup_event(void);
 
+void sched_clock_early_init(void);
+void sched_clock_early_fini(void);
+u64 sched_clock_early(void);
+
 /*
  * As outlined in clock.c, provides a fast, high resolution, nanosecond
  * time source that is monotonic per cpu argument and has bounded drift
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index ca0f8fc945c6..be5b60af4ca9 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -80,9 +80,24 @@ EXPORT_SYMBOL_GPL(sched_clock);
 
 __read_mostly int sched_clock_running;
 
+/*
+ * We start with sched clock early static branch enabled, and global status
+ * disabled.  Early in boot it is decided whether to enable the global
+ * status as well (set sched_clock_early_running to true), and later, when
+ * early clock is no longer needed, the static branch is disabled.
+ */
+static DEFINE_STATIC_KEY_TRUE(__use_sched_clock_early);
+static bool __read_mostly sched_clock_early_running;
+
 void sched_clock_init(void)
 {
-   sched_clock_running = 1;
+   /*
+* We start clock only once early clock is finished, or if early clock
+* was not running.
+*/
+   if (!sched_clock_early_running)
+   sched_clock_running = 1;
+
 }
 
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
@@ -362,6 +377,11 @@ u64 sched_clock_cpu(int cpu)
if (sched_clock_stable())
return sched_clock() + __sched_clock_offset;
 
+   if (static_branch_unlikely(&__use_sched_clock_early)) {
+   if (sched_clock_early_running)
+   return sched_clock_early();
+   }
+
if (unlikely(!sched_clock_running))
return 0ull;
 
@@ -444,6 +464,45 @@ void sched_clock_idle_wakeup_event(void)
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
+u64 __weak sched_clock_early(void)
+{
+   return 0;
+}
+
+/*
+ * Is called when sched_clock_early() is about to be finished, notifies sched
+ * clock that after this call sched_clock_early() can't be used.
+ */
+void __init sched_clock_early_fini(void)
+{
+   struct sched_clock_data *scd = this_scd();
+   u64 now_early, now_sched;
+
+   now_early = sched_clock_early();
+   now_sched = sched_clock();
+
+   __gtod_offset = now_early - scd->tick_gtod;
+   __sched_clock_offset = now_early - now_sched;
+
+   sche

[v1 1/1] mm: discard memblock data later

2017-08-11 Thread Pavel Tatashin
There is existing use after free bug when deferred struct pages are
enabled:

The memblock_add() allocates memory for the memory array if more than
128 entries are needed.  See comment in e820__memblock_setup():

  * The bootstrap memblock region count maximum is 128 entries
  * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries
  * than that - so allow memblock resizing.

This memblock memory is freed here:
free_low_memory_core_early()

We access the freed memblock.memory later in boot when deferred pages are
initialized in this path:

deferred_init_memmap()
for_each_mem_pfn_range()
  __next_mem_pfn_range()
    type = &memblock.memory;

One possible explanation for why this use-after-free hasn't been hit
before is that the limit of INIT_MEMBLOCK_REGIONS has never been exceeded
at least on systems where deferred struct pages were enabled.

Tested by reducing INIT_MEMBLOCK_REGIONS down to 4 from the current 128,
and verifying in qemu that this code is getting executed and that the freed
pages are sane.

Fixes: 7e18adb4f80b ("mm: meminit: initialise remaining struct pages in 
parallel with kswapd")
Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
Acked-by: Michal Hocko <mho...@suse.com>
---
 include/linux/memblock.h |  8 +---
 mm/memblock.c| 38 +-
 mm/nobootmem.c   | 16 
 mm/page_alloc.c  |  4 
 4 files changed, 26 insertions(+), 40 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 77d427974f57..313917e74938 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -17,7 +17,7 @@
 #include 
 #include 
 
-#define INIT_MEMBLOCK_REGIONS  128
+#define INIT_MEMBLOCK_REGIONS  4
 #define INIT_PHYSMEM_REGIONS   4
 
 /* Definition of memblock flags. */
@@ -61,6 +61,7 @@ extern int memblock_debug;
 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
 #define __init_memblock __meminit
 #define __initdata_memblock __meminitdata
+void memblock_discard(void);
 #else
 #define __init_memblock
 #define __initdata_memblock
@@ -74,8 +75,6 @@ phys_addr_t memblock_find_in_range_node(phys_addr_t size, 
phys_addr_t align,
int nid, ulong flags);
 phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
   phys_addr_t size, phys_addr_t align);
-phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr);
-phys_addr_t get_allocated_memblock_memory_regions_info(phys_addr_t *addr);
 void memblock_allow_resize(void);
 int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
 int memblock_add(phys_addr_t base, phys_addr_t size);
@@ -110,6 +109,9 @@ void __next_mem_range_rev(u64 *idx, int nid, ulong flags,
 void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start,
phys_addr_t *out_end);
 
+void __memblock_free_early(phys_addr_t base, phys_addr_t size);
+void __memblock_free_late(phys_addr_t base, phys_addr_t size);
+
 /**
  * for_each_mem_range - iterate through memblock areas from type_a and not
  * included in type_b. Or just type_a if type_b is NULL.
diff --git a/mm/memblock.c b/mm/memblock.c
index 2cb25fe4452c..bf14aea6ab70 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -285,31 +285,27 @@ static void __init_memblock memblock_remove_region(struct 
memblock_type *type, u
 }
 
 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
-
-phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
-   phys_addr_t *addr)
-{
-   if (memblock.reserved.regions == memblock_reserved_init_regions)
-   return 0;
-
-   *addr = __pa(memblock.reserved.regions);
-
-   return PAGE_ALIGN(sizeof(struct memblock_region) *
- memblock.reserved.max);
-}
-
-phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info(
-   phys_addr_t *addr)
+/**
+ * Discard memory and reserved arrays if they were allocated
+ */
+void __init memblock_discard(void)
 {
-   if (memblock.memory.regions == memblock_memory_init_regions)
-   return 0;
+   phys_addr_t addr, size;
 
-   *addr = __pa(memblock.memory.regions);
+   if (memblock.reserved.regions != memblock_reserved_init_regions) {
+   addr = __pa(memblock.reserved.regions);
+   size = PAGE_ALIGN(sizeof(struct memblock_region) *
+ memblock.reserved.max);
+   __memblock_free_late(addr, size);
+   }
 
-   return PAGE_ALIGN(sizeof(struct memblock_region) *
- memblock.memory.max);
+   if (memblock.memory.regions == memblock_memory_ini

[v1 0/1] discard memblock data later

2017-08-11 Thread Pavel Tatashin
This fixes a problem with use after free that can happen when there are
many physical regions and deferred pages are enabled.

Also, this fix is needed for my upcoming improvements to deferred pages:
"complete deferred page initialization", where we do not zero the backing
struct page memory.

Pavel Tatashin (1):
  mm: discard memblock data later

 include/linux/memblock.h |  8 +---
 mm/memblock.c| 38 +-
 mm/nobootmem.c   | 16 
 mm/page_alloc.c  |  4 
 4 files changed, 26 insertions(+), 40 deletions(-)

-- 
2.14.0



[v2 1/1] mm: discard memblock data later

2017-08-11 Thread Pavel Tatashin
There is existing use after free bug when deferred struct pages are
enabled:

The memblock_add() allocates memory for the memory array if more than
128 entries are needed.  See comment in e820__memblock_setup():

  * The bootstrap memblock region count maximum is 128 entries
  * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries
  * than that - so allow memblock resizing.

This memblock memory is freed here:
free_low_memory_core_early()

We access the freed memblock.memory later in boot when deferred pages are
initialized in this path:

deferred_init_memmap()
for_each_mem_pfn_range()
  __next_mem_pfn_range()
    type = &memblock.memory;

One possible explanation for why this use-after-free hasn't been hit
before is that the limit of INIT_MEMBLOCK_REGIONS has never been exceeded
at least on systems where deferred struct pages were enabled.

Tested by reducing INIT_MEMBLOCK_REGIONS down to 4 from the current 128,
and verifying in qemu that this code is getting executed and that the freed
pages are sane.

Fixes: 7e18adb4f80b ("mm: meminit: initialise remaining struct pages in 
parallel with kswapd")
Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
Acked-by: Michal Hocko <mho...@suse.com>
---
 include/linux/memblock.h |  6 --
 mm/memblock.c| 38 +-
 mm/nobootmem.c   | 16 
 mm/page_alloc.c  |  4 
 4 files changed, 25 insertions(+), 39 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 77d427974f57..bae11c7e7bf3 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -61,6 +61,7 @@ extern int memblock_debug;
 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
 #define __init_memblock __meminit
 #define __initdata_memblock __meminitdata
+void memblock_discard(void);
 #else
 #define __init_memblock
 #define __initdata_memblock
@@ -74,8 +75,6 @@ phys_addr_t memblock_find_in_range_node(phys_addr_t size, 
phys_addr_t align,
int nid, ulong flags);
 phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
   phys_addr_t size, phys_addr_t align);
-phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr);
-phys_addr_t get_allocated_memblock_memory_regions_info(phys_addr_t *addr);
 void memblock_allow_resize(void);
 int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
 int memblock_add(phys_addr_t base, phys_addr_t size);
@@ -110,6 +109,9 @@ void __next_mem_range_rev(u64 *idx, int nid, ulong flags,
 void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start,
phys_addr_t *out_end);
 
+void __memblock_free_early(phys_addr_t base, phys_addr_t size);
+void __memblock_free_late(phys_addr_t base, phys_addr_t size);
+
 /**
  * for_each_mem_range - iterate through memblock areas from type_a and not
  * included in type_b. Or just type_a if type_b is NULL.
diff --git a/mm/memblock.c b/mm/memblock.c
index 2cb25fe4452c..bf14aea6ab70 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -285,31 +285,27 @@ static void __init_memblock memblock_remove_region(struct 
memblock_type *type, u
 }
 
 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
-
-phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
-   phys_addr_t *addr)
-{
-   if (memblock.reserved.regions == memblock_reserved_init_regions)
-   return 0;
-
-   *addr = __pa(memblock.reserved.regions);
-
-   return PAGE_ALIGN(sizeof(struct memblock_region) *
- memblock.reserved.max);
-}
-
-phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info(
-   phys_addr_t *addr)
+/**
+ * Discard memory and reserved arrays if they were allocated
+ */
+void __init memblock_discard(void)
 {
-   if (memblock.memory.regions == memblock_memory_init_regions)
-   return 0;
+   phys_addr_t addr, size;
 
-   *addr = __pa(memblock.memory.regions);
+   if (memblock.reserved.regions != memblock_reserved_init_regions) {
+   addr = __pa(memblock.reserved.regions);
+   size = PAGE_ALIGN(sizeof(struct memblock_region) *
+ memblock.reserved.max);
+   __memblock_free_late(addr, size);
+   }
 
-   return PAGE_ALIGN(sizeof(struct memblock_region) *
- memblock.memory.max);
+   if (memblock.memory.regions != memblock_memory_init_regions) {
+   addr = __pa(memblock.memory.regions);
+   size = PAGE_ALIGN(sizeof(struct memblock_region) *
+ memblock.memory.max);
+

[v2 0/1] discard memblock data later

2017-08-11 Thread Pavel Tatashin
Changelog:
v1 - v2
- Removed debugging change to INIT_MEMBLOCK_REGIONS

This fixes a problem with use after free that can happen when there are
many physical regions and deferred pages are enabled.

Also, this fix is needed for my upcoming improvements to deferred pages:
"complete deferred page initialization", where we do not zero the backing
struct page memory.

Pavel Tatashin (1):
  mm: discard memblock data later

 include/linux/memblock.h |  6 --
 mm/memblock.c| 38 +-
 mm/nobootmem.c   | 16 
 mm/page_alloc.c  |  4 
 4 files changed, 25 insertions(+), 39 deletions(-)

-- 
2.14.0



[v6 06/15] sparc64: simplify vmemmap_populate

2017-08-07 Thread Pavel Tatashin
Remove duplicating code by using common functions
vmemmap_pud_populate and vmemmap_pgd_populate.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 arch/sparc/mm/init_64.c | 23 ++-
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 25ded711ab6c..ffb25be19389 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2590,30 +2590,19 @@ int __meminit vmemmap_populate(unsigned long vstart, 
unsigned long vend,
vstart = vstart & PMD_MASK;
vend = ALIGN(vend, PMD_SIZE);
for (; vstart < vend; vstart += PMD_SIZE) {
-   pgd_t *pgd = pgd_offset_k(vstart);
+   pgd_t *pgd = vmemmap_pgd_populate(vstart, node);
unsigned long pte;
pud_t *pud;
pmd_t *pmd;
 
-   if (pgd_none(*pgd)) {
-   pud_t *new = vmemmap_alloc_block(PAGE_SIZE, node);
+   if (!pgd)
+   return -ENOMEM;
 
-   if (!new)
-   return -ENOMEM;
-   pgd_populate(&init_mm, pgd, new);
-   }
-
-   pud = pud_offset(pgd, vstart);
-   if (pud_none(*pud)) {
-   pmd_t *new = vmemmap_alloc_block(PAGE_SIZE, node);
-
-   if (!new)
-   return -ENOMEM;
-   pud_populate(&init_mm, pud, new);
-   }
+   pud = vmemmap_pud_populate(pgd, vstart, node);
+   if (!pud)
+   return -ENOMEM;
 
pmd = pmd_offset(pud, vstart);
-
pte = pmd_val(*pmd);
if (!(pte & _PAGE_VALID)) {
void *block = vmemmap_alloc_block(PMD_SIZE, node);
-- 
2.14.0



[v6 10/15] x86/kasan: explicitly zero kasan shadow memory

2017-08-07 Thread Pavel Tatashin
To optimize the performance of struct page initialization,
vmemmap_populate() will no longer zero memory.

We must explicitly zero the memory that is allocated by vmemmap_populate()
for kasan, as this memory does not go through struct page initialization
path.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 arch/x86/mm/kasan_init_64.c | 67 +
 1 file changed, 67 insertions(+)

diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 02c9d7553409..ec6b2272fd80 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -84,6 +84,66 @@ static struct notifier_block kasan_die_notifier = {
 };
 #endif
 
+/*
+ * x86 variant of vmemmap_populate() uses either PMD_SIZE pages or base pages
+ * to map allocated memory.  This routine determines the page size for the 
given
+ * address from vmemmap.
+ */
+static u64 get_vmemmap_pgsz(u64 addr)
+{
+   pgd_t *pgd;
+   p4d_t *p4d;
+   pud_t *pud;
+   pmd_t *pmd;
+
+   pgd = pgd_offset_k(addr);
+   BUG_ON(pgd_none(*pgd) || pgd_large(*pgd));
+
+   p4d = p4d_offset(pgd, addr);
+   BUG_ON(p4d_none(*p4d) || p4d_large(*p4d));
+
+   pud = pud_offset(p4d, addr);
+   BUG_ON(pud_none(*pud) || pud_large(*pud));
+
+   pmd = pmd_offset(pud, addr);
+   BUG_ON(pmd_none(*pmd));
+
+   if (pmd_large(*pmd))
+   return PMD_SIZE;
+   return PAGE_SIZE;
+}
+
+/*
+ * Memory that was allocated by vmemmap_populate is not zeroed, so we must
+ * zero it here explicitly.
+ */
+static void
+zero_vmemmap_populated_memory(void)
+{
+   u64 i, start, end;
+
+   for (i = 0; i < E820_MAX_ENTRIES && pfn_mapped[i].end; i++) {
+   void *kaddr_start = pfn_to_kaddr(pfn_mapped[i].start);
+   void *kaddr_end = pfn_to_kaddr(pfn_mapped[i].end);
+
+   start = (u64)kasan_mem_to_shadow(kaddr_start);
+   end = (u64)kasan_mem_to_shadow(kaddr_end);
+
+   /* Round to the start and end of the mapped pages */
+   start = rounddown(start, get_vmemmap_pgsz(start));
+   end = roundup(end, get_vmemmap_pgsz(start));
+   memset((void *)start, 0, end - start);
+   }
+
+   start = (u64)kasan_mem_to_shadow(_stext);
+   end = (u64)kasan_mem_to_shadow(_end);
+
+   /* Round to the start and end of the mapped pages */
+   start = rounddown(start, get_vmemmap_pgsz(start));
+   end = roundup(end, get_vmemmap_pgsz(start));
+   memset((void *)start, 0, end - start);
+}
+
 void __init kasan_early_init(void)
 {
int i;
@@ -156,6 +216,13 @@ void __init kasan_init(void)
pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO);
set_pte(&kasan_zero_pte[i], pte);
}
+
+   /*
+* vmemmap_populate does not zero the memory, so we need to zero it
+* explicitly
+*/
+   zero_vmemmap_populated_memory();
+
/* Flush TLBs again to be sure that write protection applied. */
__flush_tlb_all();
 
-- 
2.14.0



[v6 11/15] arm64/kasan: explicitly zero kasan shadow memory

2017-08-07 Thread Pavel Tatashin
To optimize the performance of struct page initialization,
vmemmap_populate() will no longer zero memory.

We must explicitly zero the memory that is allocated by vmemmap_populate()
for kasan, as this memory does not go through struct page initialization
path.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 arch/arm64/mm/kasan_init.c | 42 ++
 1 file changed, 42 insertions(+)

diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index 81f03959a4ab..e78a9ecbb687 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -135,6 +135,41 @@ static void __init clear_pgds(unsigned long start,
set_pgd(pgd_offset_k(start), __pgd(0));
 }
 
+/*
+ * Memory that was allocated by vmemmap_populate is not zeroed, so we must
+ * zero it here explicitly.
+ */
+static void
+zero_vmemmap_populated_memory(void)
+{
+   struct memblock_region *reg;
+   u64 start, end;
+
+   for_each_memblock(memory, reg) {
+   start = __phys_to_virt(reg->base);
+   end = __phys_to_virt(reg->base + reg->size);
+
+   if (start >= end)
+   break;
+
+   start = (u64)kasan_mem_to_shadow((void *)start);
+   end = (u64)kasan_mem_to_shadow((void *)end);
+
+   /* Round to the start and end of the mapped pages */
+   start = round_down(start, SWAPPER_BLOCK_SIZE);
+   end = round_up(end, SWAPPER_BLOCK_SIZE);
+   memset((void *)start, 0, end - start);
+   }
+
+   start = (u64)kasan_mem_to_shadow(_text);
+   end = (u64)kasan_mem_to_shadow(_end);
+
+   /* Round to the start and end of the mapped pages */
+   start = round_down(start, SWAPPER_BLOCK_SIZE);
+   end = round_up(end, SWAPPER_BLOCK_SIZE);
+   memset((void *)start, 0, end - start);
+}
+
 void __init kasan_init(void)
 {
u64 kimg_shadow_start, kimg_shadow_end;
@@ -205,8 +240,15 @@ void __init kasan_init(void)
pfn_pte(sym_to_pfn(kasan_zero_page), PAGE_KERNEL_RO));
 
memset(kasan_zero_page, 0, PAGE_SIZE);
+
cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
 
+   /*
+* vmemmap_populate does not zero the memory, so we need to zero it
+* explicitly
+*/
+   zero_vmemmap_populated_memory();
+
/* At this point kasan is fully initialized. Enable error messages */
init_task.kasan_depth = 0;
pr_info("KernelAddressSanitizer initialized\n");
-- 
2.14.0



[v6 07/15] mm: defining memblock_virt_alloc_try_nid_raw

2017-08-07 Thread Pavel Tatashin
A new variant of memblock_virt_alloc_* allocations:
memblock_virt_alloc_try_nid_raw()
- Does not zero the allocated memory
- Does not panic if request cannot be satisfied (see the usage sketch below)

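A usage sketch (not from this patch): a boot-time caller that fully
initializes the memory itself can use the raw variant, but must handle a
NULL return since the raw variant does not panic. "struct foo" and "nr"
below are illustrative placeholders.

	struct foo *tbl;

	tbl = memblock_virt_alloc_try_nid_raw(nr * sizeof(*tbl),
					      SMP_CACHE_BYTES, 0,
					      BOOTMEM_ALLOC_ACCESSIBLE,
					      NUMA_NO_NODE);
	if (!tbl)
		panic("failed to allocate foo table\n");
	/* the caller is responsible for initializing every byte of tbl[] */
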
Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 include/linux/bootmem.h | 27 +
 mm/memblock.c   | 53 ++---
 2 files changed, 73 insertions(+), 7 deletions(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index e223d91b6439..ea30b3987282 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -160,6 +160,9 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
 #define BOOTMEM_ALLOC_ANYWHERE (~(phys_addr_t)0)
 
 /* FIXME: Move to memblock.h at a point where we remove nobootmem.c */
+void *memblock_virt_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr,
+ phys_addr_t max_addr, int nid);
 void *memblock_virt_alloc_try_nid_nopanic(phys_addr_t size,
phys_addr_t align, phys_addr_t min_addr,
phys_addr_t max_addr, int nid);
@@ -176,6 +179,14 @@ static inline void * __init memblock_virt_alloc(
NUMA_NO_NODE);
 }
 
+static inline void * __init memblock_virt_alloc_raw(
+   phys_addr_t size,  phys_addr_t align)
+{
+   return memblock_virt_alloc_try_nid_raw(size, align, BOOTMEM_LOW_LIMIT,
+   BOOTMEM_ALLOC_ACCESSIBLE,
+   NUMA_NO_NODE);
+}
+
 static inline void * __init memblock_virt_alloc_nopanic(
phys_addr_t size, phys_addr_t align)
 {
@@ -257,6 +268,14 @@ static inline void * __init memblock_virt_alloc(
return __alloc_bootmem(size, align, BOOTMEM_LOW_LIMIT);
 }
 
+static inline void * __init memblock_virt_alloc_raw(
+   phys_addr_t size,  phys_addr_t align)
+{
+   if (!align)
+   align = SMP_CACHE_BYTES;
+   return __alloc_bootmem_nopanic(size, align, BOOTMEM_LOW_LIMIT);
+}
+
 static inline void * __init memblock_virt_alloc_nopanic(
phys_addr_t size, phys_addr_t align)
 {
@@ -309,6 +328,14 @@ static inline void * __init 
memblock_virt_alloc_try_nid(phys_addr_t size,
  min_addr);
 }
 
+static inline void * __init memblock_virt_alloc_try_nid_raw(
+   phys_addr_t size, phys_addr_t align,
+   phys_addr_t min_addr, phys_addr_t max_addr, int nid)
+{
+   return ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, align,
+   min_addr, max_addr);
+}
+
 static inline void * __init memblock_virt_alloc_try_nid_nopanic(
phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr, phys_addr_t max_addr, int nid)
diff --git a/mm/memblock.c b/mm/memblock.c
index 08f449acfdd1..3fbf3bcb52d9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1327,7 +1327,6 @@ static void * __init memblock_virt_alloc_internal(
return NULL;
 done:
ptr = phys_to_virt(alloc);
-   memset(ptr, 0, size);
 
/*
 * The min_count is set to 0 so that bootmem allocated blocks
@@ -1340,6 +1339,38 @@ static void * __init memblock_virt_alloc_internal(
return ptr;
 }
 
+/**
+ * memblock_virt_alloc_try_nid_raw - allocate boot memory block without zeroing
+ * memory and without panicking
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region from where the allocation
+ *   is preferred (phys address)
+ * @max_addr: the upper bound of the memory region from where the allocation
+ *   is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
+ *   allocate only from memory limited by memblock.current_limit value
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Public function, provides additional debug information (including caller
+ * info), if enabled. Does not zero allocated memory, does not panic if request
+ * cannot be satisfied.
+ *
+ * RETURNS:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+void * __init memblock_virt_alloc_try_nid_raw(
+   phys_addr_t size, phys_addr_t align,
+   phys_addr_t min_addr, phys_addr_t max_addr,
+   int nid)
+{
+   memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx 
max_addr=0x%llx %pF\n",
+__func__, (u64)si

[v6 03/15] sparc64/mm: setting fields in deferred pages

2017-08-07 Thread Pavel Tatashin
Without deferred struct page feature (CONFIG_DEFERRED_STRUCT_PAGE_INIT),
flags and other fields in "struct page"es are never changed prior to first
initializing struct pages by going through __init_single_page().

With deferred struct page feature enabled there is a case where we set some
fields prior to initializing:

 mem_init() {
 register_page_bootmem_info();
 free_all_bootmem();
 ...
 }

When register_page_bootmem_info() is called only non-deferred struct pages
are initialized. But, this function goes through some reserved pages which
might be part of the deferred, and thus are not yet initialized.

mem_init
register_page_bootmem_info
 register_page_bootmem_info_node
  get_page_bootmem
   .. setting fields here ..
   such as: page->freelist = (void *)type;

We end-up with similar issue as in the previous patch, where currently we
do not observe problem as memory is zeroed. But, if flag asserts are
changed we can start hitting issues.

Also, because in this patch series we will stop zeroing struct page memory
during allocation, we must make sure that struct pages are properly
initialized prior to using them.

The deferred-reserved pages are initialized in free_all_bootmem().
Therefore, the fix is to switch the above calls.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 arch/sparc/mm/init_64.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index fed73f14aa49..25ded711ab6c 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2487,9 +2487,15 @@ void __init mem_init(void)
 {
high_memory = __va(last_valid_pfn << PAGE_SHIFT);
 
-   register_page_bootmem_info();
free_all_bootmem();
 
+   /* Must be done after boot memory is put on freelist, because here we
+* might set fields in deferred struct pages that have not yet been
+* initialized, and free_all_bootmem() initializes all the reserved
+* deferred pages for us.
+*/
+   register_page_bootmem_info();
+
/*
 * Set up the zero page, mark it reserved, so that page count
 * is not manipulated when freeing the page from user ptes.
-- 
2.14.0



[v6 12/15] mm: explicitly zero pagetable memory

2017-08-07 Thread Pavel Tatashin
Soon vmemmap_alloc_block() will no longer zero the block, so zero memory
at its call sites for everything except struct pages.  Struct page memory
is zero'd by struct page initialization.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 mm/sparse-vmemmap.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index c50b1a14d55e..d40c721ab19f 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -191,6 +191,7 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned 
long addr, int node)
void *p = vmemmap_alloc_block(PAGE_SIZE, node);
if (!p)
return NULL;
+   memset(p, 0, PAGE_SIZE);
pmd_populate_kernel(&init_mm, pmd, p);
}
return pmd;
@@ -203,6 +204,7 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned 
long addr, int node)
void *p = vmemmap_alloc_block(PAGE_SIZE, node);
if (!p)
return NULL;
+   memset(p, 0, PAGE_SIZE);
pud_populate(&init_mm, pud, p);
}
return pud;
@@ -215,6 +217,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned 
long addr, int node)
void *p = vmemmap_alloc_block(PAGE_SIZE, node);
if (!p)
return NULL;
+   memset(p, 0, PAGE_SIZE);
p4d_populate(&init_mm, p4d, p);
}
return p4d;
@@ -227,6 +230,7 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, 
int node)
void *p = vmemmap_alloc_block(PAGE_SIZE, node);
if (!p)
return NULL;
+   memset(p, 0, PAGE_SIZE);
pgd_populate(&init_mm, pgd, p);
}
return pgd;
-- 
2.14.0



[v6 05/15] mm: don't accessed uninitialized struct pages

2017-08-07 Thread Pavel Tatashin
In deferred_init_memmap() where all deferred struct pages are initialized
we have a check like this:

if (page->flags) {
VM_BUG_ON(page_zone(page) != zone);
goto free_range;
}

This way we are checking if the current deferred page has already been
initialized. It works because memory for struct pages has been zeroed, and
the only way flags can be non-zero is if the page already went through
__init_single_page().  But once we change the current behavior and no longer
zero the memory in the memblock allocator, we cannot trust anything inside
"struct page"es until they are initialized. This patch fixes this.

This patch defines a new accessor, memblock_get_reserved_pfn_range(),
which returns successive ranges of reserved PFNs.  deferred_init_memmap()
calls it to determine whether a PFN and its struct page have already been
initialized (an illustrative helper follows below).

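For illustration only, a caller could wrap the new accessor as in the
hypothetical helper below; the helper name is made up, and the real
deferred_init_memmap() changes live in the mm/page_alloc.c part of this
patch. The helper relies on the documented contract that the returned range
starts at or after @pfn (and at @pfn itself when @pfn is reserved).

static bool __init pfn_in_reserved_region(unsigned long pfn)
{
	unsigned long start_pfn, end_pfn;

	memblock_get_reserved_pfn_range(pfn, &start_pfn, &end_pfn);
	return pfn >= start_pfn && pfn < end_pfn;
}
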
Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 include/linux/memblock.h |  3 +++
 mm/memblock.c| 54 ++--
 mm/page_alloc.c  | 11 +-
 3 files changed, 61 insertions(+), 7 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index bae11c7e7bf3..b6a2a610f5e1 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -320,6 +320,9 @@ int memblock_is_map_memory(phys_addr_t addr);
 int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
 bool memblock_is_reserved(phys_addr_t addr);
 bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
+void memblock_get_reserved_pfn_range(unsigned long pfn,
+unsigned long *pfn_start,
+unsigned long *pfn_end);
 
 extern void __memblock_dump_all(void);
 
diff --git a/mm/memblock.c b/mm/memblock.c
index bf14aea6ab70..08f449acfdd1 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1580,7 +1580,13 @@ void __init memblock_mem_limit_remove_map(phys_addr_t 
limit)
memblock_cap_memory_range(0, max_addr);
 }
 
-static int __init_memblock memblock_search(struct memblock_type *type, 
phys_addr_t addr)
+/**
+ * Return index in regions array if addr is within the region. Otherwise
+ * return -1. If -1 is returned and *next_idx is not %NULL, sets it to the
+ * next region index or -1 if there is none.
+ */
+static int __init_memblock memblock_search(struct memblock_type *type,
+  phys_addr_t addr, int *next_idx)
 {
unsigned int left = 0, right = type->cnt;
 
@@ -1595,22 +1601,26 @@ static int __init_memblock memblock_search(struct 
memblock_type *type, phys_addr
else
return mid;
} while (left < right);
+
+   if (next_idx)
+   *next_idx = (right == type->cnt) ? -1 : right;
+
return -1;
 }
 
 bool __init memblock_is_reserved(phys_addr_t addr)
 {
-   return memblock_search(&memblock.reserved, addr) != -1;
+   return memblock_search(&memblock.reserved, addr, NULL) != -1;
 }
 
 bool __init_memblock memblock_is_memory(phys_addr_t addr)
 {
-   return memblock_search(&memblock.memory, addr) != -1;
+   return memblock_search(&memblock.memory, addr, NULL) != -1;
 }
 
 int __init_memblock memblock_is_map_memory(phys_addr_t addr)
 {
-   int i = memblock_search(&memblock.memory, addr);
+   int i = memblock_search(&memblock.memory, addr, NULL);
 
if (i == -1)
return false;
@@ -1622,7 +1632,7 @@ int __init_memblock memblock_search_pfn_nid(unsigned long 
pfn,
 unsigned long *start_pfn, unsigned long *end_pfn)
 {
struct memblock_type *type = &memblock.memory;
-   int mid = memblock_search(type, PFN_PHYS(pfn));
+   int mid = memblock_search(type, PFN_PHYS(pfn), NULL);
 
if (mid == -1)
return -1;
@@ -1646,7 +1656,7 @@ int __init_memblock memblock_search_pfn_nid(unsigned long 
pfn,
  */
 int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t 
size)
 {
-   int idx = memblock_search(&memblock.memory, base);
+   int idx = memblock_search(&memblock.memory, base, NULL);
phys_addr_t end = base + memblock_cap_size(base, &size);
 
if (idx == -1)
@@ -1655,6 +1665,38 @@ int __init_memblock 
memblock_is_region_memory(phys_addr_t base, phys_addr_t size
 memblock.memory.regions[idx].size) >= end;
 }
 
+/**
+ * memblock_get_reserved_pfn_range - search for the next reserved region
+ *
+ * @pfn: start searching from this pfn.
+ *
+ * RETURNS:
+ * [start_pfn, end_pfn), where start_pfn >= pfn. If none is found
+ * start_pfn, and end_pfn are both set to ULONG_MAX.
+ */
+void __init_memblock memblock_get_reserved_pfn_range(unsigned long pfn,
+unsigned long *start_pfn,
+unsigned long *end_pfn)
+{
+   struct memblock_ty

[v6 04/15] mm: discard memblock data later

2017-08-07 Thread Pavel Tatashin
There is existing use after free bug when deferred struct pages are
enabled:

The memblock_add() allocates memory for the memory array if more than
128 entries are needed.  See comment in e820__memblock_setup():

  * The bootstrap memblock region count maximum is 128 entries
  * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries
  * than that - so allow memblock resizing.

This memblock memory is freed here:
free_low_memory_core_early()

We access the freed memblock.memory later in boot when deferred pages are
initialized in this path:

deferred_init_memmap()
for_each_mem_pfn_range()
  __next_mem_pfn_range()
    type = &memblock.memory;

One possible explanation for why this use-after-free hasn't been hit
before is that the limit of INIT_MEMBLOCK_REGIONS has never been exceeded
at least on systems where deferred struct pages were enabled.

Another reason why we want this problem fixed in this patch series is that,
in the next patch, we will need to access memblock.reserved from
deferred_init_memmap().

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 include/linux/memblock.h |  6 --
 mm/memblock.c| 38 +-
 mm/nobootmem.c   | 16 
 mm/page_alloc.c  |  4 
 4 files changed, 25 insertions(+), 39 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 77d427974f57..bae11c7e7bf3 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -61,6 +61,7 @@ extern int memblock_debug;
 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
 #define __init_memblock __meminit
 #define __initdata_memblock __meminitdata
+void memblock_discard(void);
 #else
 #define __init_memblock
 #define __initdata_memblock
@@ -74,8 +75,6 @@ phys_addr_t memblock_find_in_range_node(phys_addr_t size, 
phys_addr_t align,
int nid, ulong flags);
 phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
   phys_addr_t size, phys_addr_t align);
-phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr);
-phys_addr_t get_allocated_memblock_memory_regions_info(phys_addr_t *addr);
 void memblock_allow_resize(void);
 int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
 int memblock_add(phys_addr_t base, phys_addr_t size);
@@ -110,6 +109,9 @@ void __next_mem_range_rev(u64 *idx, int nid, ulong flags,
 void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start,
phys_addr_t *out_end);
 
+void __memblock_free_early(phys_addr_t base, phys_addr_t size);
+void __memblock_free_late(phys_addr_t base, phys_addr_t size);
+
 /**
  * for_each_mem_range - iterate through memblock areas from type_a and not
  * included in type_b. Or just type_a if type_b is NULL.
diff --git a/mm/memblock.c b/mm/memblock.c
index 2cb25fe4452c..bf14aea6ab70 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -285,31 +285,27 @@ static void __init_memblock memblock_remove_region(struct 
memblock_type *type, u
 }
 
 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
-
-phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
-   phys_addr_t *addr)
-{
-   if (memblock.reserved.regions == memblock_reserved_init_regions)
-   return 0;
-
-   *addr = __pa(memblock.reserved.regions);
-
-   return PAGE_ALIGN(sizeof(struct memblock_region) *
- memblock.reserved.max);
-}
-
-phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info(
-   phys_addr_t *addr)
+/**
+ * Discard memory and reserved arrays if they were allocated
+ */
+void __init memblock_discard(void)
 {
-   if (memblock.memory.regions == memblock_memory_init_regions)
-   return 0;
+   phys_addr_t addr, size;
 
-   *addr = __pa(memblock.memory.regions);
+   if (memblock.reserved.regions != memblock_reserved_init_regions) {
+   addr = __pa(memblock.reserved.regions);
+   size = PAGE_ALIGN(sizeof(struct memblock_region) *
+ memblock.reserved.max);
+   __memblock_free_late(addr, size);
+   }
 
-   return PAGE_ALIGN(sizeof(struct memblock_region) *
- memblock.memory.max);
+   if (memblock.memory.regions != memblock_memory_init_regions) {
+   addr = __pa(memblock.memory.regions);
+   size = PAGE_ALIGN(sizeof(struct memblock_region) *
+ memblock.memory.max);
+   __memblock_free_late(addr, size);
+   }
 }
-
 #endif
 
 /**
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 36454d0f96ee..3637809a18d0 100

[v6 00/15] complete deferred page initialization

2017-08-07 Thread Pavel Tatashin
Changelog:
v6 - v5
- Fixed ARM64 + kasan code, as reported by Ard Biesheuvel
- Tested ARM64 code in qemu and found few more issues, that I fixed in this
  iteration
- Added page roundup/rounddown to x86 and arm zeroing routines to zero the
  whole allocated range, instead of only provided address range.
- Addressed SPARC related comment from Sam Ravnborg
- Fixed section mismatch warnings related to memblock_discard().

v5 - v4
- Fixed build issues reported by kbuild on various configurations

v4 - v3
- Rewrote code to zero struct pages in __init_single_page() as
  suggested by Michal Hocko
- Added code to handle issues related to accessing struct page
  memory before they are initialized.

v3 - v2
- Addressed David Miller comments about one change per patch:
* Splited changes to platforms into 4 patches
* Made "do not zero vmemmap_buf" as a separate patch

v2 - v1
- Per request, added s390 to deferred "struct page" zeroing
- Collected performance data on x86 which proves the importance of
  keeping memset() as a prefetch (see below).

SMP machines can benefit from the DEFERRED_STRUCT_PAGE_INIT config option,
which defers initializing struct pages until all cpus have been started so
it can be done in parallel.

However, this feature is sub-optimal, because the deferred page
initialization code expects that the struct pages have already been zeroed,
and the zeroing is done early in boot with a single thread only.  Also, we
access that memory and set flags before struct pages are initialized. All
of this is fixed in this patchset.

In this work we do the following:
- Never read access struct page until it was initialized
- Never set any fields in struct pages before they are initialized
- Zero struct page at the beginning of struct page initialization

Performance improvements on x86 machine with 8 nodes:
Intel(R) Xeon(R) CPU E7-8895 v3 @ 2.60GHz

Single threaded struct page init: 7.6s/T improvement
Deferred struct page init: 10.2s/T improvement

Pavel Tatashin (15):
  x86/mm: reserve only exiting low pages
  x86/mm: setting fields in deferred pages
  sparc64/mm: setting fields in deferred pages
  mm: discard memblock data later
  mm: don't accessed uninitialized struct pages
  sparc64: simplify vmemmap_populate
  mm: defining memblock_virt_alloc_try_nid_raw
  mm: zero struct pages during initialization
  sparc64: optimized struct page zeroing
  x86/kasan: explicitly zero kasan shadow memory
  arm64/kasan: explicitly zero kasan shadow memory
  mm: explicitly zero pagetable memory
  mm: stop zeroing memory during allocation in vmemmap
  mm: optimize early system hash allocations
  mm: debug for raw allocator

 arch/arm64/mm/kasan_init.c  |  42 ++
 arch/sparc/include/asm/pgtable_64.h |  30 +++
 arch/sparc/mm/init_64.c |  31 +++-
 arch/x86/kernel/setup.c |   5 +-
 arch/x86/mm/init_64.c   |   9 ++-
 arch/x86/mm/kasan_init_64.c |  67 
 include/linux/bootmem.h |  27 +++
 include/linux/memblock.h|   9 ++-
 include/linux/mm.h  |   9 +++
 mm/memblock.c   | 152 
 mm/nobootmem.c  |  16 
 mm/page_alloc.c |  31 +---
 mm/sparse-vmemmap.c |  10 ++-
 mm/sparse.c |   6 +-
 14 files changed, 356 insertions(+), 88 deletions(-)

-- 
2.14.0



[v6 02/15] x86/mm: setting fields in deferred pages

2017-08-07 Thread Pavel Tatashin
Without the deferred struct page feature (CONFIG_DEFERRED_STRUCT_PAGE_INIT),
flags and other fields in struct pages are never changed before the struct
pages are first initialized in __init_single_page().

With the deferred struct page feature enabled, there is a case where we set
some fields before initialization:

mem_init() {
register_page_bootmem_info();
free_all_bootmem();
...
}

When register_page_bootmem_info() is called, only the non-deferred struct
pages are initialized. But this function goes through some reserved pages
which might be part of the deferred range, and thus are not yet initialized.

  mem_init
   register_page_bootmem_info
    register_page_bootmem_info_node
     get_page_bootmem
      .. setting fields here ..
      such as: page->freelist = (void *)type;

We end up with a similar issue as in the previous patch: currently we do not
observe the problem because the memory is zeroed. But if the flag asserts are
tightened, we can start hitting issues.

Also, because in this patch series we will stop zeroing struct page memory
during allocation, we must make sure that struct pages are properly
initialized prior to using them.

The deferred-reserved pages are initialized in free_all_bootmem().
Therefore, the fix is to switch the above calls.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 arch/x86/mm/init_64.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 136422d7d539..1e863baec847 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1165,12 +1165,17 @@ void __init mem_init(void)
 
/* clear_bss() already clear the empty_zero_page */
 
-   register_page_bootmem_info();
-
/* this will put all memory onto the freelists */
free_all_bootmem();
after_bootmem = 1;
 
+   /* Must be done after boot memory is put on freelist, because here we
+* might set fields in deferred struct pages that have not yet been
+* initialized, and free_all_bootmem() initializes all the reserved
+* deferred pages for us.
+*/
+   register_page_bootmem_info();
+
/* Register memory areas for /proc/kcore */
kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR,
 PAGE_SIZE, KCORE_OTHER);
-- 
2.14.0



[v6 14/15] mm: optimize early system hash allocations

2017-08-07 Thread Pavel Tatashin
Clients can call alloc_large_system_hash() with the HASH_ZERO flag to specify
that the memory allocated for the system hash needs to be zeroed; otherwise
the memory does not need to be zeroed, and the client will initialize it.

If the memory does not need to be zeroed, call the new
memblock_virt_alloc_raw() interface, and thus improve boot performance.
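
As an aside, a minimal usage sketch of how an early-boot caller chooses
between the two paths. The "Example-cache" name, the scale value and the
shift/mask variables are hypothetical; the sketch only illustrates the flag
handling added by this patch:

unsigned int example_shift, example_mask;
struct hlist_head *example_table;

/* Client initializes the buckets itself: HASH_ZERO is omitted, so the
 * HASH_EARLY path uses memblock_virt_alloc_raw() and skips the memset().
 */
example_table = alloc_large_system_hash("Example-cache",
					sizeof(struct hlist_head),
					0,		/* size from memory */
					14,		/* 1 bucket per 16K */
					HASH_EARLY,
					&example_shift,
					&example_mask,
					0, 0);

/* Client relies on zeroed buckets: HASH_ZERO is set, so the zeroing
 * memblock_virt_alloc_nopanic() path is used instead.
 */
example_table = alloc_large_system_hash("Example-cache",
					sizeof(struct hlist_head),
					0, 14,
					HASH_EARLY | HASH_ZERO,
					&example_shift,
					&example_mask,
					0, 0);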

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 mm/page_alloc.c | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4d32c1fa4c6c..000806298dfb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7354,18 +7354,17 @@ void *__init alloc_large_system_hash(const char 
*tablename,
 
log2qty = ilog2(numentries);
 
-   /*
-* memblock allocator returns zeroed memory already, so HASH_ZERO is
-* currently not used when HASH_EARLY is specified.
-*/
gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
do {
size = bucketsize << log2qty;
-   if (flags & HASH_EARLY)
-   table = memblock_virt_alloc_nopanic(size, 0);
-   else if (hashdist)
+   if (flags & HASH_EARLY) {
+   if (flags & HASH_ZERO)
+   table = memblock_virt_alloc_nopanic(size, 0);
+   else
+   table = memblock_virt_alloc_raw(size, 0);
+   } else if (hashdist) {
table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
-   else {
+   } else {
/*
 * If bucketsize is not a power-of-two, we may free
 * some pages at the end of hash table which
-- 
2.14.0



[v6 15/15] mm: debug for raw allocator

2017-08-07 Thread Pavel Tatashin
When CONFIG_DEBUG_VM is enabled, this patch sets all the memory that is
returned by memblock_virt_alloc_try_nid_raw() to ones, to ensure that no
places expect zeroed memory.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 mm/memblock.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/mm/memblock.c b/mm/memblock.c
index 3fbf3bcb52d9..29fcb1dd8a81 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1363,12 +1363,19 @@ void * __init memblock_virt_alloc_try_nid_raw(
phys_addr_t min_addr, phys_addr_t max_addr,
int nid)
 {
+   void *ptr;
+
memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx 
max_addr=0x%llx %pF\n",
 __func__, (u64)size, (u64)align, nid, (u64)min_addr,
 (u64)max_addr, (void *)_RET_IP_);
 
-   return memblock_virt_alloc_internal(size, align,
-   min_addr, max_addr, nid);
+   ptr = memblock_virt_alloc_internal(size, align,
+  min_addr, max_addr, nid);
+#ifdef CONFIG_DEBUG_VM
+   if (ptr && size > 0)
+   memset(ptr, 0xff, size);
+#endif
+   return ptr;
 }
 
 /**
-- 
2.14.0



[v6 08/15] mm: zero struct pages during initialization

2017-08-07 Thread Pavel Tatashin
Add struct page zeroing as a part of initialization of other fields in
__init_single_page().

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 include/linux/mm.h | 9 +
 mm/page_alloc.c| 1 +
 2 files changed, 10 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 46b9ac5e8569..183ac5e733db 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -93,6 +93,15 @@ extern int mmap_rnd_compat_bits __read_mostly;
 #define mm_forbids_zeropage(X) (0)
 #endif
 
+/*
+ * On some architectures it is expensive to call memset() for small sizes.
+ * Those architectures should provide their own implementation of "struct page"
+ * zeroing by defining this macro in <asm/pgtable.h>.
+ */
+#ifndef mm_zero_struct_page
+#define mm_zero_struct_page(pp)  ((void)memset((pp), 0, sizeof(struct page)))
+#endif
+
 /*
  * Default maximum number of active map areas, this limits the number of vmas
  * per mm struct. Users can overwrite this number by sysctl but there is a
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 983de0a8047b..4d32c1fa4c6c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1168,6 +1168,7 @@ static void free_one_page(struct zone *zone,
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
unsigned long zone, int nid)
 {
+   mm_zero_struct_page(page);
set_page_links(page, zone, nid, pfn);
init_page_count(page);
page_mapcount_reset(page);
-- 
2.14.0



[v6 13/15] mm: stop zeroing memory during allocation in vmemmap

2017-08-07 Thread Pavel Tatashin
Replace the allocators in sparse-vmemmap with the non-zeroing version. This
way we get the performance improvement of zeroing the memory in parallel,
when the struct pages are initialized.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 mm/sparse-vmemmap.c | 6 +++---
 mm/sparse.c | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index d40c721ab19f..3b646b5ce1b6 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -41,7 +41,7 @@ static void * __ref __earlyonly_bootmem_alloc(int node,
unsigned long align,
unsigned long goal)
 {
-   return memblock_virt_alloc_try_nid(size, align, goal,
+   return memblock_virt_alloc_try_nid_raw(size, align, goal,
BOOTMEM_ALLOC_ACCESSIBLE, node);
 }
 
@@ -56,11 +56,11 @@ void * __meminit vmemmap_alloc_block(unsigned long size, 
int node)
 
if (node_state(node, N_HIGH_MEMORY))
page = alloc_pages_node(
-   node, GFP_KERNEL | __GFP_ZERO | 
__GFP_RETRY_MAYFAIL,
+   node, GFP_KERNEL | __GFP_RETRY_MAYFAIL,
get_order(size));
else
page = alloc_pages(
-   GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL,
+   GFP_KERNEL | __GFP_RETRY_MAYFAIL,
get_order(size));
if (page)
return page_address(page);
diff --git a/mm/sparse.c b/mm/sparse.c
index 7b4be3fd5cac..0e315766ad11 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -441,9 +441,9 @@ void __init sparse_mem_maps_populate_node(struct page 
**map_map,
}
 
size = PAGE_ALIGN(size);
-   map = memblock_virt_alloc_try_nid(size * map_count,
- PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
- BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
+   map = memblock_virt_alloc_try_nid_raw(size * map_count,
+ PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
if (map) {
for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
if (!present_section_nr(pnum))
-- 
2.14.0



[v6 01/15] x86/mm: reserve only existing low pages

2017-08-07 Thread Pavel Tatashin
Struct pages are initialized by going through __init_single_page(). Since
the existing physical memory in memblock is represented in memblock.memory
list, struct page for every page from this list goes through
__init_single_page().

The second memblock list, memblock.reserved, manages the allocated memory:
the memory that won't be available to the kernel allocator. So, every page from
this list goes through reserve_bootmem_region(), where certain struct page
fields are set, the assumption being that the struct pages have been
initialized beforehand.

In trim_low_memory_range() we unconditionally reserve memory from PFN 0, but
memblock.memory might start at a later PFN. For example, in QEMU,
e820__memblock_setup() can use PFN 1 as the first PFN in memblock.memory,
so PFN 0 is not on memblock.memory (and hence isn't initialized via
__init_single_page) but is on memblock.reserved (and hence we set fields in
the uninitialized struct page).

Currently, the struct page memory is always zeroed during allocation,
which prevents this problem from being detected. But, if some asserts
provided by CONFIG_DEBUG_VM_PGFLAGS are tightened, this problem may become
visible in existing kernels.

In this patchset we will stop zeroing struct page memory during allocation.
Therefore, this bug must be fixed in order to avoid random assert failures
caused by CONFIG_DEBUG_VM_PGFLAGS triggers.

The fix is to reserve memory from the first existing PFN.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 arch/x86/kernel/setup.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3486d0498800..489cdc141bcb 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -790,7 +790,10 @@ early_param("reservelow", parse_reservelow);
 
 static void __init trim_low_memory_range(void)
 {
-   memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
+   unsigned long min_pfn = find_min_pfn_with_active_regions();
+   phys_addr_t base = min_pfn << PAGE_SHIFT;
+
+   memblock_reserve(base, ALIGN(reserve_low, PAGE_SIZE));
 }

 /*
-- 
2.14.0



[v6 09/15] sparc64: optimized struct page zeroing

2017-08-07 Thread Pavel Tatashin
Add an optimized mm_zero_struct_page(), so struct pages are zeroed without
calling memset(). We do eight to ten regular stores, based on the size of
struct page. The compiler optimizes out the conditions of the switch()
statement.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Steven Sistare <steven.sist...@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jor...@oracle.com>
Reviewed-by: Bob Picco <bob.pi...@oracle.com>
---
 arch/sparc/include/asm/pgtable_64.h | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/arch/sparc/include/asm/pgtable_64.h 
b/arch/sparc/include/asm/pgtable_64.h
index 6fbd931f0570..cee5cc7ccc51 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -230,6 +230,36 @@ extern unsigned long _PAGE_ALL_SZ_BITS;
 extern struct page *mem_map_zero;
 #define ZERO_PAGE(vaddr)   (mem_map_zero)
 
+/* This macro must be updated when the size of struct page grows above 80
+ * or reduces below 64.
+ * The idea is that the compiler optimizes out the switch() statement and
+ * only leaves clrx instructions.
+ */
+#define mm_zero_struct_page(pp) do {					\
+   unsigned long *_pp = (void *)(pp);  \
+   \
+/* Check that struct page is either 64, 72, or 80 bytes */ \
+   BUILD_BUG_ON(sizeof(struct page) & 7);  \
+   BUILD_BUG_ON(sizeof(struct page) < 64); \
+   BUILD_BUG_ON(sizeof(struct page) > 80); \
+   \
+   switch (sizeof(struct page)) {  \
+   case 80:\
+   _pp[9] = 0; /* fallthrough */   \
+   case 72:\
+   _pp[8] = 0; /* fallthrough */   \
+   default:\
+   _pp[7] = 0; \
+   _pp[6] = 0; \
+   _pp[5] = 0; \
+   _pp[4] = 0; \
+   _pp[3] = 0; \
+   _pp[2] = 0; \
+   _pp[1] = 0; \
+   _pp[0] = 0; \
+   }   \
+} while (0)
+
 /* PFNs are real physical page numbers.  However, mem_map only begins to record
  * per-page information starting at pfn_base.  This is to handle systems where
  * the first physical page in the machine is at some huge physical address,
-- 
2.14.0



[v4 1/1] mm: Adaptive hash table scaling

2017-05-20 Thread Pavel Tatashin
Allow hash tables to scale with memory, but at a slower pace: when HASH_ADAPT
is provided, every time memory quadruples the sizes of hash tables will only
double instead of quadrupling as well. This algorithm starts working only
when memory size reaches a certain point, currently set to 64G.

This is an example of the dentry hash table size, before and after, for
various memory configurations:

MEMORY      SCALE       HASH_SIZE
            old  new    old      new
     8G      13   13      8M      8M
    16G      13   13     16M     16M
    32G      13   13     32M     32M
    64G      13   13     64M     64M
   128G      13   14    128M     64M
   256G      13   14    256M    128M
   512G      13   15    512M    128M
  1024G      13   15   1024M    256M
  2048G      13   16   2048M    256M
  4096G      13   16   4096M    512M
  8192G      13   17   8192M    512M
 16384G      13   17  16384M   1024M
 32768G      13   18  32768M   1024M
 65536G      13   18  65536M   2048M
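
To make the scaling concrete, a small stand-alone sketch of the loop (user
space, assuming a 4K page size, a 512G machine and the dentry hash's default
scale of 13 from the table above; none of these values are part of the
patch):

#include <stdio.h>

#define PAGE_SHIFT		12			/* assume 4K pages */
#define ADAPT_SCALE_BASE	(64ull << 30)		/* 64G */
#define ADAPT_SCALE_SHIFT	2
#define ADAPT_SCALE_NPAGES	(ADAPT_SCALE_BASE >> PAGE_SHIFT)

int main(void)
{
	unsigned long long numentries = (512ull << 30) >> PAGE_SHIFT;
	unsigned long long adapt;
	int scale = 13;

	/* every quadrupling of memory past 64G bumps the scale by one */
	for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
	     adapt <<= ADAPT_SCALE_SHIFT)
		scale++;

	/* The loop runs twice (64G < 512G and 256G < 512G), so this prints
	 * "scale = 15"; the hash table shrinks by 2^2, from 512M to 128M,
	 * matching the 512G row in the table above.
	 */
	printf("scale = %d\n", scale);
	return 0;
}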

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 mm/page_alloc.c | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8afa63e81e73..15bba5c325a5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7169,6 +7169,17 @@ static unsigned long __init 
arch_reserved_kernel_pages(void)
 #endif
 
 /*
+ * Adaptive scale is meant to reduce sizes of hash tables on large memory
+ * machines. As memory size is increased the scale is also increased but at
+ * slower pace.  Starting from ADAPT_SCALE_BASE (64G), every time memory
+ * quadruples the scale is increased by one, which means the size of hash table
+ * only doubles, instead of quadrupling as well.
+ */
+#define ADAPT_SCALE_BASE   (64ull << 30)
+#define ADAPT_SCALE_SHIFT  2
+#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)
+
+/*
  * allocate a large system hash table from bootmem
  * - it is assumed that the hash table must contain an exact power-of-2
  *   quantity of entries
@@ -7199,6 +7210,14 @@ void *__init alloc_large_system_hash(const char 
*tablename,
if (PAGE_SHIFT < 20)
numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
 
+   if (!high_limit) {
+   unsigned long long adapt;
+
+   for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
+adapt <<= ADAPT_SCALE_SHIFT)
+   scale++;
+   }
+
/* limit to 1 bucket per 2^scale bytes of low memory */
if (scale > PAGE_SHIFT)
numentries >>= (scale - PAGE_SHIFT);
-- 
2.13.0



[v4 0/1] mm: Adaptive hash table scaling

2017-05-20 Thread Pavel Tatashin
Changes from v3 - v4:
- Fixed an issue with 32-bit overflow (adapt is ull now instead of ul)
- Added changes suggested by Michal Hocko: use high_limit instead of
  a new flag to determine that we should use this new scaling.

Pavel Tatashin (1):
  mm: Adaptive hash table scaling

 mm/page_alloc.c | 19 +++
 1 file changed, 19 insertions(+)

-- 
2.13.0



[v5 1/1] mm: Adaptive hash table scaling

2017-05-22 Thread Pavel Tatashin
Allow hash tables to scale with memory, but at a slower pace: when HASH_ADAPT
is provided, every time memory quadruples the sizes of hash tables will only
double instead of quadrupling as well. This algorithm starts working only
when memory size reaches a certain point, currently set to 64G.

This is an example of the dentry hash table size, before and after, for
various memory configurations:

MEMORY      SCALE       HASH_SIZE
            old  new    old      new
     8G      13   13      8M      8M
    16G      13   13     16M     16M
    32G      13   13     32M     32M
    64G      13   13     64M     64M
   128G      13   14    128M     64M
   256G      13   14    256M    128M
   512G      13   15    512M    128M
  1024G      13   15   1024M    256M
  2048G      13   16   2048M    256M
  4096G      13   16   4096M    512M
  8192G      13   17   8192M    512M
 16384G      13   17  16384M   1024M
 32768G      13   18  32768M   1024M
 65536G      13   18  65536M   2048M

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 mm/page_alloc.c | 25 +
 1 file changed, 25 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8afa63e81e73..409e0cd35381 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7169,6 +7169,21 @@ static unsigned long __init 
arch_reserved_kernel_pages(void)
 #endif
 
 /*
+ * Adaptive scale is meant to reduce sizes of hash tables on large memory
+ * machines. As memory size is increased the scale is also increased but at
+ * slower pace.  Starting from ADAPT_SCALE_BASE (64G), every time memory
+ * quadruples the scale is increased by one, which means the size of hash table
+ * only doubles, instead of quadrupling as well.
+ * Because 32-bit systems cannot have large physical memory, where this scaling
+ * makes sense, it is disabled on such platforms.
+ */
+#if __BITS_PER_LONG > 32
+#define ADAPT_SCALE_BASE   (64ul << 30)
+#define ADAPT_SCALE_SHIFT  2
+#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)
+#endif
+
+/*
  * allocate a large system hash table from bootmem
  * - it is assumed that the hash table must contain an exact power-of-2
  *   quantity of entries
@@ -7199,6 +7214,16 @@ void *__init alloc_large_system_hash(const char 
*tablename,
if (PAGE_SHIFT < 20)
numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
 
+#if __BITS_PER_LONG > 32
+   if (!high_limit) {
+   unsigned long adapt;
+
+   for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
+adapt <<= ADAPT_SCALE_SHIFT)
+   scale++;
+   }
+#endif
+
/* limit to 1 bucket per 2^scale bytes of low memory */
if (scale > PAGE_SHIFT)
numentries >>= (scale - PAGE_SHIFT);
-- 
2.13.0



[v5 0/1] mm: Adaptive hash table scaling

2017-05-22 Thread Pavel Tatashin
Changes from v4 - v5:
- Disabled adaptive hash on 32 bit systems to avoid confusion of
  whether base should be different for smaller systems, and to
  avoid overflows.

Pavel Tatashin (1):
  mm: Adaptive hash table scaling

 mm/page_alloc.c | 25 +
 1 file changed, 25 insertions(+)

-- 
2.13.0



[v3 5/9] mm: zero struct pages during initialization

2017-05-05 Thread Pavel Tatashin
When deferred struct page initialization is enabled, do not expect that
the memory that was allocated for struct pages was zeroed by the
allocator. Zero it when "struct pages" are initialized.

Also, a boolean constant, VMEMMAP_ZERO, is provided to tell platforms whether
they should zero memory or can defer it.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 include/linux/mm.h |9 +
 mm/page_alloc.c|3 +++
 2 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4375015..1c481fc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2419,6 +2419,15 @@ int vmemmap_populate_basepages(unsigned long start, 
unsigned long end,
 #ifdef CONFIG_MEMORY_HOTPLUG
 void vmemmap_free(unsigned long start, unsigned long end);
 #endif
+/*
+ * Don't zero "struct page"es during early boot, and zero only when they are
+ * initialized in parallel.
+ */
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+#define VMEMMAP_ZERO   false
+#else
+#define VMEMMAP_ZERO   true
+#endif
 void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
  unsigned long size);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2c25de4..e736c6a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1159,6 +1159,9 @@ static void free_one_page(struct zone *zone,
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
unsigned long zone, int nid)
 {
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+   memset(page, 0, sizeof(struct page));
+#endif
set_page_links(page, zone, nid, pfn);
init_page_count(page);
page_mapcount_reset(page);
-- 
1.7.1



[v3 6/9] sparc64: teach sparc not to zero struct pages memory

2017-05-05 Thread Pavel Tatashin
If we are using deferred struct page initialization feature, most of
"struct page"es are getting initialized after other CPUs are started, and
hence we are benefiting from doing this job in parallel. However, we are
still zeroing all the memory that is allocated for "struct pages" using the
boot CPU.  This patch solves this problem by deferring the zeroing of "struct
pages" until they are initialized on SPARC.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
Reviewed-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 arch/sparc/mm/init_64.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index c72d070..dae040c 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2546,7 +2546,7 @@ int __meminit vmemmap_populate(unsigned long vstart, 
unsigned long vend,
pte = pmd_val(*pmd);
if (!(pte & _PAGE_VALID)) {
void *block = vmemmap_alloc_block(PMD_SIZE, node,
- true);
+ VMEMMAP_ZERO);
 
if (!block)
return -ENOMEM;
-- 
1.7.1



[v3 4/9] mm: do not zero vmemmap_buf

2017-05-05 Thread Pavel Tatashin
alloc_block_buf() can either use an external allocator, by calling
vmemmap_alloc_block(), or, when available, use the pre-allocated vmemmap_buf
to do the allocation. In either case, alloc_block_buf() knows when to zero
memory based on the "zero" argument, so there is no need to zero vmemmap_buf
beforehand. Let the clients of alloc_block_buf() decide whether that is
needed.

Signed-off-by: Pavel Tatashin <pasha.tatas...@oracle.com>
---
 mm/sparse-vmemmap.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 5d255b0..1e9508b 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -305,7 +305,7 @@ void __init sparse_mem_maps_populate_node(struct page 
**map_map,
 
size = ALIGN(size, PMD_SIZE);
vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size
-   * map_count, PMD_SIZE, __pa(MAX_DMA_ADDRESS), true);
+   * map_count, PMD_SIZE, __pa(MAX_DMA_ADDRESS), false);
 
if (vmemmap_buf_start) {
vmemmap_buf = vmemmap_buf_start;
-- 
1.7.1



  1   2   3   4   5   6   7   8   9   10   >