Re: [PATCH] powerpc/mm: fix a hardcode on memory boundary checking

2017-02-05 Thread Rui Teng

On 31/01/2017 5:11 PM, Michael Ellerman wrote:

Rui Teng <rui.t...@linux.vnet.ibm.com> writes:


The offset of the hugepage block will not be 16G if more than one
page is expected. Calculate the total size instead of using the
hardcoded value.


I assume you found this by code inspection and not by triggering an
actual bug?


Yes, I found this problem only by code inspection. We were looking for
ways to enable 16G huge pages other than changing the device tree, for
example by providing a new interface to set the size and pages
parameters.

So I think it may cause a problem here.
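
To make the arithmetic concrete, here is a small standalone sketch
(hypothetical sizes, not the kernel code itself) of why the hardcoded
16G offset under-checks a block that carries more than one 16G page:

/* Sketch only: block_size/expected_pages mirror the device tree values. */
#include <stdio.h>

int main(void)
{
	unsigned long long gb = 1024ULL * 1024 * 1024;
	unsigned long long dram_end = 48 * gb;	/* stand-in for memblock_end_of_DRAM() */
	unsigned long long phys_addr = 24 * gb;	/* start of the hugepage block */
	unsigned long long block_size = 16 * gb;
	unsigned int expected_pages = 2;	/* two 16G pages requested */

	/* Old check: only the first 16G is validated, 24G + 16G <= 48G passes. */
	printf("old check passes: %d\n", phys_addr + 16 * gb <= dram_end);

	/* New check: the whole reservation is validated, 24G + 32G <= 48G fails. */
	printf("new check passes: %d\n",
	       phys_addr + block_size * expected_pages <= dram_end);
	return 0;
}

So with more than one expected page, the old check can let
memblock_reserve() and add_gpage() run past the end of DRAM.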



cheers


diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 8033493..b829f8e 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -506,7 +506,7 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
printk(KERN_INFO "Huge page(16GB) memory: "
"addr = 0x%lX size = 0x%lX pages = %d\n",
phys_addr, block_size, expected_pages);
-   if (phys_addr + (16 * GB) <= memblock_end_of_DRAM()) {
+   if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) {
memblock_reserve(phys_addr, block_size * expected_pages);
add_gpage(phys_addr, block_size, expected_pages);
}
--
2.9.0






[PATCH] powerpc/mm: fix a hardcode on memory boundary checking

2017-01-12 Thread Rui Teng
The offset of the hugepage block will not be 16G if more than one
page is expected. Calculate the total size instead of using the
hardcoded value.

Signed-off-by: Rui Teng <rui.t...@linux.vnet.ibm.com>
---
 arch/powerpc/mm/hash_utils_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 8033493..b829f8e 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -506,7 +506,7 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
printk(KERN_INFO "Huge page(16GB) memory: "
"addr = 0x%lX size = 0x%lX pages = %d\n",
phys_addr, block_size, expected_pages);
-   if (phys_addr + (16 * GB) <= memblock_end_of_DRAM()) {
+   if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) {
memblock_reserve(phys_addr, block_size * expected_pages);
add_gpage(phys_addr, block_size, expected_pages);
}
-- 
2.9.0



[PATCH] tools/testing/selftests/powerpc: Add Anton's null_syscall benchmark to the selftests

2016-09-27 Thread Rui Teng
From: Anton Blanchard <an...@au.ibm.com>

Pull in a version of Anton's null_syscall benchmark:
http://ozlabs.org/~anton/junkcode/null_syscall.c
Into tools/testing/selftests/powerpc/benchmarks.

Suggested-by: Michael Ellerman <m...@ellerman.id.au>
Signed-off-by: Anton Blanchard <an...@au.ibm.com>
Signed-off-by: Rui Teng <rui.t...@linux.vnet.ibm.com>
---
 .../testing/selftests/powerpc/benchmarks/Makefile  |   2 +-
 .../selftests/powerpc/benchmarks/null_syscall.c| 157 +
 2 files changed, 158 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/benchmarks/null_syscall.c

diff --git a/tools/testing/selftests/powerpc/benchmarks/Makefile b/tools/testing/selftests/powerpc/benchmarks/Makefile
index a9adfb7..545077f 100644
--- a/tools/testing/selftests/powerpc/benchmarks/Makefile
+++ b/tools/testing/selftests/powerpc/benchmarks/Makefile
@@ -1,4 +1,4 @@
-TEST_PROGS := gettimeofday context_switch mmap_bench futex_bench
+TEST_PROGS := gettimeofday context_switch mmap_bench futex_bench null_syscall
 
 CFLAGS += -O2
 
diff --git a/tools/testing/selftests/powerpc/benchmarks/null_syscall.c b/tools/testing/selftests/powerpc/benchmarks/null_syscall.c
new file mode 100644
index 0000000..59c2f45
--- /dev/null
+++ b/tools/testing/selftests/powerpc/benchmarks/null_syscall.c
@@ -0,0 +1,157 @@
+/*
+ * Test null syscall performance
+ *
+ * Copyright (C) 2009-2015 Anton Blanchard <an...@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define NR_LOOPS 1000
+
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <time.h>
+
+static volatile int soak_done;
+unsigned long long clock_frequency;
+unsigned long long timebase_frequency;
+double timebase_multiplier;
+
+static inline unsigned long long mftb(void)
+{
+   unsigned long low;
+
+   asm volatile("mftb %0" : "=r" (low));
+
+   return low;
+}
+
+static void sigalrm_handler(int unused)
+{
+   soak_done = 1;
+}
+
+/*
+ * Use a timer instead of busy looping on clock_gettime() so we don't
+ * pollute profiles with glibc and VDSO hits.
+ */
+static void cpu_soak_usecs(unsigned long usecs)
+{
+   struct itimerval val;
+
+   memset(&val, 0, sizeof(val));
+   val.it_value.tv_usec = usecs;
+
+   signal(SIGALRM, sigalrm_handler);
+   setitimer(ITIMER_REAL, &val, NULL);
+
+   while (1) {
+   if (soak_done)
+   break;
+   }
+
+   signal(SIGALRM, SIG_DFL);
+}
+
+/*
+ * This only works with recent kernels where cpufreq modifies
+ * /proc/cpuinfo dynamically.
+ */
+static void get_proc_frequency(void)
+{
+   FILE *f;
+   char line[128];
+   char *p, *end;
+   unsigned long v;
+   double d;
+   char *override;
+
+   /* Try to get out of low power/low frequency mode */
+   cpu_soak_usecs(0.25 * 1000000);
+
+   f = fopen("/proc/cpuinfo", "r");
+   if (f == NULL)
+   return;
+
+   timebase_frequency = 0;
+
+   while (fgets(line, sizeof(line), f) != NULL) {
+   if (strncmp(line, "timebase", 8) == 0) {
+   p = strchr(line, ':');
+   if (p != NULL) {
+   v = strtoull(p + 1, &end, 0);
+   if (end != p + 1)
+   timebase_frequency = v;
+   }
+   }
+
+   if (((strncmp(line, "clock", 5) == 0) ||
+(strncmp(line, "cpu MHz", 7) == 0))) {
+   p = strchr(line, ':');
+   if (p != NULL) {
+   d = strtod(p + 1, &end);
+   if (end != p + 1) {
+   /* Find fastest clock frequency */
+   if ((d * 1000000ULL) > clock_frequency)
+   clock_frequency = d * 1000000ULL;
+   }
+   }
+   }
+   }
+
+   fclose(f);
+
+   override = getenv("FREQUENCY");
+   if (override)
+   clock_frequency = strtoull(override, NULL, 10);
+
+   if (timebase_frequency)
+   timebase_multiplier = (double)clock_frequency
+   / timebase_frequency;
+   else
+   timebase_multiplier = 1;
+}
+
+static void do_null_syscall(unsigned long nr)
+{
+   unsigned long i;
+
+   for (i = 0; i < nr; i++)
+   getppid();
+}
+
+#define TIME(A, STR) \
+
+int main(void)
+{
+   unsigned long tb_start, tb_now;
+

[PATCH] [v2] powerpc: Clean up tm_abort duplication in hash_utils_64.c

2016-09-02 Thread Rui Teng
The same logic appears twice and should probably be pulled out into a function.

Suggested-by: Michael Ellerman <m...@ellerman.id.au>
Signed-off-by: Rui Teng <rui.t...@linux.vnet.ibm.com>
---
Changes in V2:
- Change function to static and inline
- Use #else block to define an empty static inline function

---
 arch/powerpc/mm/hash_utils_64.c | 55 +++--
 1 file changed, 25 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 0821556..b68f6d0 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1460,6 +1460,29 @@ out_exit:
local_irq_restore(flags);
 }
 
+/*
+ * Transactions are not aborted by tlbiel, only tlbie.
+ * Without, syncing a page back to a block device w/ PIO could pick up
+ * transactional data (bad!) so we force an abort here.  Before the
+ * sync the page will be made read-only, which will flush_hash_page.
+ * BIG ISSUE here: if the kernel uses a page from userspace without
+ * unmapping it first, it may see the speculated version.
+ */
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static inline void local_tm_abort(int local)
+{
+   if (local && cpu_has_feature(CPU_FTR_TM) && current->thread.regs &&
+   MSR_TM_ACTIVE(current->thread.regs->msr)) {
+   tm_enable();
+   tm_abort(TM_CAUSE_TLBI);
+   }
+}
+#else
+static inline void local_tm_abort(int local)
+{
+}
+#endif
+
 /* WARNING: This is called from hash_low_64.S, if you change this prototype,
  *  do not forget to update the assembly call site !
  */
@@ -1486,21 +1509,7 @@ void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
 ssize, local);
} pte_iterate_hashed_end();
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-   /* Transactions are not aborted by tlbiel, only tlbie.
-* Without, syncing a page back to a block device w/ PIO could pick up
-* transactional data (bad!) so we force an abort here.  Before the
-* sync the page will be made read-only, which will flush_hash_page.
-* BIG ISSUE here: if the kernel uses a page from userspace without
-* unmapping it first, it may see the speculated version.
-*/
-   if (local && cpu_has_feature(CPU_FTR_TM) &&
-   current->thread.regs &&
-   MSR_TM_ACTIVE(current->thread.regs->msr)) {
-   tm_enable();
-   tm_abort(TM_CAUSE_TLBI);
-   }
-#endif
+   local_tm_abort(local);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -1557,21 +1566,7 @@ void flush_hash_hugepage(unsigned long vsid, unsigned long addr,
 MMU_PAGE_16M, ssize, local);
}
 tm_abort:
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-   /* Transactions are not aborted by tlbiel, only tlbie.
-* Without, syncing a page back to a block device w/ PIO could pick up
-* transactional data (bad!) so we force an abort here.  Before the
-* sync the page will be made read-only, which will flush_hash_page.
-* BIG ISSUE here: if the kernel uses a page from userspace without
-* unmapping it first, it may see the speculated version.
-*/
-   if (local && cpu_has_feature(CPU_FTR_TM) &&
-   current->thread.regs &&
-   MSR_TM_ACTIVE(current->thread.regs->msr)) {
-   tm_enable();
-   tm_abort(TM_CAUSE_TLBI);
-   }
-#endif
+   local_tm_abort(local);
return;
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-- 
2.7.4



Re: [PATCH] powerpc: Clean up tm_abort duplication in hash_utils_64.c

2016-09-01 Thread Rui Teng

On 9/1/16 11:46 PM, Thiago Jung Bauermann wrote:

On Friday, 26 August 2016, 11:50:10, Rui Teng wrote:

The same logic appears twice and should probably be pulled out into a
function.

Suggested-by: Michael Ellerman <m...@ellerman.id.au>
Signed-off-by: Rui Teng <rui.t...@linux.vnet.ibm.com>
---
 arch/powerpc/mm/hash_utils_64.c | 45 +
 1 file changed, 19 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 0821556..69ef702 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1460,6 +1460,23 @@ out_exit:
local_irq_restore(flags);
 }

+/*
+ * Transactions are not aborted by tlbiel, only tlbie.
+ * Without, syncing a page back to a block device w/ PIO could pick up
+ * transactional data (bad!) so we force an abort here.  Before the
+ * sync the page will be made read-only, which will flush_hash_page.
+ * BIG ISSUE here: if the kernel uses a page from userspace without
+ * unmapping it first, it may see the speculated version.
+ */
+void local_tm_abort(int local)
+{
+   if (local && cpu_has_feature(CPU_FTR_TM) && current->thread.regs &&
+   MSR_TM_ACTIVE(current->thread.regs->msr)) {
+   tm_enable();
+   tm_abort(TM_CAUSE_TLBI);
+   }
+}
+


Since local_tm_abort is only used in this file, it should be static.

OK


Also, since both places calling it are guarded by
CONFIG_PPC_TRANSACTIONAL_MEM, wouldn't it be cleaner if the #ifdef was here
instead and the #else block defined an empty static inline function? Then
the call sites wouldn't need to be guarded.

I had considered this style before, but I was worried about the extra
call overhead from an empty function, and I had forgotten about making
it inline. I will send v2 with your comments.

Thanks!
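
For reference, a minimal sketch of the pattern being suggested
(hypothetical names, not the actual kernel symbols): the #ifdef moves
into the helper definition, and the #else branch supplies an empty
static inline that the compiler discards, so the call sites need no
guards and there is no extra call overhead:

static int feature_x_aborts;	/* hypothetical bookkeeping */

#ifdef CONFIG_FEATURE_X
static inline void feature_x_abort(int local)
{
	if (local)
		feature_x_aborts++;	/* real work would go here */
}
#else
static inline void feature_x_abort(int local)
{
	/* empty stub: inlined away, callers stay unguarded */
}
#endif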







[PATCH] [V3] powerpc/mm: Add validation for platform reserved memory ranges

2016-09-01 Thread Rui Teng
From: Anshuman Khandual <khand...@linux.vnet.ibm.com>

For a partition running on PHYP, there can be an adjunct partition
which shares the virtual address range with the operating system.
Virtual address ranges which can be used by the adjunct partition
are communicated through the virtual device node of the device tree
with a property known as "ibm,reserved-virtual-addresses". This patch
introduces a new function named 'validate_reserved_va_range' which
is called during initialization to validate that these reserved
virtual address ranges do not overlap with the address ranges used
by the kernel for all supported memory contexts. This helps prevent
the possibility of getting return codes similar to H_RESOURCE for
H_PROTECT hcalls for conflicting HPTE entries.

Signed-off-by: Anshuman Khandual <khand...@linux.vnet.ibm.com>
Signed-off-by: Rui Teng <rui.t...@linux.vnet.ibm.com>
---
- Tested on both POWER8 LE and BE platforms

Changes in V3:
- Use u32 and u64 to store the virtual address and use CPU endian mask.

Changes in V2:
- Added braces to the definition of LINUX_VA_BITS
- Adjusted tabs as spaces for the definition of PARTIAL_LINUX_VA_MASK

---
 arch/powerpc/mm/hash_utils_64.c | 68 +
 1 file changed, 68 insertions(+)

diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 0821556..85c5123 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1733,3 +1733,71 @@ void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
/* Finally limit subsequent allocations */
memblock_set_current_limit(ppc64_rma_size);
 }
+
+/*
+ * PAPR says that each reserved virtual address range record
+ * contains three be32 elements, which total 12 bytes.
+ * First two be32 elements contain the abbreviated virtual
+ * address (high order 32 bits and low order 32 bits that
+ * generate the abbreviated virtual address of 64 bits which
+ * need to be concatenated with 24 bits of 0 at the end) and
+ * the third be32 element contains the size of the reserved
+ * virtual address range as number of consecutive 4K pages.
+ */
+struct reserved_va_record {
+   u32 high_addr;
+   u32 low_addr;
+   u32 nr_pages_4K;
+};
+
+/*
+ * Linux uses 65 bits (CONTEXT_BITS + ESID_BITS + SID_SHIFT)
+ * of virtual address. As reserved virtual address comes in
+ * as an abbreviated form (64 bits) from the device tree, we
+ * will use a partial address bit mask (65 >> 24) to match it
+ * for simplicity.
+ */
+#define RVA_LESS_BITS  24
+#define LINUX_VA_BITS  (CONTEXT_BITS + ESID_BITS + SID_SHIFT)
+#define PARTIAL_LINUX_VA_MASK  ((1ULL << (LINUX_VA_BITS - RVA_LESS_BITS)) - 1)
+
+static int __init validate_reserved_va_range(void)
+{
+   struct reserved_va_record rva;
+   struct device_node *np;
+   int records, i;
+   u64 vaddr;
+
+   np = of_find_node_by_name(NULL, "vdevice");
+   if (!np)
+   return -ENODEV;
+
+   records = of_property_count_elems_of_size(np,
+   "ibm,reserved-virtual-addresses",
+   sizeof(struct reserved_va_record));
+   if (records < 0)
+   return records;
+
+   for (i = 0; i < records; i++) {
+   of_property_read_u32_index(np,
+   "ibm,reserved-virtual-addresses",
+   3 * i, &rva.high_addr);
+   of_property_read_u32_index(np,
+   "ibm,reserved-virtual-addresses",
+   3 * i + 1, &rva.low_addr);
+   of_property_read_u32_index(np,
+   "ibm,reserved-virtual-addresses",
+   3 * i + 2, &rva.nr_pages_4K);
+
+   vaddr =  rva.high_addr;
+   vaddr =  (vaddr << 32) | rva.low_addr;
+   if (unlikely(!(vaddr & ~PARTIAL_LINUX_VA_MASK))) {
+   pr_err("RVA [0x%llx00 (0x%x in bytes)] overlapped\n",
+   vaddr, rva.nr_pages_4K * 4096);
+   BUG();
+   }
+   }
+   of_node_put(np);
+   return 0;
+}
+device_initcall(validate_reserved_va_range);
-- 
2.7.4
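
As a standalone illustration of the record layout described in the
comment above (hypothetical values; real records come from the
"ibm,reserved-virtual-addresses" property), one record decodes like
this:

#include <stdio.h>

/* Mirrors struct reserved_va_record from the patch above. */
struct reserved_va_record {
	unsigned int high_addr;
	unsigned int low_addr;
	unsigned int nr_pages_4K;
};

int main(void)
{
	/* Hypothetical record: abbreviated VA 0x0000010000000000, 16 x 4K pages. */
	struct reserved_va_record rva = { 0x00000100, 0x00000000, 16 };
	unsigned long long vaddr = ((unsigned long long)rva.high_addr << 32) | rva.low_addr;
	/* Linux uses 65 VA bits; the abbreviated form drops the low 24 bits. */
	unsigned long long partial_mask = (1ULL << (65 - 24)) - 1;

	printf("abbreviated VA = 0x%llx, reserved size = %u bytes\n",
	       vaddr, rva.nr_pages_4K * 4096);
	printf("falls inside the Linux-used VA range: %s\n",
	       (vaddr & ~partial_mask) ? "no" : "yes");
	return 0;
}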



[PATCH] powerpc: Clean up tm_abort duplication in hash_utils_64.c

2016-08-25 Thread Rui Teng
The same logic appears twice and should probably be pulled out into a function.

Suggested-by: Michael Ellerman <m...@ellerman.id.au>
Signed-off-by: Rui Teng <rui.t...@linux.vnet.ibm.com>
---
 arch/powerpc/mm/hash_utils_64.c | 45 +
 1 file changed, 19 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 0821556..69ef702 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1460,6 +1460,23 @@ out_exit:
local_irq_restore(flags);
 }
 
+/*
+ * Transactions are not aborted by tlbiel, only tlbie.
+ * Without, syncing a page back to a block device w/ PIO could pick up
+ * transactional data (bad!) so we force an abort here.  Before the
+ * sync the page will be made read-only, which will flush_hash_page.
+ * BIG ISSUE here: if the kernel uses a page from userspace without
+ * unmapping it first, it may see the speculated version.
+ */
+void local_tm_abort(int local)
+{
+   if (local && cpu_has_feature(CPU_FTR_TM) && current->thread.regs &&
+   MSR_TM_ACTIVE(current->thread.regs->msr)) {
+   tm_enable();
+   tm_abort(TM_CAUSE_TLBI);
+   }
+}
+
 /* WARNING: This is called from hash_low_64.S, if you change this prototype,
  *  do not forget to update the assembly call site !
  */
@@ -1487,19 +1504,7 @@ void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
} pte_iterate_hashed_end();
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-   /* Transactions are not aborted by tlbiel, only tlbie.
-* Without, syncing a page back to a block device w/ PIO could pick up
-* transactional data (bad!) so we force an abort here.  Before the
-* sync the page will be made read-only, which will flush_hash_page.
-* BIG ISSUE here: if the kernel uses a page from userspace without
-* unmapping it first, it may see the speculated version.
-*/
-   if (local && cpu_has_feature(CPU_FTR_TM) &&
-   current->thread.regs &&
-   MSR_TM_ACTIVE(current->thread.regs->msr)) {
-   tm_enable();
-   tm_abort(TM_CAUSE_TLBI);
-   }
+   local_tm_abort(local);
 #endif
 }
 
@@ -1558,19 +1563,7 @@ void flush_hash_hugepage(unsigned long vsid, unsigned long addr,
}
 tm_abort:
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-   /* Transactions are not aborted by tlbiel, only tlbie.
-* Without, syncing a page back to a block device w/ PIO could pick up
-* transactional data (bad!) so we force an abort here.  Before the
-* sync the page will be made read-only, which will flush_hash_page.
-* BIG ISSUE here: if the kernel uses a page from userspace without
-* unmapping it first, it may see the speculated version.
-*/
-   if (local && cpu_has_feature(CPU_FTR_TM) &&
-   current->thread.regs &&
-   MSR_TM_ACTIVE(current->thread.regs->msr)) {
-   tm_enable();
-   tm_abort(TM_CAUSE_TLBI);
-   }
+   local_tm_abort(local);
 #endif
return;
 }
-- 
2.7.4



[PATCH] powerpc: Remove suspect CONFIG_PPC_BOOK3E #ifdefs in nohash/64/pgtable.h

2016-08-25 Thread Rui Teng
There are three #ifdef CONFIG_PPC_BOOK3E sections in nohash/64/pgtable.h.
There should be no possible configuration which uses nohash/64/pgtable.h
but does not also enable CONFIG_PPC_BOOK3E.

Suggested-by: Michael Ellerman <m...@ellerman.id.au>
Signed-off-by: Rui Teng <rui.t...@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/nohash/64/pgtable.h | 14 +-
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index d4d808c..6213fc1 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -26,15 +26,11 @@
 #else
 #define PMD_CACHE_INDEX PMD_INDEX_SIZE
 #endif
+
 /*
  * Define the address range of the kernel non-linear virtual area
  */
-
-#ifdef CONFIG_PPC_BOOK3E
 #define KERN_VIRT_START ASM_CONST(0x8000000000000000)
-#else
-#define KERN_VIRT_START ASM_CONST(0xD000000000000000)
-#endif
 #define KERN_VIRT_SIZE ASM_CONST(0x1000)
 
 /*
@@ -43,11 +39,7 @@
  * (we keep a quarter for the virtual memmap)
  */
 #define VMALLOC_START  KERN_VIRT_START
-#ifdef CONFIG_PPC_BOOK3E
 #define VMALLOC_SIZE   (KERN_VIRT_SIZE >> 2)
-#else
-#define VMALLOC_SIZE   (KERN_VIRT_SIZE >> 1)
-#endif
 #define VMALLOC_END (VMALLOC_START + VMALLOC_SIZE)
 
 /*
@@ -85,12 +77,8 @@
  * Defines the address of the vmemap area, in its own region on
  * hash table CPUs and after the vmalloc space on Book3E
  */
-#ifdef CONFIG_PPC_BOOK3E
 #define VMEMMAP_BASE   VMALLOC_END
 #define VMEMMAP_END KERN_IO_START
-#else
-#define VMEMMAP_BASE   (VMEMMAP_REGION_ID << REGION_SHIFT)
-#endif
 #define vmemmap ((struct page *)VMEMMAP_BASE)
 
 
-- 
2.7.4



[PATCH] [PATCH] [V3] powerpc/mm: Add validation for platform reserved memory ranges

2016-08-01 Thread Rui Teng
From: Anshuman Khandual <khand...@linux.vnet.ibm.com>

For a partition running on PHYP, there can be an adjunct partition
which shares the virtual address range with the operating system.
Virtual address ranges which can be used by the adjunct partition
are communicated through the virtual device node of the device tree
with a property known as "ibm,reserved-virtual-addresses". This patch
introduces a new function named 'validate_reserved_va_range' which
is called during initialization to validate that these reserved
virtual address ranges do not overlap with the address ranges used
by the kernel for all supported memory contexts. This helps prevent
the possibility of getting return codes similar to H_RESOURCE for
H_PROTECT hcalls for conflicting HPTE entries.

Signed-off-by: Anshuman Khandual <khand...@linux.vnet.ibm.com>
Signed-off-by: Rui Teng <rui.t...@linux.vnet.ibm.com>
---
- Tested on both POWER8 LE and BE platforms

Changes in V3:
- Use u32 and u64 to store the virtual address and use CPU endian mask.

Changes in V2:
- Added braces to the definition of LINUX_VA_BITS
- Adjusted tabs as spaces for the definition of PARTIAL_LINUX_VA_MASK

---
 arch/powerpc/mm/hash_utils_64.c | 68 +
 1 file changed, 68 insertions(+)

diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 2971ea1..6918198 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1723,3 +1723,71 @@ void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
/* Finally limit subsequent allocations */
memblock_set_current_limit(ppc64_rma_size);
 }
+
+/*
+ * PAPR says that each reserved virtual address range record
+ * contains three be32 elements, which total 12 bytes.
+ * First two be32 elements contain the abbreviated virtual
+ * address (high order 32 bits and low order 32 bits that
+ * generate the abbreviated virtual address of 64 bits which
+ * need to be concatenated with 24 bits of 0 at the end) and
+ * the third be32 element contains the size of the reserved
+ * virtual address range as number of consecutive 4K pages.
+ */
+struct reserved_va_record {
+   u32 high_addr;
+   u32 low_addr;
+   u32 nr_pages_4K;
+};
+
+/*
+ * Linux uses 65 bits (CONTEXT_BITS + ESID_BITS + SID_SHIFT)
+ * of virtual address. As reserved virtual address comes in
+ * as an abbreviated form (64 bits) from the device tree, we
+ * will use a partial address bit mask (65 >> 24) to match it
+ * for simplicity.
+ */
+#define RVA_LESS_BITS  24
+#define LINUX_VA_BITS  (CONTEXT_BITS + ESID_BITS + SID_SHIFT)
+#define PARTIAL_LINUX_VA_MASK  ((1ULL << (LINUX_VA_BITS - RVA_LESS_BITS)) - 1)
+
+static int __init validate_reserved_va_range(void)
+{
+   struct reserved_va_record rva;
+   struct device_node *np;
+   int records, i;
+   u64 vaddr;
+
+   np = of_find_node_by_name(NULL, "vdevice");
+   if (!np)
+   return -ENODEV;
+
+   records = of_property_count_elems_of_size(np,
+   "ibm,reserved-virtual-addresses",
+   sizeof(struct reserved_va_record));
+   if (records < 0)
+   return records;
+
+   for (i = 0; i < records; i++) {
+   of_property_read_u32_index(np,
+   "ibm,reserved-virtual-addresses",
+   3 * i, &rva.high_addr);
+   of_property_read_u32_index(np,
+   "ibm,reserved-virtual-addresses",
+   3 * i + 1, &rva.low_addr);
+   of_property_read_u32_index(np,
+   "ibm,reserved-virtual-addresses",
+   3 * i + 2, &rva.nr_pages_4K);
+
+   vaddr =  rva.high_addr;
+   vaddr =  (vaddr << 32) | rva.low_addr;
+   if (unlikely(!(vaddr & ~PARTIAL_LINUX_VA_MASK))) {
+   pr_err("RVA [0x%llx00 (0x%x in bytes)] overlapped\n",
+   vaddr, rva.nr_pages_4K * 4096);
+   BUG();
+   }
+   }
+   of_node_put(np);
+   return 0;
+}
+device_initcall(validate_reserved_va_range);
-- 
2.7.4


Re: [RFC 6/9] powerpc/hugetlb: Enable ARCH_WANT_GENERAL_HUGETLB for BOOK3S 64K

2016-03-21 Thread Rui Teng

On 3/9/16 8:10 PM, Anshuman Khandual wrote:

This enables ARCH_WANT_GENERAL_HUGETLB for BOOK3S 64K in Kconfig.
It also implements a new function 'pte_huge' which is required by
function 'huge_pte_alloc' from generic VM. Existing BOOK3S 64K
specific functions 'huge_pte_alloc' and 'huge_pte_offset' (which
are no longer required) are removed with this change.

Signed-off-by: Anshuman Khandual <khand...@linux.vnet.ibm.com>
---
  arch/powerpc/Kconfig  |  4 ++
  arch/powerpc/include/asm/book3s/64/hash-64k.h |  8 
  arch/powerpc/mm/hugetlbpage.c | 60 ---
  3 files changed, 12 insertions(+), 60 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 9faa18c..c6920bb 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -33,6 +33,10 @@ config HAVE_SETUP_PER_CPU_AREA
  config NEED_PER_CPU_EMBED_FIRST_CHUNK
def_bool PPC64

+config ARCH_WANT_GENERAL_HUGETLB
+   depends on PPC_64K_PAGES && PPC_BOOK3S_64
+   def_bool y
+
In the source code, the PowerPC-specific huge_pte_alloc() function will
not be defined if the config logic is "!PPC_4K_PAGES &&
PPC_BOOK3S_64", but in the Kconfig file the generic huge_pte_alloc()
function will only be defined if the logic is "PPC_64K_PAGES &&
PPC_BOOK3S_64".


It works if PPC_4K_PAGES and PPC_64K_PAGES are always mutually
exclusive, but I also found PPC_16K_PAGES and PPC_256K_PAGES in the
same Kconfig file. What happens if we configure PPC_16K_PAGES instead
of PPC_4K_PAGES?
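
A compile-time sketch of the mismatch being asked about (the CONFIG_*
macros here are hypothetical stand-ins, and the two conditions are the
ones described above rather than copied from the tree):

#include <stdio.h>

/* Pretend we build a hypothetical Book3S 64 kernel with 16K pages. */
#define CONFIG_PPC_BOOK3S_64 1
#define CONFIG_PPC_16K_PAGES 1

int main(void)
{
	int arch_helpers_removed = 0, generic_helpers_enabled = 0;

	/* Source-side condition as described: "not 4K pages" on Book3S 64. */
#if !defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_PPC_BOOK3S_64)
	arch_helpers_removed = 1;
#endif
	/* Kconfig-side condition: generic helpers only for exactly 64K pages. */
#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_BOOK3S_64)
	generic_helpers_enabled = 1;
#endif
	/* With 16K pages: removed=1, enabled=0, i.e. neither allocator is built. */
	printf("arch helpers removed: %d, generic helpers enabled: %d\n",
	       arch_helpers_removed, generic_helpers_enabled);
	return 0;
}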



  config NR_IRQS
int "Number of virtual interrupt numbers"
range 32 32768
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 849bbec..5e9b9b9 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -143,6 +143,14 @@ extern bool __rpte_sub_valid(real_pte_t rpte, unsigned long index);
   * Defined in such a way that we can optimize away code block at build time
   * if CONFIG_HUGETLB_PAGE=n.
   */
+static inline int pte_huge(pte_t pte)
+{
+   /*
+* leaf pte for huge page
+*/
+   return !!(pte_val(pte) & _PAGE_PTE);
+}
+
  static inline int pmd_huge(pmd_t pmd)
  {
/*
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index f834a74..f6e4712 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -59,42 +59,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
/* Only called for hugetlbfs pages, hence can ignore THP */
return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL);
  }
-#else
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
-{
-   pgd_t pgd, *pgdp;
-   pud_t pud, *pudp;
-   pmd_t pmd, *pmdp;
-
-   pgdp = mm->pgd + pgd_index(addr);
-   pgd  = READ_ONCE(*pgdp);
-
-   if (pgd_none(pgd))
-   return NULL;
-
-   if (pgd_huge(pgd))
-   return (pte_t *)pgdp;
-
-   pudp = pud_offset(&pgd, addr);
-   pud  = READ_ONCE(*pudp);
-   if (pud_none(pud))
-   return NULL;
-
-   if (pud_huge(pud))
-   return (pte_t *)pudp;

-   pmdp = pmd_offset(&pud, addr);
-   pmd  = READ_ONCE(*pmdp);
-   if (pmd_none(pmd))
-   return NULL;
-
-   if (pmd_huge(pmd))
-   return (pte_t *)pmdp;
-   return NULL;
-}
-#endif /* !defined(CONFIG_PPC_64K_PAGES) || !defined(CONFIG_PPC_BOOK3S_64) */
-
-#if !defined(CONFIG_PPC_64K_PAGES) || !defined(CONFIG_PPC_BOOK3S_64)
  static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
    unsigned long address, unsigned pdshift, unsigned pshift)
  {
@@ -211,31 +176,6 @@ hugepd_search:

return hugepte_offset(*hpdp, addr, pdshift);
  }
-
-#else
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
-{
-   pgd_t *pg;
-   pud_t *pu;
-   pmd_t *pm;
-   unsigned pshift = __ffs(sz);
-
-   addr &= ~(sz-1);
-   pg = pgd_offset(mm, addr);
-
-   if (pshift == PGDIR_SHIFT)  /* 16GB Huge Page */
-   return (pte_t *)pg;
-
-   pu = pud_alloc(mm, pg, addr);   /* NA, skipped */
-   if (pshift == PUD_SHIFT)
-   return (pte_t *)pu;
-
-   pm = pmd_alloc(mm, pu, addr);   /* 16MB Huge Page */
-   if (pshift == PMD_SHIFT)
-   return (pte_t *)pm;
-
-   return NULL;
-}
  #endif
  #else


Why does this code need to be added in patch 4/9 but removed in 6/9?
