Re: [PATCH] watchdog: mpc8xxx: use the core worker function

2017-11-07 Thread Christophe LEROY


On 07/11/2017 at 23:56, Guenter Roeck wrote:

On Tue, Nov 07, 2017 at 05:23:56PM +0100, Christophe Leroy wrote:

The watchdog core includes a worker function which pings the
watchdog until the user app starts pinging it, and which also
pings it if the HW requires more frequent pings.
Use that function instead of the dedicated timer.
In the meantime, we can allow the user to change the timeout.

Then change the timeout module parameter to use seconds and
use the watchdog_init_timeout() core function.

On some HW (eg: the 8xx), SWCRR contains bits unrelated to the
watchdog which have to be preserved upon write.

This driver has nothing preventing the use of the magic close, so
enable it.
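
For reference, the magic close convention is part of the standard Linux
watchdog ABI: writing the character 'V' to /dev/watchdog before closing it
tells the driver the stop is intentional, so the close is not treated as an
application crash. A minimal user-space sketch (not part of this patch):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/watchdog", O_WRONLY);

	if (fd < 0)
		return 1;
	write(fd, "\0", 1);	/* any write pings the watchdog */
	write(fd, "V", 1);	/* magic close: the stop is intentional */
	close(fd);		/* driver may now stop the HW timer */
	return 0;
}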

Signed-off-by: Christophe Leroy 
---
  drivers/watchdog/mpc8xxx_wdt.c | 86 ++++++++++++++++++++----------------------
  1 file changed, 40 insertions(+), 46 deletions(-)

diff --git a/drivers/watchdog/mpc8xxx_wdt.c b/drivers/watchdog/mpc8xxx_wdt.c
index 366e5c7e650b..c2ac4b6b1253 100644
--- a/drivers/watchdog/mpc8xxx_wdt.c
+++ b/drivers/watchdog/mpc8xxx_wdt.c
@@ -22,7 +22,6 @@
  #include 
  #include 
  #include 
-#include <linux/timer.h>
  #include 
  #include 
  #include 
@@ -31,10 +30,13 @@
  #include 
  #include 
  
+#define WATCHDOG_TIMEOUT 10
+
  struct mpc8xxx_wdt {
__be32 res0;
__be32 swcrr; /* System watchdog control register */
  #define SWCRR_SWTC 0xffff0000 /* Software Watchdog Time Count. */
+#define SWCRR_SWF  0x00000008 /* Software Watchdog Freeze (mpc8xx). */
  #define SWCRR_SWEN 0x00000004 /* Watchdog Enable bit. */
  #define SWCRR_SWRI 0x00000002 /* Software Watchdog Reset/Interrupt Select bit.*/
  #define SWCRR_SWPR 0x00000001 /* Software Watchdog Counter Prescale bit. */
@@ -52,14 +54,15 @@ struct mpc8xxx_wdt_type {
  struct mpc8xxx_wdt_ddata {
struct mpc8xxx_wdt __iomem *base;
struct watchdog_device wdd;
-   struct timer_list timer;
spinlock_t lock;
+   int prescaler;
  };
  
-static u16 timeout = 0xffff;
+static u16 timeout;
   module_param(timeout, ushort, 0);
   MODULE_PARM_DESC(timeout,
-	"Watchdog timeout in ticks. (0<timeout<65536, default=65535)");
+	"Watchdog timeout in seconds. (1<timeout<65535, default="
+	__MODULE_STRING(WATCHDOG_TIMEOUT) ")");

[...]

 	spin_unlock(&ddata->lock);
   }
  
-static void mpc8xxx_wdt_timer_ping(unsigned long arg)
-{
-	struct mpc8xxx_wdt_ddata *ddata = (void *)arg;
-
-	mpc8xxx_wdt_keepalive(ddata);
-	/* We're pinging it twice faster than needed, just to be sure. */
-	mod_timer(&ddata->timer, jiffies + HZ * ddata->wdd.timeout / 2);
-}
-
-
  static int mpc8xxx_wdt_start(struct watchdog_device *w)
  {
struct mpc8xxx_wdt_ddata *ddata =
container_of(w, struct mpc8xxx_wdt_ddata, wdd);
-
-   u32 tmp = SWCRR_SWEN | SWCRR_SWPR;
+   u32 tmp;
+   u32 timeout;
  
 	/* Good, fire up the show */
+	tmp = in_be32(&ddata->base->swcrr);
+	tmp &= ~(SWCRR_SWTC | SWCRR_SWF | SWCRR_SWEN | SWCRR_SWRI | SWCRR_SWPR);
+	tmp |= SWCRR_SWEN | SWCRR_SWPR;
+
 	if (reset)
 		tmp |= SWCRR_SWRI;
 
-	tmp |= timeout << 16;
+	timeout = ddata->wdd.timeout * fsl_get_sys_freq() / ddata->prescaler;
+	tmp |= min(timeout, 0xffffU) << 16;
 
 	out_be32(&ddata->base->swcrr, tmp);
 
-	del_timer_sync(&ddata->timer);
+	tmp = in_be32(&ddata->base->swcrr);
+	if (!(tmp & SWCRR_SWEN))
+		return -EOPNOTSUPP;
+
+	ddata->wdd.max_hw_heartbeat_ms = ((tmp >> 16) * ddata->prescaler) /
+					 (fsl_get_sys_freq() / 1000);
+	ddata->wdd.min_timeout = ddata->wdd.max_hw_heartbeat_ms / 1000;


That is odd. Why set the minimum timeout period to the maximum hardware
timeout, rounded down to a full second?


Once the HW watchdog timer has been configured with a given timeout 
value, it cannot be changed anymore. This means that any timeout higher 
than the one configured in HW can be managed via the watchdog core 
worker function, but no timeout below it can be used.
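
As a simplified sketch (not the actual watchdog core code), the rule the
core worker implements once max_hw_heartbeat_ms is known looks roughly
like this:

static bool worker_ping_needed(unsigned int user_timeout_s,
			       unsigned int max_hw_heartbeat_ms,
			       bool userspace_active)
{
	/* HW timer running but no user app yet: worker keeps it alive. */
	if (!userspace_active)
		return true;

	/*
	 * The user asked for a timeout longer than the HW can count: the
	 * worker feeds the HW and the core enforces the user timeout in
	 * software. A timeout shorter than the HW period cannot work.
	 */
	return (unsigned long long)user_timeout_s * 1000 > max_hw_heartbeat_ms;
}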




Note that it is wrong to set the values for min_timeout and
max_hw_heartbeat_ms in the _start function; they should be set
in the probe function. The start function should also not update
ddata->wdd.timeout; that should happen in set_timeout if supported,
or in probe.


The mpc8xxx HW watchdog can be (re)configured only once after startup; 
after that it cannot be changed anymore. Therefore we have the following 
cases to handle:

1/ The HW watchdog is started at reset and not (re)configured yet.
2/ The HW watchdog is not started at reset and not configured yet.
3/ The HW watchdog is configured by the boot loader.

Case 1/ The kernel has to poll it from the beginning in accordance with 
the default reset timeout. It can however reconfigure it once.

Case 2/ The kernel has nothing to do until a user app wants to

Re: POWER: Unexpected fault when writing to brk-allocated memory

2017-11-07 Thread Florian Weimer

On 11/08/2017 07:08 AM, Michael Ellerman wrote:

"Kirill A. Shutemov"  writes:


On Tue, Nov 07, 2017 at 02:05:42PM +0100, Florian Weimer wrote:

On 11/07/2017 12:44 PM, Kirill A. Shutemov wrote:

On Tue, Nov 07, 2017 at 12:26:12PM +0100, Florian Weimer wrote:

On 11/07/2017 12:15 PM, Kirill A. Shutemov wrote:


First of all, using addr and MAP_FIXED to develop our heuristic can
never really give unchanged ABI. It's an in-band signal. brk() is a
good example that steadily keeps incrementing address, so depending
on malloc usage and address space randomization, you will get a brk()
that ends exactly at 128T, then the next one will be >
DEFAULT_MAP_WINDOW, and it will switch you to 56 bit address space.


No, it won't. You will hit stack first.


That's not actually true on POWER in some cases.  See the process maps I
posted here:




Hm? I see that in all three cases the [stack] is the last mapping.
Do I miss something?


Hah, I had not noticed.  Occasionally, the order of heap and stack is
reversed.  This happens in approximately 15% of the runs.


Heh. I guess ASLR on Power is too fancy :)


Fancy implies we're doing it on purpose :P


That's strange layout. It doesn't give that much (relatively speaking)
virtual address space for both stack and heap to grow.


I'm pretty sure it only happens when you're running an ELF interpreter
directly, because of Kees patch which changed the logic to load ELF
interpreters in the mmap region, vs PIE binaries which go to
ELF_ET_DYN_BASE. (eab09532d400 ("binfmt_elf: use ELF_ET_DYN_BASE only
for PIE"))


From that commit:

+   * There are effectively two types of ET_DYN
+   * binaries: programs (i.e. PIE: ET_DYN with INTERP)
+   * and loaders (ET_DYN without INTERP, since they
+   * _are_ the ELF interpreter). The loaders must

Note that the comment is a bit misleading: statically linked PIE 
binaries are ET_DYN without INTERP, too.


So any oddity which is observable today only with an explicit ld.so 
invocation will gain more relevance once we get static PIE support 
in user space, because it will then affect regular applications, too. 
(Well, statically linked ones.)  In this sense, process layouts which 
cause premature brk failure or insufficient stack allocations are real bugs.


Thanks,
Florian


Re: POWER: Unexpected fault when writing to brk-allocated memory

2017-11-07 Thread Michael Ellerman
"Kirill A. Shutemov"  writes:

> On Tue, Nov 07, 2017 at 02:05:42PM +0100, Florian Weimer wrote:
>> On 11/07/2017 12:44 PM, Kirill A. Shutemov wrote:
>> > On Tue, Nov 07, 2017 at 12:26:12PM +0100, Florian Weimer wrote:
>> > > On 11/07/2017 12:15 PM, Kirill A. Shutemov wrote:
>> > > 
>> > > > > First of all, using addr and MAP_FIXED to develop our heuristic can
>> > > > > never really give unchanged ABI. It's an in-band signal. brk() is a
>> > > > > good example that steadily keeps incrementing address, so depending
>> > > > > on malloc usage and address space randomization, you will get a brk()
>> > > > > that ends exactly at 128T, then the next one will be >
>> > > > > DEFAULT_MAP_WINDOW, and it will switch you to 56 bit address space.
>> > > > 
>> > > > No, it won't. You will hit stack first.
>> > > 
>> > > That's not actually true on POWER in some cases.  See the process maps I
>> > > posted here:
>> > > 
>> > >
>> > 
>> > Hm? I see that in all three cases the [stack] is the last mapping.
>> > Do I miss something?
>> 
>> Hah, I had not noticed.  Occasionally, the order of heap and stack is
>> reversed.  This happens in approximately 15% of the runs.
>
> Heh. I guess ASLR on Power is too fancy :)

Fancy implies we're doing it on purpose :P

> That's strange layout. It doesn't give that much (relatively speaking)
> virtual address space for both stack and heap to grow.

I'm pretty sure it only happens when you're running an ELF interpreter
directly, because of Kees patch which changed the logic to load ELF
interpreters in the mmap region, vs PIE binaries which go to
ELF_ET_DYN_BASE. (eab09532d400 ("binfmt_elf: use ELF_ET_DYN_BASE only
for PIE"))

It only happens with ASLR enabled. Presumably it's because our brk_rnd()
is overly aggressive in this case: it randomises up to 1GB, and the heap
jumps over the stack.

cheers


Re: POWER: Unexpected fault when writing to brk-allocated memory

2017-11-07 Thread Michael Ellerman
"Aneesh Kumar K.V"  writes:

>> 
>> If it is decided to keep these kind of heuristics, can we get just a
>> small but reasonably precise description of each change to the
>> interface and ways for using the new functionality, such that would be
>> suitable for the man page? I couldn't fix powerpc because nothing
>> matches and even Aneesh and you differ on some details (MAP_FIXED
>> behaviour).
>
>
> I would consider MAP_FIXED as my mistake. We never discussed this 
> explicitly and I kind of assumed it to behave the same way. ie, we 
> search in lower address space (128TB) if the hint addr is below 128TB.
>
> IIUC we agree on the below.
>
> 1) MAP_FIXED allows the addr to be used, even if the hint addr is below
> 128TB but hint_addr + len is > 128TB.

So:
  mmap(0x7000, 0x2000, ..., MAP_FIXED ...) = 0x7000

> 2) For everything else we search in < 128TB space if hint addr is below 
> 128TB

  mmap((x < 128T), 0x1000, ...) = (y < 128T)
  ...
  mmap(0x7000, 0x1000, ...) = 0x7000
  mmap(0x8000, 0x1000, ...) = 0x8000
  ...
  mmap((x >= 128T), 0x1000, ...) = (y >= 128T)

> 3) We don't switch to large address space if hint_addr + len > 128TB. 
> The decision to switch to large address space is primarily based on hint 
> addr

But does the mmap succeed in that case or not?

ie:  mmap(0x7000, 0x2000, ...) = ?

cheers
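
As a minimal sketch of the rule points 1) and 2) describe (search_below()
and search_full() are illustrative stand-ins for the real VMA search, not
kernel functions):

#define DEFAULT_MAP_WINDOW	(1UL << 47)	/* 128TB */

unsigned long pick_mmap_area(unsigned long hint, unsigned long len, int fixed)
{
	if (fixed)
		return hint;	/* 1) honoured even if hint + len crosses 128TB */

	if (hint < DEFAULT_MAP_WINDOW)
		return search_below(DEFAULT_MAP_WINDOW, len);	/* 2) stay < 128TB */

	return search_full(len);	/* hint >= 128TB: use the large address space */
}

Point 3) is the open question above: the switch to the large address space
is driven by the hint address alone, not by hint + len.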


[PATCH v3 18/18] powerpc/vas: Add support for user receive window

2017-11-07 Thread Sukadev Bhattiprolu
Add support for user space receive windows (for the Fast thread-wakeup
coprocessor type).
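
A rough sketch of the resulting pairing flow across patches 16-18
(declarations abbreviated, error handling omitted):

	struct vas_window *rxwin, *txwin;
	struct vas_rx_win_attr rxattr;
	struct vas_tx_win_attr txattr;
	u32 pswid;

	/* Receive side: open an FTW receive window, publish its handle. */
	rxwin = vas_rx_win_open(vasid, VAS_COP_TYPE_FTW, &rxattr);
	pswid = vas_win_id(rxwin);		/* handle from patch 17 */

	/* Send side: pair with that window via its pswid. Passing -1 as
	 * the vasid makes this patch take the VAS id from the pswid. */
	txattr.pswid = pswid;
	txwin = vas_tx_win_open(-1, VAS_COP_TYPE_FTW, &txattr);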

Signed-off-by: Sukadev Bhattiprolu 
---
Changelog[v3]
- [Nick Piggin] Drop CP_ABORT since set_thread_uses_vas() does
  that now (in earlier patch) and add a check for return value.
---
 arch/powerpc/platforms/powernv/vas-window.c | 56 +
 1 file changed, 49 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 8275492..2b3eb01 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -16,7 +16,8 @@
 #include 
 #include 
 #include 
-
+#include 
+#include 
 #include "vas.h"
 #include "copy-paste.h"
 
@@ -598,6 +599,32 @@ static void put_rx_win(struct vas_window *rxwin)
 }
 
 /*
+ * Find the user space receive window given the @pswid.
+ *  - We must have a valid vasid and it must belong to this instance.
+ *(so both send and receive windows are on the same VAS instance)
+ *  - The window must refer to an OPEN, FTW, RECEIVE window.
+ *
+ * NOTE: We access ->windows[] table and assume that vinst->mutex is held.
+ */
+static struct vas_window *get_user_rxwin(struct vas_instance *vinst, u32 pswid)
+{
+   int vasid, winid;
+   struct vas_window *rxwin;
+
+   decode_pswid(pswid, &vasid, &winid);
+
+   if (vinst->vas_id != vasid)
+   return ERR_PTR(-EINVAL);
+
+   rxwin = vinst->windows[winid];
+
+   if (!rxwin || rxwin->tx_win || rxwin->cop != VAS_COP_TYPE_FTW)
+   return ERR_PTR(-EINVAL);
+
+   return rxwin;
+}
+
+/*
  * Get the VAS receive window associated with NX engine identified
  * by @cop and if applicable, @pswid.
  *
@@ -610,10 +637,10 @@ static struct vas_window *get_vinst_rxwin(struct 
vas_instance *vinst,
 
 	mutex_lock(&vinst->mutex);
 
-   if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI)
-   rxwin = vinst->rxwin[cop] ?: ERR_PTR(-EINVAL);
+   if (cop == VAS_COP_TYPE_FTW)
+   rxwin = get_user_rxwin(vinst, pswid);
else
-   rxwin = ERR_PTR(-EINVAL);
+   rxwin = vinst->rxwin[cop] ?: ERR_PTR(-EINVAL);
 
if (!IS_ERR(rxwin))
 		atomic_inc(&rxwin->num_txwins);
@@ -937,10 +964,9 @@ static void init_winctx_for_txwin(struct vas_window *txwin,
winctx->tx_word_mode = txattr->tx_win_ord_mode;
winctx->rsvd_txbuf_count = txattr->rsvd_txbuf_count;
 
-   if (winctx->nx_win) {
+   winctx->intr_disable = true;
+   if (winctx->nx_win)
winctx->data_stamp = true;
-   winctx->intr_disable = true;
-   }
 
winctx->lpid = txattr->lpid;
winctx->pidr = txattr->pidr;
@@ -985,6 +1011,14 @@ struct vas_window *vas_tx_win_open(int vasid, enum 
vas_cop_type cop,
if (!tx_win_args_valid(cop, attr))
return ERR_PTR(-EINVAL);
 
+   /*
+* If caller did not specify a vasid but specified the PSWID of a
+* receive window (applicable only to FTW windows), use the vasid
+* from that receive window.
+*/
+   if (vasid == -1 && attr->pswid)
+   decode_pswid(attr->pswid, &vasid, NULL);
+
vinst = find_vas_instance(vasid);
if (!vinst) {
pr_devel("vasid %d not found!\n", vasid);
@@ -1031,6 +1065,14 @@ struct vas_window *vas_tx_win_open(int vasid, enum 
vas_cop_type cop,
}
}
 
+   /*
+* Now that we have a send window, ensure context switch issues
+* CP_ABORT for this thread.
+*/
+   rc = -EINVAL;
+   if (set_thread_uses_vas() < 0)
+   goto free_window;
+
set_vinst_win(vinst, txwin);
 
return txwin;
-- 
2.7.4



[PATCH v3 16/18] powerpc/vas: Define vas_win_paste_addr()

2017-11-07 Thread Sukadev Bhattiprolu
Define an interface that the NX drivers can use to find the physical
paste address of a send window. This interface is expected to be used
with the mmap() operation of the NX driver's device, i.e. the user space
process can use the driver's mmap() operation to map the send window's paste
address into its address space and then use copy and paste instructions
to submit CRBs to the NX engine.

Note that kernel drivers will use vas_paste_crb() directly and don't need
this interface.
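
A minimal sketch of what such a driver mmap() could look like (the function
name and the use of file->private_data are illustrative, not part of this
series):

static int nx_ftw_mmap(struct file *fp, struct vm_area_struct *vma)
{
	struct vas_window *txwin = fp->private_data; /* assumed: set at open() */
	u64 paste_addr = vas_win_paste_addr(txwin);

	/* Map the paste address into userspace as uncached I/O memory. */
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	return remap_pfn_range(vma, vma->vm_start, paste_addr >> PAGE_SHIFT,
			       vma->vm_end - vma->vm_start,
			       vma->vm_page_prot);
}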

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/include/asm/vas.h  |  7 +++
 arch/powerpc/platforms/powernv/vas-window.c | 10 ++
 2 files changed, 17 insertions(+)

diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
index 044748f..f98ade8 100644
--- a/arch/powerpc/include/asm/vas.h
+++ b/arch/powerpc/include/asm/vas.h
@@ -10,6 +10,8 @@
 #ifndef _ASM_POWERPC_VAS_H
 #define _ASM_POWERPC_VAS_H
 
+struct vas_window;
+
 /*
  * Min and max FIFO sizes are based on Version 1.05 Section 3.1.4.25
  * (Local FIFO Size Register) of the VAS workbook.
@@ -165,4 +167,9 @@ int vas_copy_crb(void *crb, int offset);
  */
 int vas_paste_crb(struct vas_window *win, int offset, bool re);
 
+/*
+ * Return the power bus paste address associated with @win so the caller
+ * can map that address into their address space.
+ */
+extern u64 vas_win_paste_addr(struct vas_window *win);
 #endif /* __ASM_POWERPC_VAS_H */
diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index c030d4c..d7d0653 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -40,6 +40,16 @@ static void compute_paste_address(struct vas_window *window, 
u64 *addr, int *len
pr_debug("Txwin #%d: Paste addr 0x%llx\n", winid, *addr);
 }
 
+u64 vas_win_paste_addr(struct vas_window *win)
+{
+   u64 addr;
+
+   compute_paste_address(win, &addr, NULL);
+
+   return addr;
+}
+EXPORT_SYMBOL(vas_win_paste_addr);
+
 static inline void get_hvwc_mmio_bar(struct vas_window *window,
u64 *start, int *len)
 {
-- 
2.7.4



[PATCH v3 17/18] powerpc/vas: Define vas_win_id()

2017-11-07 Thread Sukadev Bhattiprolu
Define an interface to return a system-wide unique id for a given VAS
window.

The vas_win_id() interface will be used in a follow-on patch to generate a
unique handle for a user space receive window. Applications can use this
handle to pair send and receive windows for fast thread-wakeup.

The hardware refers to this system-wide unique id as a Partition Send
Window ID which is expected to be used during fault handling. Hence the
"pswid" in the function names.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/include/asm/vas.h  |  5 +
 arch/powerpc/platforms/powernv/vas-window.c |  9 +
 arch/powerpc/platforms/powernv/vas.h| 28 
 3 files changed, 42 insertions(+)

diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
index f98ade8..7714562 100644
--- a/arch/powerpc/include/asm/vas.h
+++ b/arch/powerpc/include/asm/vas.h
@@ -168,6 +168,11 @@ int vas_copy_crb(void *crb, int offset);
 int vas_paste_crb(struct vas_window *win, int offset, bool re);
 
 /*
+ * Return a system-wide unique id for the VAS window @win.
+ */
+extern u32 vas_win_id(struct vas_window *win);
+
+/*
  * Return the power bus paste address associated with @win so the caller
  * can map that address into their address space.
  */
diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index d7d0653..8275492 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -1235,3 +1235,12 @@ int vas_win_close(struct vas_window *window)
return 0;
 }
 EXPORT_SYMBOL_GPL(vas_win_close);
+
+/*
+ * Return a system-wide unique window id for the window @win.
+ */
+u32 vas_win_id(struct vas_window *win)
+{
+   return encode_pswid(win->vinst->vas_id, win->winid);
+}
+EXPORT_SYMBOL_GPL(vas_win_id);
diff --git a/arch/powerpc/platforms/powernv/vas.h 
b/arch/powerpc/platforms/powernv/vas.h
index 756cbc5..ae0100f 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -447,4 +447,32 @@ static inline u64 read_hvwc_reg(struct vas_window *win,
return in_be64(win->hvwc_map+reg);
 }
 
+/*
+ * Encode/decode the Partition Send Window ID (PSWID) for a window in
+ * a way that we can uniquely identify any window in the system. i.e.
+ * we should be able to locate the 'struct vas_window' given the PSWID.
+ *
+ * Bits    Usage
+ * 0:7     VAS id (8 bits)
+ * 8:15    Unused, 0 (8 bits)
+ * 16:31   Window id (16 bits)
+ */
+static inline u32 encode_pswid(int vasid, int winid)
+{
+   u32 pswid = 0;
+
+   pswid |= vasid << (31 - 7);
+   pswid |= winid;
+
+   return pswid;
+}
+
+static inline void decode_pswid(u32 pswid, int *vasid, int *winid)
+{
+   if (vasid)
+   *vasid = pswid >> (31 - 7) & 0xFF;
+
+   if (winid)
+   *winid = pswid & 0xFFFF;
+}
 #endif /* _VAS_H */
-- 
2.7.4



[PATCH v3 15/18] powerpc: Emulate paste instruction

2017-11-07 Thread Sukadev Bhattiprolu
From: Michael Neuling 

On POWER9 DD2.1 and below there are issues when the paste instruction
generates an error. If an error occurs when thread reconfiguration
happens (ie another thread in the core goes into/out of powersave) the
core may hang.

To avoid this a special sequence is required which stops thread
configuration so that the paste can be safely executed.

This patch assumes a paste executed in userspace is trapped into the
illegal instruction exception at 0xe40.

Here we re-execute the paste instruction but with the required
sequence to ensure thread reconfiguration doesn't occur.

Cc: Aneesh Kumar K.V 
Signed-off-by: Michael Neuling 
Signed-off-by: Sukadev Bhattiprolu 
---
Changlog[v3]:
- [Michael Ellerman] We don't need to disable/enable pagefaults
  when emulating paste;
- [Michael Ellerman, Aneesh Kumar] Fix retval from emulate_paste()

Edit by Sukadev: Use PPC_PASTE() rather than the paste instruction since
in older versions the instruction required a third parameter.
---
 arch/powerpc/include/asm/emulated_ops.h |  1 +
 arch/powerpc/include/asm/ppc-opcode.h   |  1 +
 arch/powerpc/include/asm/reg.h  |  2 +
 arch/powerpc/kernel/traps.c | 67 +
 4 files changed, 71 insertions(+)

diff --git a/arch/powerpc/include/asm/emulated_ops.h 
b/arch/powerpc/include/asm/emulated_ops.h
index f00e10e..9247af9 100644
--- a/arch/powerpc/include/asm/emulated_ops.h
+++ b/arch/powerpc/include/asm/emulated_ops.h
@@ -55,6 +55,7 @@ extern struct ppc_emulated {
struct ppc_emulated_entry mfdscr;
struct ppc_emulated_entry mtdscr;
struct ppc_emulated_entry lq_stq;
+   struct ppc_emulated_entry paste;
 #endif
 } ppc_emulated;
 
diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
b/arch/powerpc/include/asm/ppc-opcode.h
index ce0930d..a55d2ef 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -229,6 +229,7 @@
 #define PPC_INST_MTTMR 0x7c0003dc
 #define PPC_INST_NOP   0x6000
 #define PPC_INST_PASTE 0x7c20070d
+#define PPC_INST_PASTE_MASK0xfc2007ff
 #define PPC_INST_POPCNTB   0x7cf4
 #define PPC_INST_POPCNTB_MASK  0xfc0007fe
 #define PPC_INST_POPCNTD   0x7c0003f4
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index b779f3c..3495ecf 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -469,6 +469,8 @@
 #define SPRN_DBAT7U0x23E   /* Data BAT 7 Upper Register */
 #define SPRN_PPR   0x380   /* SMT Thread status Register */
 #define SPRN_TSCR  0x399   /* Thread Switch Control Register */
+#define SPRN_TRIG1 0x371   /* WAT Trigger 1 */
+#define SPRN_TRIG2 0x372   /* WAT Trigger 2 */
 
 #define SPRN_DEC   0x016   /* Decrement Register */
 #define SPRN_DER   0x095   /* Debug Enable Register */
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 13c9dcd..c2cce25 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -956,6 +956,68 @@ static inline bool tm_abort_check(struct pt_regs *regs, 
int reason)
 }
 #endif
 
+static DEFINE_SPINLOCK(paste_emulation_lock);
+
+static inline int paste(void *i)
+{
+   int cr;
+   long retval = 0;
+
+   /* Need per core lock to ensure trig1/2 writes don't race */
+   spin_lock(&paste_emulation_lock);
+   mtspr(SPRN_TRIG1, 0); /* data doesn't matter */
+   mtspr(SPRN_TRIG1, 0); /* HW says do this twice */
+   asm volatile(
+   "1: " PPC_PASTE(0, %2) "\n"
+   "2: mfcr %1\n"
+   ".section .fixup,\"ax\"\n"
+   "3: li %0,%3\n"
+   "   li %2,0\n"
+   "   b 2b\n"
+   ".previous\n"
+   EX_TABLE(1b, 3b)
+   : "=r" (retval), "=r" (cr)
+   : "b" (i), "i" (-EFAULT), "0" (retval));
+   mtspr(SPRN_TRIG2, 0);
+   spin_unlock(&paste_emulation_lock);
+
+   return retval ?: cr;
+}
+
+static int emulate_paste(struct pt_regs *regs, u32 instword)
+{
+   const void __user *addr;
+   unsigned long ea;
+   u8 ra, rb;
+   int rc;
+
+   if (!cpu_has_feature(CPU_FTR_ARCH_300))
+   return -EINVAL;
+
+   ra = (instword >> 16) & 0x1f;
+   rb = (instword >> 11) & 0x1f;
+
+   ea = regs->gpr[rb] + (ra ? regs->gpr[ra] : 0ul);
+   if (is_32bit_task())
+   ea &= 0xul;
+   addr = (__force const void __user *)ea;
+
+   if (!access_ok(VERIFY_WRITE, addr, 128)) // cacheline size == 128
+   return -EFAULT;
+
+   hard_irq_disable(); /* FIXME: could we just soft disable ?? */
+
+   PPC_WARN_EMULATED(paste, regs);
+   rc = paste((void *)addr);
+
+   /* set cr0 to 0 to indicate a 

[PATCH v3 14/18] powerpc: Define set_thread_uses_vas()

2017-11-07 Thread Sukadev Bhattiprolu
A CP_ABORT instruction is required in processes that have mapped a VAS
"paste address" with the intention of using COPY/PASTE instructions.
But since CP_ABORT is expensive, we want to restrict it to only processes
that use/intend to use COPY/PASTE.

Define an interface, set_thread_uses_vas(), that VAS can use to indicate
that the current process opened a send window. During context switch,
issue CP_ABORT only for processes that have the flag set.

Thanks for input from Nick Piggin, Michael Ellerman.

Cc: Nicholas Piggin 
Signed-off-by: Sukadev Bhattiprolu 
---
Changelog[v3]:
- [Nick Piggin] Rename interface to set_thread_uses_vas(), tweak
  comment and move the CP_ABORT from callers into the interface.
---
 arch/powerpc/include/asm/processor.h |  2 ++
 arch/powerpc/include/asm/switch_to.h |  2 ++
 arch/powerpc/kernel/process.c| 41 +++-
 3 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index 58cc212..bdab3b74 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -341,7 +341,9 @@ struct thread_struct {
unsigned long   sier;
unsigned long   mmcr2;
unsignedmmcr0;
+
unsignedused_ebb;
+   unsigned intused_vas;
 #endif
 };
 
diff --git a/arch/powerpc/include/asm/switch_to.h 
b/arch/powerpc/include/asm/switch_to.h
index ad2d762..c3ca42c 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -92,6 +92,8 @@ static inline void clear_task_ebb(struct task_struct *t)
 #endif
 }
 
+extern int set_thread_uses_vas(void);
+
 extern int set_thread_tidr(struct task_struct *t);
 extern void clear_thread_tidr(struct task_struct *t);
 
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index d861fcd..395ca80 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1234,17 +1234,17 @@ struct task_struct *__switch_to(struct task_struct 
*prev,
 * The copy-paste buffer can only store into foreign real
 * addresses, so unprivileged processes can not see the
 * data or use it in any way unless they have foreign real
-* mappings. We don't have a VAS driver that allocates those
-* yet, so no cpabort is required.
+* mappings. If the new process has the foreign real address
+* mappings, we must issue a cp_abort to clear any state and
+* prevent snooping, corruption or a covert channel.
+*
+* DD1 allows paste into normal system memory so we do an
+* unpaired copy, rather than cp_abort, to clear the buffer,
+* since cp_abort is quite expensive.
 */
-   if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
-   /*
-* DD1 allows paste into normal system memory, so we
-* do an unpaired copy here to clear the buffer and
-* prevent a covert channel being set up.
-*
-* cpabort is not used because it is quite expensive.
-*/
+   if (new_thread->used_vas) {
+   asm volatile(PPC_CP_ABORT);
+   } else if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
asm volatile(PPC_COPY(%0, %1)
: : "r"(dummy_copy_buffer), "r"(0));
}
@@ -1445,6 +1445,27 @@ void flush_thread(void)
 #endif /* CONFIG_HAVE_HW_BREAKPOINT */
 }
 
+int set_thread_uses_vas(void)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+   if (!cpu_has_feature(CPU_FTR_ARCH_300))
+   return -EINVAL;
+
+   current->thread.used_vas = 1;
+
+   /*
+* Even a process that has no foreign real address mapping can use
+* an unpaired COPY instruction (to no real effect). Issue CP_ABORT
+* to clear any pending COPY and prevent a covert channel.
+*
+* __switch_to() will issue CP_ABORT on future context switches.
+*/
+   asm volatile(PPC_CP_ABORT);
+
+#endif /* CONFIG_PPC_BOOK3S_64 */
+   return 0;
+}
+
 #ifdef CONFIG_PPC64
 static DEFINE_SPINLOCK(vas_thread_id_lock);
 static DEFINE_IDA(vas_thread_ida);
-- 
2.7.4



[PATCH v3 13/18] powerpc: Add support for setting SPRN_TIDR

2017-11-07 Thread Sukadev Bhattiprolu
We need the SPRN_TIDR to be set for use with fast thread-wakeup (core-
to-core wakeup) and also with CAPI.

Each thread in a process needs to have a unique id within the process.
But as explained below, for now, we assign globally unique thread ids
to all threads in the system.
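
A minimal usage sketch (the caller and error handling are illustrative): a
driver that wants the current thread to be targetable by an ASB_Notify
assigns it a TIDR with the interface added here:

	int rc = set_thread_tidr(current);

	if (rc)
		return rc;	/* no free TIDR, or pre-ISA 3.0 CPU */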

Signed-off-by: Sukadev Bhattiprolu 
Signed-off-by: Philippe Bergheaud 
Signed-off-by: Christophe Lombard 
---
Changelog[v3]
- Merge changes with and address comments to Christophe's patch.
  (i.e drop CONFIG_PPC_VAS; use CONFIG_PPC64; check CPU_ARCH_300
  before setting TIDR). Defer following to separate patches:
- emulation parts of Christophe's patch,
- setting TIDR for tasks other than 'current'
- setting feature bit in AT_HWCAP2

Changelog[v2]
- Michael Ellerman: Use an interface to assign TIDR so it is
assigned to only threads that need it; move assignment to
restore_sprs(). Drop lint from rebase;
---
 arch/powerpc/include/asm/processor.h |   1 +
 arch/powerpc/include/asm/switch_to.h |   3 +
 arch/powerpc/kernel/process.c| 122 +++
 3 files changed, 126 insertions(+)

diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index fab7ff8..58cc212 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -329,6 +329,7 @@ struct thread_struct {
 */
int dscr_inherit;
unsigned long   ppr;/* used to save/restore SMT priority */
+   unsigned long   tidr;
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
unsigned long   tar;
diff --git a/arch/powerpc/include/asm/switch_to.h 
b/arch/powerpc/include/asm/switch_to.h
index bf820f5..ad2d762 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -92,4 +92,7 @@ static inline void clear_task_ebb(struct task_struct *t)
 #endif
 }
 
+extern int set_thread_tidr(struct task_struct *t);
+extern void clear_thread_tidr(struct task_struct *t);
+
 #endif /* _ASM_POWERPC_SWITCH_TO_H */
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 37ed60b..d861fcd 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1120,6 +1120,13 @@ static inline void restore_sprs(struct thread_struct 
*old_thread,
mtspr(SPRN_TAR, new_thread->tar);
}
 #endif
+#ifdef CONFIG_PPC64
+   if (old_thread->tidr != new_thread->tidr) {
+   /* TIDR should be non-zero only with ISA3.0. */
+   WARN_ON_ONCE(!cpu_has_feature(CPU_FTR_ARCH_300));
+   mtspr(SPRN_TIDR, new_thread->tidr);
+   }
+#endif
 }
 
 #ifdef CONFIG_PPC_BOOK3S_64
@@ -1438,9 +1445,117 @@ void flush_thread(void)
 #endif /* CONFIG_HAVE_HW_BREAKPOINT */
 }
 
+#ifdef CONFIG_PPC64
+static DEFINE_SPINLOCK(vas_thread_id_lock);
+static DEFINE_IDA(vas_thread_ida);
+
+/*
+ * We need to assign a unique thread id to each thread in a process.
+ *
+ * This thread id, referred to as TIDR, and separate from the Linux's tgid,
+ * is intended to be used to direct an ASB_Notify from the hardware to the
+ * thread, when a suitable event occurs in the system.
+ *
+ * One such event is a "paste" instruction in the context of Fast Thread
+ * Wakeup (aka Core-to-core wake up in the Virtual Accelerator Switchboard
+ * (VAS) in POWER9.
+ *
+ * To get a unique TIDR per process we could simply reuse task_pid_nr() but
+ * the problem is that task_pid_nr() is not yet available when copy_thread()
+ * is called. Fixing that would require more intrusive changes to the
+ * arch-neutral code in the copy_process() code path.
+ *
+ * Further, to assign unique TIDRs within each process, we need an atomic
+ * field (or an IDR) in task_struct, which again intrudes into the arch-
+ * neutral code. So try to assign globally unique TIDRs for now.
+ *
+ * NOTE: TIDR 0 indicates that the thread does not need a TIDR value.
+ *  For now, only threads that expect to be notified by the VAS
+ *  hardware need a TIDR value and we assign values > 0 for those.
+ */
+#define MAX_THREAD_CONTEXT ((1 << 16) - 1)
+static int assign_thread_tidr(void)
+{
+   int index;
+   int err;
+
+again:
+   if (!ida_pre_get(&vas_thread_ida, GFP_KERNEL))
+   return -ENOMEM;
+
+   spin_lock(&vas_thread_id_lock);
+   err = ida_get_new_above(&vas_thread_ida, 1, &index);
+   spin_unlock(&vas_thread_id_lock);
+
+   if (err == -EAGAIN)
+   goto again;
+   else if (err)
+   return err;
+
+   if (index > MAX_THREAD_CONTEXT) {
+   spin_lock(&vas_thread_id_lock);
+   ida_remove(&vas_thread_ida, index);
+   spin_unlock(&vas_thread_id_lock);
+   return -ENOMEM;
+   }
+
+   return index;
+}
+
+static void free_thread_tidr(int id)
+{
+   

[PATCH v3 12/18] powerpc: have copy depend on CONFIG_BOOK3S_64

2017-11-07 Thread Sukadev Bhattiprolu
Have the COPY/PASTE instructions depend on CONFIG_BOOK3S_64 rather than
CONFIG_PPC_STD_MMU_64.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/kernel/process.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index a0c74bb..37ed60b 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1215,10 +1215,14 @@ struct task_struct *__switch_to(struct task_struct 
*prev,
	batch = this_cpu_ptr(&ppc64_tlb_batch);
batch->active = 1;
}
+#endif /* CONFIG_PPC_STD_MMU_64 */
 
if (current_thread_info()->task->thread.regs) {
+#ifdef CONFIG_PPC_STD_MMU_64
restore_math(current_thread_info()->task->thread.regs);
+#endif /* CONFIG_PPC_STD_MMU_64 */
 
+#ifdef CONFIG_PPC_BOOK3S_64
/*
 * The copy-paste buffer can only store into foreign real
 * addresses, so unprivileged processes can not see the
@@ -1237,8 +1241,8 @@ struct task_struct *__switch_to(struct task_struct *prev,
asm volatile(PPC_COPY(%0, %1)
: : "r"(dummy_copy_buffer), "r"(0));
}
+#endif /* CONFIG_PPC_BOOK3S_64 */
}
-#endif /* CONFIG_PPC_STD_MMU_64 */
 
return last;
 }
-- 
2.7.4



[PATCH v3 11/18] powerpc/vas: Export HVWC to debugfs

2017-11-07 Thread Sukadev Bhattiprolu
Export the VAS Window context information to debugfs.

We need to hold a mutex when closing the window to prevent a race
with the debugfs read(). Rather than introduce a per-instance mutex,
we use the global vas_mutex for now, since it is not heavily contended.

The window->cop field is only relevant to a receive window, so we were
not setting it for a send window (which is paired to a receive window
anyway). But to simplify reporting in debugfs, set the 'cop' field for the
send window also.

Signed-off-by: Sukadev Bhattiprolu 

---
Changelog[v3]:
- [Michael Ellerman] Fix a couple of uninitialized variables
---
 arch/powerpc/platforms/powernv/Makefile |   3 +-
 arch/powerpc/platforms/powernv/vas-debug.c  | 209 
 arch/powerpc/platforms/powernv/vas-window.c |  34 -
 arch/powerpc/platforms/powernv/vas.c|   6 +-
 arch/powerpc/platforms/powernv/vas.h|  14 ++
 5 files changed, 257 insertions(+), 9 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/vas-debug.c

diff --git a/arch/powerpc/platforms/powernv/Makefile 
b/arch/powerpc/platforms/powernv/Makefile
index 7a31c26..3732118 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -15,4 +15,5 @@ obj-$(CONFIG_TRACEPOINTS) += opal-tracepoints.o
 obj-$(CONFIG_OPAL_PRD) += opal-prd.o
 obj-$(CONFIG_PERF_EVENTS) += opal-imc.o
 obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o
-obj-$(CONFIG_PPC_VAS)  += vas.o vas-window.o
+obj-$(CONFIG_PPC_VAS)  += vas.o vas-window.o vas-debug.o
+obj-$(CONFIG_PPC_FTW)  += nx-ftw.o
diff --git a/arch/powerpc/platforms/powernv/vas-debug.c 
b/arch/powerpc/platforms/powernv/vas-debug.c
new file mode 100644
index 000..ca22f1e
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/vas-debug.c
@@ -0,0 +1,209 @@
+/*
+ * Copyright 2016-17 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "vas: " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include "vas.h"
+
+static struct dentry *vas_debugfs;
+
+static char *cop_to_str(int cop)
+{
+   switch (cop) {
+   case VAS_COP_TYPE_FAULT:return "Fault";
+   case VAS_COP_TYPE_842:  return "NX-842 Normal Priority";
+   case VAS_COP_TYPE_842_HIPRI:return "NX-842 High Priority";
+   case VAS_COP_TYPE_GZIP: return "NX-GZIP Normal Priority";
+   case VAS_COP_TYPE_GZIP_HIPRI:   return "NX-GZIP High Priority";
+   case VAS_COP_TYPE_FTW:  return "Fast Thread-wakeup";
+   default:return "Unknown";
+   }
+}
+
+static int info_dbg_show(struct seq_file *s, void *private)
+{
+   struct vas_window *window = s->private;
+
+   mutex_lock(&vas_mutex);
+
+   /* ensure window is not unmapped */
+   if (!window->hvwc_map)
+   goto unlock;
+
+   seq_printf(s, "Type: %s, %s\n", cop_to_str(window->cop),
+   window->tx_win ? "Send" : "Receive");
+   seq_printf(s, "Pid : %d\n", window->pid);
+
+unlock:
+   mutex_unlock(&vas_mutex);
+   return 0;
+}
+
+static int info_dbg_open(struct inode *inode, struct file *file)
+{
+   return single_open(file, info_dbg_show, inode->i_private);
+}
+
+static const struct file_operations info_fops = {
+   .open   = info_dbg_open,
+   .read   = seq_read,
+   .llseek = seq_lseek,
+   .release= single_release,
+};
+
+static inline void print_reg(struct seq_file *s, struct vas_window *win,
+   char *name, u32 reg)
+{
+   seq_printf(s, "0x%016llx %s\n", read_hvwc_reg(win, name, reg), name);
+}
+
+static int hvwc_dbg_show(struct seq_file *s, void *private)
+{
+   struct vas_window *window = s->private;
+
+   mutex_lock(&vas_mutex);
+
+   /* ensure window is not unmapped */
+   if (!window->hvwc_map)
+   goto unlock;
+
+   print_reg(s, window, VREG(LPID));
+   print_reg(s, window, VREG(PID));
+   print_reg(s, window, VREG(XLATE_MSR));
+   print_reg(s, window, VREG(XLATE_LPCR));
+   print_reg(s, window, VREG(XLATE_CTL));
+   print_reg(s, window, VREG(AMR));
+   print_reg(s, window, VREG(SEIDR));
+   print_reg(s, window, VREG(FAULT_TX_WIN));
+   print_reg(s, window, VREG(OSU_INTR_SRC_RA));
+   print_reg(s, window, VREG(HV_INTR_SRC_RA));
+   print_reg(s, window, VREG(PSWID));
+   print_reg(s, window, VREG(LFIFO_BAR));
+   print_reg(s, window, VREG(LDATA_STAMP_CTL));
+   print_reg(s, window, VREG(LDMA_CACHE_CTL));
+   print_reg(s, window, VREG(LRFIFO_PUSH));
+   print_reg(s, window, VREG(CURR_MSG_COUNT));
+   print_reg(s, window, VREG(LNOTIFY_AFTER_COUNT));
+   print_reg(s, 

[PATCH v3 10/18] powerpc/vas, nx-842: Define and use chip_to_vas_id()

2017-11-07 Thread Sukadev Bhattiprolu
Define a helper, chip_to_vas_id(), to map a given chip id to the
corresponding VAS id.

Normally, callers of vas_rx_win_open() and vas_tx_win_open() want the VAS
window to be on the same chip where the calling thread is executing. These
callers can pass in -1 for the VAS id.

This interface will be useful if a thread running on one chip wants to open
a window on another chip (like the NX-842 driver does during start up).
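
A minimal usage sketch (the device node 'dn' and the error handling are
illustrative): open a receive window on the chip that owns 'dn' rather
than on the caller's own chip:

	int vasid = chip_to_vas_id(of_get_ibm_chip_id(dn));

	if (vasid < 0)
		return -ENODEV;
	rxwin = vas_rx_win_open(vasid, VAS_COP_TYPE_842, &rxattr);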

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/include/asm/vas.h   |  9 +
 arch/powerpc/platforms/powernv/vas.c | 11 +++
 drivers/crypto/nx/nx-842-powernv.c   | 18 +++---
 3 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
index fd5963a..044748f 100644
--- a/arch/powerpc/include/asm/vas.h
+++ b/arch/powerpc/include/asm/vas.h
@@ -104,6 +104,15 @@ struct vas_tx_win_attr {
 };
 
 /*
+ * Helper to map a chip id to VAS id.
+ * For POWER9, this is a 1:1 mapping. In the future this may be a 1:N
+ * mapping, in which case we will need to update this helper.
+ *
+ * Return the VAS id or -1 if no matching vasid is found.
+ */
+int chip_to_vas_id(int chipid);
+
+/*
  * Helper to initialize receive window attributes to defaults for an
  * NX window.
  */
diff --git a/arch/powerpc/platforms/powernv/vas.c 
b/arch/powerpc/platforms/powernv/vas.c
index abb7090..cd9a733 100644
--- a/arch/powerpc/platforms/powernv/vas.c
+++ b/arch/powerpc/platforms/powernv/vas.c
@@ -123,6 +123,17 @@ struct vas_instance *find_vas_instance(int vasid)
return NULL;
 }
 
+int chip_to_vas_id(int chipid)
+{
+   int cpu;
+
+   for_each_possible_cpu(cpu) {
+   if (cpu_to_chip_id(cpu) == chipid)
+   return per_cpu(cpu_vas_id, cpu);
+   }
+   return -1;
+}
+
 static int vas_probe(struct platform_device *pdev)
 {
return init_vas_instance(pdev);
diff --git a/drivers/crypto/nx/nx-842-powernv.c 
b/drivers/crypto/nx/nx-842-powernv.c
index 874ddf5..eb221ed 100644
--- a/drivers/crypto/nx/nx-842-powernv.c
+++ b/drivers/crypto/nx/nx-842-powernv.c
@@ -847,24 +847,12 @@ static int __init nx842_powernv_probe_vas(struct 
device_node *pn)
return -EINVAL;
}
 
-   for_each_compatible_node(dn, NULL, "ibm,power9-vas-x") {
-   if (of_get_ibm_chip_id(dn) == chip_id)
-   break;
-   }
-
-   if (!dn) {
-   pr_err("Missing VAS device node\n");
+   vasid = chip_to_vas_id(chip_id);
+   if (vasid < 0) {
+   pr_err("Unable to map chip_id %d to vasid\n", chip_id);
return -EINVAL;
}
 
-   if (of_property_read_u32(dn, "ibm,vas-id", &vasid)) {
-   pr_err("Missing ibm,vas-id device property\n");
-   of_node_put(dn);
-   return -EINVAL;
-   }
-
-   of_node_put(dn);
-
for_each_child_of_node(pn, dn) {
if (of_device_is_compatible(dn, "ibm,p9-nx-842")) {
ret = vas_cfg_coproc_info(dn, chip_id, vasid);
-- 
2.7.4



[PATCH v3 09/18] powerpc/vas: Create cpu to vas id mapping

2017-11-07 Thread Sukadev Bhattiprolu
Create a cpu to vasid mapping so callers can specify -1 instead of
trying to find a VAS id.

Changelog[v2]
[Michael Ellerman] Use per-cpu variables to simplify code.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/platforms/powernv/vas.c | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/vas.c 
b/arch/powerpc/platforms/powernv/vas.c
index 565a487..abb7090 100644
--- a/arch/powerpc/platforms/powernv/vas.c
+++ b/arch/powerpc/platforms/powernv/vas.c
@@ -18,15 +18,18 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "vas.h"
 
 static DEFINE_MUTEX(vas_mutex);
 static LIST_HEAD(vas_instances);
 
+static DEFINE_PER_CPU(int, cpu_vas_id);
+
 static int init_vas_instance(struct platform_device *pdev)
 {
-   int rc, vasid;
+   int rc, cpu, vasid;
struct resource *res;
struct vas_instance *vinst;
struct device_node *dn = pdev->dev.of_node;
@@ -74,6 +77,11 @@ static int init_vas_instance(struct platform_device *pdev)
"paste_win_id_shift 0x%llx\n", pdev->name, vasid,
vinst->paste_base_addr, vinst->paste_win_id_shift);
 
+   for_each_possible_cpu(cpu) {
+   if (cpu_to_chip_id(cpu) == of_get_ibm_chip_id(dn))
+   per_cpu(cpu_vas_id, cpu) = vasid;
+   }
+
 	mutex_lock(&vas_mutex);
 	list_add(&vinst->node, &vas_instances);
 	mutex_unlock(&vas_mutex);
@@ -98,6 +106,10 @@ struct vas_instance *find_vas_instance(int vasid)
struct vas_instance *vinst;
 
 	mutex_lock(&vas_mutex);
 
+	if (vasid == -1)
+		vasid = per_cpu(cpu_vas_id, smp_processor_id());
+
 	list_for_each(ent, &vas_instances) {
vinst = list_entry(ent, struct vas_instance, node);
if (vinst->vas_id == vasid) {
-- 
2.7.4



[PATCH v3 08/18] powerpc/vas: poll for return of window credits

2017-11-07 Thread Sukadev Bhattiprolu
Normally, the NX driver waits for the CRBs to be processed before closing
the window. But it is better to ensure that the credits are returned before
the window gets reassigned later.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/platforms/powernv/vas-window.c | 45 +
 1 file changed, 45 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index a59a187..23c13a7 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -1063,6 +1063,49 @@ int vas_paste_crb(struct vas_window *txwin, int offset, 
bool re)
 EXPORT_SYMBOL_GPL(vas_paste_crb);
 
 /*
+ * If credit checking is enabled for this window, poll for the return
+ * of window credits (i.e for NX engines to process any outstanding CRBs).
+ * Since NX-842 waits for the CRBs to be processed before closing the
+ * window, we should not have to wait for too long.
+ *
+ * TODO: We retry in 10ms intervals now. We could/should probably peek at
+ * the VAS_LRFIFO_PUSH_OFFSET register to get an estimate of pending
+ * CRBs on the FIFO and compute the delay dynamically on each retry.
+ * But that is not really needed until we support NX-GZIP access from
+ * user space. (NX-842 driver waits for CSB and Fast thread-wakeup
+ * doesn't use credit checking).
+ */
+static void poll_window_credits(struct vas_window *window)
+{
+   u64 val;
+   int creds, mode;
+
+   val = read_hvwc_reg(window, VREG(WINCTL));
+   if (window->tx_win)
+   mode = GET_FIELD(VAS_WINCTL_TX_WCRED_MODE, val);
+   else
+   mode = GET_FIELD(VAS_WINCTL_RX_WCRED_MODE, val);
+
+   if (!mode)
+   return;
+retry:
+   if (window->tx_win) {
+   val = read_hvwc_reg(window, VREG(TX_WCRED));
+   creds = GET_FIELD(VAS_TX_WCRED, val);
+   } else {
+   val = read_hvwc_reg(window, VREG(LRX_WCRED));
+   creds = GET_FIELD(VAS_LRX_WCRED, val);
+   }
+
+   if (creds < window->wcreds_max) {
+   val = 0;
+   set_current_state(TASK_UNINTERRUPTIBLE);
+   schedule_timeout(msecs_to_jiffies(10));
+   goto retry;
+   }
+}
+
+/*
  * Wait for the window to go to "not-busy" state. It should only take a
  * short time to queue a CRB, so window should not be busy for too long.
  * Trying 5ms intervals.
@@ -1149,6 +1192,8 @@ int vas_win_close(struct vas_window *window)
 
unpin_close_window(window);
 
+   poll_window_credits(window);
+
poll_window_castout(window);
 
/* if send window, drop reference to matching receive window */
-- 
2.7.4



[PATCH v3 07/18] powerpc/vas: Save configured window credits

2017-11-07 Thread Sukadev Bhattiprolu
Save the configured max window credits for a window in the vas_window
structure. We will need this when polling for return of window credits.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/platforms/powernv/vas-window.c | 6 --
 arch/powerpc/platforms/powernv/vas.h| 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 1422cdd..a59a187 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -674,7 +674,7 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin,
 
winctx->rx_fifo = rxattr->rx_fifo;
winctx->rx_fifo_size = rxattr->rx_fifo_size;
-   winctx->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT;
+   winctx->wcreds_max = rxwin->wcreds_max;
winctx->pin_win = rxattr->pin_win;
 
winctx->nx_win = rxattr->nx_win;
@@ -844,6 +844,7 @@ struct vas_window *vas_rx_win_open(int vasid, enum 
vas_cop_type cop,
rxwin->nx_win = rxattr->nx_win;
rxwin->user_win = rxattr->user_win;
rxwin->cop = cop;
+   rxwin->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT;
if (rxattr->user_win)
rxwin->pid = task_pid_vnr(current);
 
@@ -893,7 +894,7 @@ static void init_winctx_for_txwin(struct vas_window *txwin,
 */
memset(winctx, 0, sizeof(struct vas_winctx));
 
-   winctx->wcreds_max = txattr->wcreds_max ?: VAS_WCREDS_DEFAULT;
+   winctx->wcreds_max = txwin->wcreds_max;
 
winctx->user_win = txattr->user_win;
winctx->nx_win = txwin->rxwin->nx_win;
@@ -978,6 +979,7 @@ struct vas_window *vas_tx_win_open(int vasid, enum 
vas_cop_type cop,
txwin->nx_win = txwin->rxwin->nx_win;
txwin->pid = attr->pid;
txwin->user_win = attr->user_win;
+   txwin->wcreds_max = attr->wcreds_max ?: VAS_WCREDS_DEFAULT;
 
	init_winctx_for_txwin(txwin, attr, &winctx);
 
diff --git a/arch/powerpc/platforms/powernv/vas.h 
b/arch/powerpc/platforms/powernv/vas.h
index 63e8e03..02d8a31 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -332,6 +332,7 @@ struct vas_window {
void *hvwc_map; /* HV window context */
void *uwc_map;  /* OS/User window context */
pid_t pid;  /* Linux process id of owner */
+   int wcreds_max; /* Window credits */
 
/* Fields applicable only to send windows */
void *paste_kaddr;
-- 
2.7.4



[PATCH v3 06/18] powerpc/vas: Reduce polling interval for busy state

2017-11-07 Thread Sukadev Bhattiprolu
A VAS window is normally in "busy" state for only a short duration.
Reduce the time we wait for the window to go to "not-busy" state to
speed-up vas_win_close() a bit.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/platforms/powernv/vas-window.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 95622a9..1422cdd 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -1060,21 +1060,23 @@ int vas_paste_crb(struct vas_window *txwin, int offset, 
bool re)
 }
 EXPORT_SYMBOL_GPL(vas_paste_crb);
 
+/*
+ * Wait for the window to go to "not-busy" state. It should only take a
+ * short time to queue a CRB, so window should not be busy for too long.
+ * Trying 5ms intervals.
+ */
 static void poll_window_busy_state(struct vas_window *window)
 {
int busy;
u64 val;
 
 retry:
-   /*
-* Poll Window Busy flag
-*/
val = read_hvwc_reg(window, VREG(WIN_STATUS));
busy = GET_FIELD(VAS_WIN_BUSY, val);
if (busy) {
val = 0;
set_current_state(TASK_UNINTERRUPTIBLE);
-   schedule_timeout(HZ);
+   schedule_timeout(msecs_to_jiffies(5));
goto retry;
}
 }
-- 
2.7.4



[PATCH v3 04/18] powerpc/vas: Drop poll_window_cast_out().

2017-11-07 Thread Sukadev Bhattiprolu
Polling for window cast-out is listed in the spec, but it turns out that
it is not strictly necessary and slows down window close. Make it a
stub for now.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/platforms/powernv/vas-window.c | 34 ++---
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 67ffc5d..8ab8a82 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -1079,25 +1079,25 @@ static void poll_window_busy_state(struct vas_window 
*window)
}
 }
 
+/*
+ * Have the hardware cast a window out of cache and wait for it to
+ * be completed.
+ *
+ * NOTE: It can take a relatively long time to cast the window context
+ * out of the cache. It is not strictly necessary to cast out if:
+ *
+ * - we clear the "Pin Window" bit (so hardware is free to evict)
+ *
+ * - we re-initialize the window context when it is reassigned.
+ *
+ * We do the former in vas_win_close() and the latter in vas_win_open().
+ * So, ignoring the cast-out for now. We can add it as needed. If
+ * casting out becomes necessary we should consider offloading the
+ * job to a worker thread, so the window close can proceed quickly.
+ */
 static void poll_window_castout(struct vas_window *window)
 {
-   int cached;
-   u64 val;
-
-   /* Cast window context out of the cache */
-retry:
-   val = read_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL));
-   cached = GET_FIELD(VAS_WIN_CACHE_STATUS, val);
-   if (cached) {
-   val = 0ULL;
-   val = SET_FIELD(VAS_CASTOUT_REQ, val, 1);
-   val = SET_FIELD(VAS_PUSH_TO_MEM, val, 0);
-   write_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL), val);
-
-   set_current_state(TASK_UNINTERRUPTIBLE);
-   schedule_timeout(HZ);
-   goto retry;
-   }
+   /* stub for now */
 }
 
 /*
-- 
2.7.4



[PATCH v3 05/18] powerpc/vas: Use helper to unpin/close window

2017-11-07 Thread Sukadev Bhattiprolu
Use a helper to have the hardware unpin and mark a window closed.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/platforms/powernv/vas-window.c | 22 +++---
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 8ab8a82..95622a9 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -1101,6 +1101,20 @@ static void poll_window_castout(struct vas_window 
*window)
 }
 
 /*
+ * Unpin and close a window so no new requests are accepted and the
+ * hardware can evict this window from cache if necessary.
+ */
+static void unpin_close_window(struct vas_window *window)
+{
+   u64 val;
+
+   val = read_hvwc_reg(window, VREG(WINCTL));
+   val = SET_FIELD(VAS_WINCTL_PIN, val, 0);
+   val = SET_FIELD(VAS_WINCTL_OPEN, val, 0);
+   write_hvwc_reg(window, VREG(WINCTL), val);
+}
+
+/*
  * Close a window.
  *
  * See Section 1.12.1 of VAS workbook v1.05 for details on closing window:
@@ -1114,8 +1128,6 @@ static void poll_window_castout(struct vas_window *window)
  */
 int vas_win_close(struct vas_window *window)
 {
-   u64 val;
-
if (!window)
return 0;
 
@@ -1131,11 +1143,7 @@ int vas_win_close(struct vas_window *window)
 
poll_window_busy_state(window);
 
-   /* Unpin window from cache and close it */
-   val = read_hvwc_reg(window, VREG(WINCTL));
-   val = SET_FIELD(VAS_WINCTL_PIN, val, 0);
-   val = SET_FIELD(VAS_WINCTL_OPEN, val, 0);
-   write_hvwc_reg(window, VREG(WINCTL), val);
+   unpin_close_window(window);
 
poll_window_castout(window);
 
-- 
2.7.4



[PATCH v3 03/18] powerpc/vas: Cleanup some debug code

2017-11-07 Thread Sukadev Bhattiprolu
Clean up vas.h and the debug code around ifdef vas_debug.

Signed-off-by: Sukadev Bhattiprolu 

---
Changelog[v3]
- Minor tweak to a debug message
---
 arch/powerpc/platforms/powernv/vas-window.c |  8 +++--
 arch/powerpc/platforms/powernv/vas.h| 54 ++---
 2 files changed, 17 insertions(+), 45 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index a2fe120..67ffc5d 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -726,7 +726,10 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin,
 static bool rx_win_args_valid(enum vas_cop_type cop,
struct vas_rx_win_attr *attr)
 {
-   dump_rx_win_attr(attr);
+   pr_debug("Rxattr: fault %d, notify %d, intr %d, early %d, fifo %d\n",
+   attr->fault_win, attr->notify_disable,
+   attr->intr_disable, attr->notify_early,
+   attr->rx_fifo_size);
 
if (cop >= VAS_COP_TYPE_MAX)
return false;
@@ -1050,7 +1053,8 @@ int vas_paste_crb(struct vas_window *txwin, int offset, 
bool re)
else
rc = -EINVAL;
 
-   print_fifo_msg_count(txwin);
+   pr_debug("Txwin #%d: Msg count %llu\n", txwin->winid,
+   read_hvwc_reg(txwin, VREG(LRFIFO_PUSH)));
 
return rc;
 }
diff --git a/arch/powerpc/platforms/powernv/vas.h 
b/arch/powerpc/platforms/powernv/vas.h
index fea0de4..63e8e03 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -259,6 +259,16 @@
 #define VAS_NX_UTIL_ADDER  PPC_BITMASK(32, 63)
 
 /*
+ * VREG(x):
+ * Expand a register's short name (eg: LPID) into two parameters:
+ * - the register's short name in string form ("LPID"), and
+ * - the name of the macro (eg: VAS_LPID_OFFSET), defining the
+ *   register's offset in the window context
+ */
+#define VREG_SFX(n, s) __stringify(n), VAS_##n##s
+#define VREG(r)VREG_SFX(r, _OFFSET)
+
+/*
  * Local Notify Scope Control Register. (Receive windows only).
  */
 enum vas_notify_scope {
@@ -385,43 +395,15 @@ struct vas_winctx {
 
 extern struct vas_instance *find_vas_instance(int vasid);
 
-/*
- * VREG(x):
- * Expand a register's short name (eg: LPID) into two parameters:
- * - the register's short name in string form ("LPID"), and
- * - the name of the macro (eg: VAS_LPID_OFFSET), defining the
- *   register's offset in the window context
- */
-#define VREG_SFX(n, s) __stringify(n), VAS_##n##s
-#define VREG(r)VREG_SFX(r, _OFFSET)
-
-#ifdef vas_debug
-static inline void dump_rx_win_attr(struct vas_rx_win_attr *attr)
-{
-   pr_err("fault %d, notify %d, intr %d early %d\n",
-   attr->fault_win, attr->notify_disable,
-   attr->intr_disable, attr->notify_early);
-
-   pr_err("rx_fifo_size %d, max value %d\n",
-   attr->rx_fifo_size, VAS_RX_FIFO_SIZE_MAX);
-}
-
 static inline void vas_log_write(struct vas_window *win, char *name,
void *regptr, u64 val)
 {
if (val)
-   pr_err("%swin #%d: %s reg %p, val 0x%016llx\n",
+   pr_debug("%swin #%d: %s reg %p, val 0x%016llx\n",
win->tx_win ? "Tx" : "Rx", win->winid, name,
regptr, val);
 }
 
-#else  /* vas_debug */
-
-#define vas_log_write(win, name, reg, val)
-#define dump_rx_win_attr(attr)
-
-#endif /* vas_debug */
-
 static inline void write_uwc_reg(struct vas_window *win, char *name,
s32 reg, u64 val)
 {
@@ -450,18 +432,4 @@ static inline u64 read_hvwc_reg(struct vas_window *win,
return in_be64(win->hvwc_map+reg);
 }
 
-#ifdef vas_debug
-
-static void print_fifo_msg_count(struct vas_window *txwin)
-{
-   uint64_t read_hvwc_reg(struct vas_window *w, char *n, uint64_t o);
-   pr_devel("Winid %d, Msg count %llu\n", txwin->winid,
-   (uint64_t)read_hvwc_reg(txwin, VREG(LRFIFO_PUSH)));
-}
-#else  /* vas_debug */
-
-#define print_fifo_msg_count(window)
-
-#endif /* vas_debug */
-
 #endif /* _VAS_H */
-- 
2.7.4



[PATCH v3 02/18] powerpc/vas: Validate window credits

2017-11-07 Thread Sukadev Bhattiprolu
NX-842, the only user of VAS, sets the window credits to default values
but VAS should check the credits against the possible max values.

The VAS_WCREDS_MIN is not needed and can be dropped.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/platforms/powernv/vas-window.c | 6 ++
 arch/powerpc/platforms/powernv/vas.h| 4 ++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index cec7ab7..a2fe120 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -738,6 +738,9 @@ static bool rx_win_args_valid(enum vas_cop_type cop,
if (attr->rx_fifo_size > VAS_RX_FIFO_SIZE_MAX)
return false;
 
+   if (attr->wcreds_max > VAS_RX_WCREDS_MAX)
+   return false;
+
if (attr->nx_win) {
/* cannot be fault or user window if it is nx */
if (attr->fault_win || attr->user_win)
@@ -927,6 +930,9 @@ static bool tx_win_args_valid(enum vas_cop_type cop,
if (cop > VAS_COP_TYPE_MAX)
return false;
 
+   if (attr->wcreds_max > VAS_TX_WCREDS_MAX)
+   return false;
+
if (attr->user_win &&
(cop != VAS_COP_TYPE_FTW || attr->rsvd_txbuf_count))
return false;
diff --git a/arch/powerpc/platforms/powernv/vas.h 
b/arch/powerpc/platforms/powernv/vas.h
index 38dee5d..fea0de4 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -106,8 +106,8 @@
  *
  * TODO: Needs tuning for per-process credits
  */
-#define VAS_WCREDS_MIN 16
-#define VAS_WCREDS_MAX ((64 << 10) - 1)
+#define VAS_RX_WCREDS_MAX  ((64 << 10) - 1)
+#define VAS_TX_WCREDS_MAX  ((4 << 10) - 1)
 #define VAS_WCREDS_DEFAULT (1 << 10)
 
 /*
-- 
2.7.4



[PATCH v3 01/18] powerpc/vas: init missing fields from [rt]xattr

2017-11-07 Thread Sukadev Bhattiprolu
Initialize a few missing window context fields from the window attributes
specified by the caller. These fields are currently set to their default
values by the caller (NX-842), but it would be good to apply them anyway.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/platforms/powernv/vas-window.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 5aae845..cec7ab7 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -679,10 +679,13 @@ static void init_winctx_for_rxwin(struct vas_window 
*rxwin,
 
winctx->nx_win = rxattr->nx_win;
winctx->fault_win = rxattr->fault_win;
+   winctx->user_win = rxattr->user_win;
+   winctx->rej_no_credit = rxattr->rej_no_credit;
winctx->rx_word_mode = rxattr->rx_win_ord_mode;
winctx->tx_word_mode = rxattr->tx_win_ord_mode;
winctx->rx_wcred_mode = rxattr->rx_wcred_mode;
winctx->tx_wcred_mode = rxattr->tx_wcred_mode;
+   winctx->notify_early = rxattr->notify_early;
 
if (winctx->nx_win) {
winctx->data_stamp = true;
@@ -889,11 +892,14 @@ static void init_winctx_for_txwin(struct vas_window 
*txwin,
winctx->user_win = txattr->user_win;
winctx->nx_win = txwin->rxwin->nx_win;
winctx->pin_win = txattr->pin_win;
+   winctx->rej_no_credit = txattr->rej_no_credit;
+   winctx->rsvd_txbuf_enable = txattr->rsvd_txbuf_enable;
 
winctx->rx_wcred_mode = txattr->rx_wcred_mode;
winctx->tx_wcred_mode = txattr->tx_wcred_mode;
winctx->rx_word_mode = txattr->rx_win_ord_mode;
winctx->tx_word_mode = txattr->tx_win_ord_mode;
+   winctx->rsvd_txbuf_count = txattr->rsvd_txbuf_count;
 
if (winctx->nx_win) {
winctx->data_stamp = true;
-- 
2.7.4



[PATCH v3 00/18] powerpc/vas: Add support for FTW

2017-11-07 Thread Sukadev Bhattiprolu
The first 10 patches in this set sanitize cpu/chip id to VAS id mapping,
improve vas_win_close() performance, add a check for return of credits
and clean up some code.

Patch 11 adds debugfs support for the VAS window contexts.

Patches 12-18 add support for user space windows, aka fast thread-wakeup
(FTW) windows, in VAS. These include a patch from Michael Neuling to
support emulating
the paste instruction.

Changelog[v3]:
- [Michael Ellerman] We don't need to disable/enable pagefaults
  when emulating paste;
- [Michael Ellerman, Aneesh Kumar] Fix retval from emulate_paste()
  and paste().
- [Nick Piggin] Drop CP_ABORT since set_thread_uses_vas() does
  that now (in earlier patch) and add a check for return value.
- [Michael Ellerman] Fix a couple of uninitialized variables
- Minor tweak to a debug message

Michael Neuling (1):
  powerpc: Emulate paste instruction

Sukadev Bhattiprolu (17):
  powerpc/vas: init missing fields from [rt]xattr
  powerpc/vas: Validate window credits
  powerpc/vas: Cleanup some debug code
  powerpc/vas: Drop poll_window_cast_out().
  powerpc/vas: Use helper to unpin/close window
  powerpc/vas: Reduce polling interval for busy state
  powerpc/vas: Save configured window credits
  powerpc/vas: poll for return of window credits
  powerpc/vas: Create cpu to vas id mapping
  powerpc/vas, nx-842: Define and use chip_to_vas_id()
  powerpc/vas: Export HVWC to debugfs
  powerpc: have copy depend on CONFIG_BOOK3S_64
  powerpc: Add support for setting SPRN_TIDR
  powerpc: Define set_thread_uses_vas()
  powerpc/vas: Define vas_win_paste_addr()
  powerpc/vas: Define vas_win_id()
  powerpc/vas: Add support for user receive window

 arch/powerpc/include/asm/emulated_ops.h |   1 +
 arch/powerpc/include/asm/ppc-opcode.h   |   1 +
 arch/powerpc/include/asm/processor.h|   3 +
 arch/powerpc/include/asm/reg.h  |   2 +
 arch/powerpc/include/asm/switch_to.h|   5 +
 arch/powerpc/include/asm/vas.h  |  21 +++
 arch/powerpc/kernel/process.c   | 169 +--
 arch/powerpc/kernel/traps.c |  67 
 arch/powerpc/platforms/powernv/Makefile |   3 +-
 arch/powerpc/platforms/powernv/vas-debug.c  | 209 
 arch/powerpc/platforms/powernv/vas-window.c | 242 +++-
 arch/powerpc/platforms/powernv/vas.c|  31 +++-
 arch/powerpc/platforms/powernv/vas.h|  93 ++-
 drivers/crypto/nx/nx-842-powernv.c  |  18 +--
 14 files changed, 751 insertions(+), 114 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/vas-debug.c

-- 
2.7.4



Re: [PATCH v9 00/51] powerpc, mm: Memory Protection Keys

2017-11-07 Thread Ram Pai
On Tue, Nov 07, 2017 at 02:47:10PM -0800, Dave Hansen wrote:
> On 11/07/2017 02:39 PM, Ram Pai wrote:
> > 
> > As per the current semantics of sys_pkey_free(); the way I understand it,
> > the calling thread is saying disassociate me from this key.
> 
> No.  It is saying: "this *process* no longer has any uses of this key,
> it can be reused".

ok, in light of the corrected semantics, I see no bug in the implementation.

> On Mon, Nov 06, 2017 at 10:28:41PM +0100, Florian Weimer wrote:
...
> The problem is a pkey_alloc/pthread_create/pkey_free/pkey_alloc
> sequence.  The pthread_create call makes the new thread inherit the
> access rights of the current thread, but then the key is deallocated.
> Reallocation of the same key will have that thread retain its access
> rights, which is IMHO not correct.

Again, in light of the corrected semantics: the child thread, or any
thread, should not free a key without first cleaning up, that is without
(a) disassociating the key from its address space, and
(b) resetting the permission on the key across all the threads of the
process.

Because any such uncleaned bits can cause unexpected behavior if the 
same key gets reallocated on sys_pkey_alloc().
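
For context, the problematic sequence under discussion can be sketched in
userspace as follows (pthread_t t, worker() and the error handling are
illustrative assumptions, not code from the thread):

    int key = pkey_alloc(0, PKEY_DISABLE_ACCESS);

    pthread_create(&t, NULL, worker, NULL); /* new thread inherits the
                                               current access rights */
    pkey_free(key);                         /* key returns to the pool */

    int key2 = pkey_alloc(0, PKEY_DISABLE_ACCESS);
    /* key2 may be the same hardware key; the worker thread still holds
     * the rights it inherited for it, which is exactly the stale-rights
     * problem described above. */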


-- 
Ram Pai



Re: KVM: PPC: Book3S HV: Handle host system reset in guest mode

2017-11-07 Thread Michael Ellerman
On Sun, 2017-11-05 at 12:33:55 UTC, Nicholas Piggin wrote:
> If the host takes a system reset interrupt while a guest is running,
> the CPU must exit the guest before processing the host exception
> handler.
> 
> After this patch, taking a sysrq+x with a CPU running in a guest
> gives a trace like this:
> 
>cpu 0x27: Vector: 100 (System Reset) at [c00fdf5776f0]
>pc: c00810158b80: kvmppc_run_core+0x16b8/0x1ad0 [kvm_hv]
>lr: c00810158b80: kvmppc_run_core+0x16b8/0x1ad0 [kvm_hv]
>sp: c00fdf577850
>   msr: 92803033
>  current = 0xc00fdf4b1e00
 paca= 0xcfd4d680  softe: 3  irq_happened: 0x01
>pid   = 6608, comm = qemu-system-ppc
>Linux version 4.14.0-rc7-01489-g47e1893a404a-dirty #26 SMP
>[c00fdf577a00] c00810159dd4 kvmppc_vcpu_run_hv+0x3dc/0x12d0 
> [kvm_hv]
>[c00fdf577b30] c008100a537c kvmppc_vcpu_run+0x44/0x60 [kvm]
>[c00fdf577b60] c008100a1ae0 kvm_arch_vcpu_ioctl_run+0x118/0x310 
> [kvm]
>[c00fdf577c00] c00810093e98 kvm_vcpu_ioctl+0x530/0x7c0 [kvm]
>[c00fdf577d50] c0357bf8 do_vfs_ioctl+0xd8/0x8c0
>[c00fdf577df0] c0358448 SyS_ioctl+0x68/0x100
>[c00fdf577e30] c000b220 system_call+0x58/0x6c
>--- Exception: c01 (System Call) at 7fff76868df0
>SP (7fff7069baf0) is in userspace
> 
> Fixes: e36d0a2ed5 ("powerpc/powernv: Implement NMI IPI with 
> OPAL_SIGNAL_SYSTEM_RESET")
> Signed-off-by: Nicholas Piggin 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/6de6638b35daa0dfb7daefd78d5501

cheers


Re: powerpc: eeh: stop using do_gettimeofday()

2017-11-07 Thread Michael Ellerman
On Sat, 2017-11-04 at 21:26:52 UTC, Arnd Bergmann wrote:
> This interface is inefficient and deprecated because of the y2038
> overflow.
> 
> ktime_get_seconds() is an appropriate replacement here, since it
> has sufficient granularity but is more efficient and uses monotonic
> time.
> 
> Signed-off-by: Arnd Bergmann 
> Reviewed-by: Andrew Donnellan 
> Acked-by: Russell Currey 
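
The shape of that conversion, as a hedged before/after sketch (the
surrounding EEH code is paraphrased, not quoted):

    /* Before: deprecated and y2038-unsafe wall-clock time */
    struct timeval tv;
    do_gettimeofday(&tv);
    tstamp = tv.tv_sec;

    /* After: monotonic, cheaper, y2038-safe */
    tstamp = ktime_get_seconds();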

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/edfd17ff39bce59886e8249e645d6e

cheers


Re: [v3,1/3] powerpc: add POWER9_DD20 feature

2017-11-07 Thread Michael Ellerman
On Fri, 2017-11-03 at 04:13:19 UTC, Nicholas Piggin wrote:
> Cc: Michael Neuling 
> Signed-off-by: Nicholas Piggin 

Series applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/b6b3755e9bec9c686a34ec81eacced

cheers


Re: powerpc/mm: Add a CONFIG option to choose if radix is used by default

2017-11-07 Thread Michael Ellerman
On Tue, 2017-10-24 at 15:48:49 UTC, Michael Ellerman wrote:
> Currently if the hardware supports the radix MMU we will use
> it, *unless* "disable_radix" is passed on the kernel command line.
> 
> However some users would like the reverse semantics. ie. The kernel
> uses the hash MMU by default, unless radix is explicitly requested on
> the command line.
> 
> So add a CONFIG option to choose whether we use radix by default or
> not, and expand the disable_radix command line option to allow
> "disable_radix=no" which *enables* radix.
> 
> Signed-off-by: Michael Ellerman 

Applied to powerpc next.

https://git.kernel.org/powerpc/c/1fd6c02207107c8892219dacef01de

cheers


Re: [v5, 01/10] mtd: powernv_flash: Use WARN_ON_ONCE() rather than BUG_ON()

2017-11-07 Thread Michael Ellerman
On Fri, 2017-11-03 at 02:41:37 UTC, Cyril Bur wrote:
> BUG_ON() should be reserved in situations where we can no longer
> guarantee the integrity of the system. In the case where
> powernv_flash_async_op() receives an impossible op, we can still
> guarantee the integrity of the system.
> 
> Signed-off-by: Cyril Bur 
> Acked-by: Boris Brezillon 

Series applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/44e2aa2b16a872fa8aa4901b379313

cheers


Re: [v3,1/4] powerpc: Don't enable FP/Altivec if not checkpointed

2017-11-07 Thread Michael Ellerman
On Thu, 2017-11-02 at 03:09:03 UTC, Cyril Bur wrote:
> Lazy save and restore of FP/Altivec means that a userspace process can
> be sent to userspace with FP or Altivec disabled and loaded only as
> required (by way of an FP/Altivec unavailable exception). Transactional
> Memory complicates this situation as a transaction could be started
> without FP/Altivec being loaded up. This causes the hardware to
> checkpoint incorrect registers. Handling FP/Altivec unavailable
> exceptions while a thread is transactional requires a reclaim and
> recheckpoint to ensure the CPU has correct state for both sets of
> registers.
> 
> Lazy save and restore of FP/Altivec cannot be done if a process is
> transactional. If a facility was enabled it must remain enabled whenever
> a thread is transactional.
> 
> Commit dc16b553c949 ("powerpc: Always restore FPU/VEC/VSX if hardware
> transactional memory in use") ensures that the facilities are always
> enabled if a thread is transactional. A bug in the introduced code may
> cause it to inadvertently enable a facility that was (and should remain)
> disabled. The problem with this extraneous enablement is that the
> registers for the erroneously enabled facility have not been correctly
> recheckpointed - the recheckpointing code assumed the facility would
> remain disabled.
> 
> Further compounding the issue, the transactional {fp,altivec,vsx}
> unavailable code has been incorrectly using the MSR to enable
> facilities. The presence of the {FP,VEC,VSX} bit in the regs->msr simply
> means if the registers are live on the CPU, not if the kernel should
> load them before returning to userspace. This has worked due to the bug
> mentioned above.
> 
> This causes transactional threads which return to their failure handler
> to observe incorrect checkpointed registers. Perhaps an example will
> help illustrate the problem:
> 
> A userspace process is running and uses both FP and Altivec registers.
> This process then continues to run for some time without touching
> either sets of registers. The kernel subsequently disables the
> facilities as part of lazy save and restore. The userspace process then
> performs a tbegin and the CPU checkpoints 'junk' FP and Altivec
> registers. The process then performs a floating point instruction
> triggering a fp unavailable exception in the kernel.
> 
> The kernel then loads the FP registers - and only the FP registers.
> Since the thread is transactional it must perform a reclaim and
> recheckpoint to ensure both the checkpointed registers and the
> transactional registers are correct. It then (correctly) enables
> MSR[FP] for the process. Later (on exception exit) the kernel also
> (inadvertently) enables MSR[VEC]. The process is then returned to
> userspace.
> 
> Since the act of loading the FP registers doomed the transaction we know
> CPU will fail the transaction, restore its checkpointed registers, and
> return the process to its failure handler. The problem is that we're
> now running with Altivec enabled and the 'junk' checkpointed registers
> are restored. The kernel had only recheckpointed FP.
> 
> This patch solves this by only activating FP/Altivec if userspace was
> using them when it entered the kernel and not simply if the process is
> transactional.
> 
> Fixes: dc16b553c949 ("powerpc: Always restore FPU/VEC/VSX if hardware
> transactional memory in use")
> 
> Signed-off-by: Cyril Bur 

Series applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/a7771176b4392fbc3a17399c51a8c1

cheers


Re: [v2, 3/3] powerpc/64s/radix: Fix process table entry cache invalidation

2017-11-07 Thread Michael Ellerman
On Tue, 2017-10-24 at 13:06:54 UTC, Nicholas Piggin wrote:
> According to the architecture, the process table entry cache must be
> flushed with tlbie RIC=2.
> 
> Currently the process table entry is set to invalid right before the
> PID is returned to the allocator, with no invalidation. This works on
> existing implementations that are known to not cache the process table
> entry for any except the current PIDR.
> 
> It is architecturally correct and cleaner to invalidate with RIC=2
> after clearing the process table entry and before the PID is returned
> to the allocator. This can be done in arch_exit_mmap that runs before
> the final flush, and to ensure the final flush (fullmm) is always a
> RIC=2 variant.
> 
> Signed-off-by: Nicholas Piggin 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/30b49ec798f0984b905fd94d1957d6

cheers


Re: powerpc/64s: Replace CONFIG_PPC_STD_MMU_64 with CONFIG_PPC_BOOK3S_64

2017-11-07 Thread Michael Ellerman
On Thu, 2017-10-19 at 04:08:43 UTC, Michael Ellerman wrote:
> CONFIG_PPC_STD_MMU_64 indicates support for the "standard" powerpc MMU
> on 64-bit CPUs. The "standard" MMU refers to the hash page table MMU
> found in "server" processors, from IBM mainly.
> 
> Currently CONFIG_PPC_STD_MMU_64 is == CONFIG_PPC_BOOK3S_64. While it's
> annoying to have two symbols that always have the same value, it's not
> quite annoying enough to bother removing one.
> 
> However with the arrival of Power9, we now have the situation where
> CONFIG_PPC_STD_MMU_64 is enabled, but the kernel is running using the
> Radix MMU - *not* the "standard" MMU. So it is now actively confusing
> to use it, because it implies that code is disabled or inactive when
> the Radix MMU is in use, however that is not necessarily true.
> 
> So s/CONFIG_PPC_STD_MMU_64/CONFIG_PPC_BOOK3S_64/, and do some minor
> formatting updates of some of the affected lines.
> 
> This will be a pain for backports, but c'est la vie.
> 
> Signed-off-by: Michael Ellerman 

Applied to powerpc next.

https://git.kernel.org/powerpc/c/4e003747043d57aa75c9762fa148ef

cheers


Re: [v2,2/3] powerpc/64s/radix: tlbie improve preempt handling

2017-11-07 Thread Michael Ellerman
On Tue, 2017-10-24 at 13:06:53 UTC, Nicholas Piggin wrote:
> Preempt should be consistently disabled for mm_is_thread_local tests,
> so bring the rest of these under preempt_disable().
> 
> Preempt does not need to be disabled for the mm->context.id tests,
> which allows simplification and removal of gotos.
> 
> Signed-off-by: Nicholas Piggin 
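
The pattern being fixed looks roughly like this (a sketch of the local
vs. broadcast choice, not the literal tlb-radix.c code):

    preempt_disable();
    if (mm_is_thread_local(mm))
        /* CPU-local form; only valid if we cannot migrate mid-test */
        _tlbiel_pid(pid, RIC_FLUSH_TLB);
    else
        /* broadcast form */
        _tlbie_pid(pid, RIC_FLUSH_TLB);
    preempt_enable();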

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/dffe8449c5dd63ff18b47709de7555

cheers


Re: powerpc/64: Fix latency tracing for lazy irq replay

2017-11-07 Thread Michael Ellerman
On Sat, 2017-10-21 at 07:56:06 UTC, Nicholas Piggin wrote:
> When returning from an exception to a soft-enabled context, pending
> IRQs are replayed but IRQ tracing is not reset, so a number of them
> can get chained together into the same IRQ-disabled trace.
> 
> Fix this by having __check_irq_replay re-set IRQ trace. This is
> conceptually where we respond to the next interrupt, so it fits the
> semantics of the IRQ tracer.
> 
> Signed-off-by: Nicholas Piggin 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/ff967900c9d4740f6337c6456f

cheers


Re: [1/3] powerpc/book3s: use label for FIXUP_ENDIAN macro branch

2017-11-07 Thread Michael Ellerman
On Mon, 2017-10-23 at 07:08:13 UTC, Nicholas Piggin wrote:
> Signed-off-by: Nicholas Piggin 

Series applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/f848ea7f5960ec2684c3bd1c0692e6

cheers


Re: powerpc/64: Free up CPU_FTR_ICSWX

2017-11-07 Thread Michael Ellerman
On Thu, 2017-10-19 at 04:08:19 UTC, Michael Ellerman wrote:
> The last user of CPU_FTR_ICSWX was removed in commit
> 6ff4d3e96652 ("powerpc: Remove old unused icswx based coprocessor
> support"), so free the bit up for future use.
> 
> Signed-off-by: Michael Ellerman 

Applied to powerpc next.

https://git.kernel.org/powerpc/c/c1807e3f84668c4c1e838386fdc3f6

cheers


Re: powerpc: ipic - fix status get and status clear

2017-11-07 Thread Michael Ellerman
On Wed, 2017-10-18 at 09:16:47 UTC, Christophe Leroy wrote:
> IPIC Status is provided by register IPIC_SERSR and not by IPIC_SERMR
> which is the mask register.
> 
> Signed-off-by: Christophe Leroy 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/6b148a7ce72a7f87c81cbcde48af01

cheers


Re: powerpc/tm: Don't check for WARN in TM Bad Thing handling

2017-11-07 Thread Michael Ellerman
On Thu, 2017-10-12 at 04:45:25 UTC, Michael Ellerman wrote:
> Currently when we take a TM Bad Thing program check exception, we
> search the bug table to see if the program check was generated by a
> WARN/WARN_ON etc.
> 
> That makes no sense, the WARN macros use trap instructions, which
> should never generate a TM Bad Thing exception. If they ever did that
> would be a bug and we should oops.
> 
> We do have some hand-coded bugs in tm.S, using EMIT_BUG_ENTRY, but
> those are all BUGs not WARNs, and they all use trap instructions
> anyway. Almost certainly this check was incorrectly copied from the
> REASON_TRAP handling in the same function.
> 
> Remove it.
> 
> Signed-off-by: Michael Ellerman 
> Acked-By: Michael Neuling 

Applied to powerpc next.

https://git.kernel.org/powerpc/c/632f0574167ad3f5d646dad6af87d9

cheers


Re: powerpc/mm/hash: Add pr_fmt() to hash_utils64.c

2017-11-07 Thread Michael Ellerman
On Mon, 2017-10-16 at 07:01:40 UTC, "Aneesh Kumar K.V" wrote:
> Make the printks look a bit nicer by adding a prefix.
> 
> Radix config now do
>  radix-mmu: Page sizes from device-tree:
>  radix-mmu: Page size shift = 12 AP=0x0
>  radix-mmu: Page size shift = 16 AP=0x5
>  radix-mmu: Page size shift = 21 AP=0x1
>  radix-mmu: Page size shift = 30 AP=0x2
> 
> This patch update hash config to do similar dmesg output. With the patch we 
> have
> 
>  hash-mmu: Page sizes from device-tree:
>  hash-mmu: base_shift=12: shift=12, sllp=0x, avpnm=0x, tlbiel=1, 
> penc=0
>  hash-mmu: base_shift=12: shift=16, sllp=0x, avpnm=0x, tlbiel=1, 
> penc=7
>  hash-mmu: base_shift=12: shift=24, sllp=0x, avpnm=0x, tlbiel=1, 
> penc=56
>  hash-mmu: base_shift=16: shift=16, sllp=0x0110, avpnm=0x, tlbiel=1, 
> penc=1
>  hash-mmu: base_shift=16: shift=24, sllp=0x0110, avpnm=0x, tlbiel=1, 
> penc=8
>  hash-mmu: base_shift=20: shift=20, sllp=0x0111, avpnm=0x, tlbiel=0, 
> penc=2
>  hash-mmu: base_shift=24: shift=24, sllp=0x0100, avpnm=0x0001, tlbiel=0, 
> penc=0
>  hash-mmu: base_shift=34: shift=34, sllp=0x0120, avpnm=0x07ff, tlbiel=0, 
> penc=3
> 
> Signed-off-by: Aneesh Kumar K.V 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/7f142661d41e5243c99c6ad2ff72aa

cheers


Re: cxl: Rework the implementation of cxl_stop_trace_psl9()

2017-11-07 Thread Michael Ellerman
On Wed, 2017-10-11 at 12:30:20 UTC, Vaibhav Jain wrote:
> Presently the PSL9 specific cxl_stop_trace_psl9() only stops the RX0
> traces on the CXL adapter when a PSL error irq is triggered. The patch
> updates the function to stop all the traces arrays and move them to
> the FIN state. The implementation issues the mmio to TRACECFG register
> to stop the trace array iff it is not already in FIN state. This prevents
> the issue of trace data being reset in case multiple stop mmios are
> issued for a single trace array.
> 
> Also the patch does some refactoring of existing cxl_stop_trace_psl9()
> and cxl_stop_trace_psl8() functions by moving them to 'pci.c' from
> 'debugfs.c' file and marking them as static.
> 
> Signed-off-by: Vaibhav Jain 
> Acked-by: Frederic Barrat 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/cbb55eeb49b116bb3880137661ad8c

cheers


Re: powerpc/vio: dispose of virq mapping on vdevice unregister

2017-11-07 Thread Michael Ellerman
On Fri, 2017-09-29 at 00:19:20 UTC, Tyrel Datwyler wrote:
> When a vdevice is DLPAR removed from the system the vio subsystem doesn't
> bother unmapping the virq from the irq_domain. As a result we have a virq
> mapped to a hardware irq that is no longer valid for the irq_domain. A side
> effect is that we are left with /proc/irq/<virq> affinity entries, and
> attempts to modify the smp_affinity of the irq will fail.
> 
> In the following observed example the kernel log is spammed by
> ics_rtas_set_affinity errors after the removal of a VSCSI adapter. This is a
> result of irqbalance trying to adjust the affinity every 10 seconds.
> 
> rpadlpar_io: slot U8408.E8E.10A7ACV-V5-C25 removed
> ics_rtas_set_affinity: ibm,set-xive irq=655385 returns -3
> ics_rtas_set_affinity: ibm,set-xive irq=655385 returns -3
> 
> This patch fixes the issue by calling irq_dispose_mapping() on the virq of the
> viodev on unregister.
> 
> Signed-off-by: Tyrel Datwyler 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/b8f89fea599d91e674497aad572613

cheers


Re: [kernel, v2] powerpc/powernv: Reserve a hole which appears after enabling IOV

2017-11-07 Thread Michael Ellerman
On Wed, 2017-09-27 at 06:52:31 UTC, Alexey Kardashevskiy wrote:
> In order to make generic IOV code work, the physical function IOV BAR
> should start from offset of the first VF. Since M64 segments share
> PE number space across PHB, and some PEs may be in use at the time
> when IOV is enabled, the existing code shifts the IOV BAR to the index
> of the first PE/VF. This creates a hole in IOMEM space which can be
> potentially taken by some other device.
> 
> This reserves a temporary hole on a parent and releases it when IOV is
> disabled; the temporary resources are stored in pci_dn to avoid
> kmalloc/free.
> 
> Cc: linux-...@vger.kernel.org
> Cc: Benjamin Herrenschmidt 
> Cc: Bjorn Helgaas 
> Signed-off-by: Alexey Kardashevskiy 
> Acked-by: Bjorn Helgaas 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/d6f934fd48803d9e58040e2cbab2fe

cheers


Re: powerpc/opal: Fix EBUSY bug in acquiring tokens

2017-11-07 Thread Michael Ellerman
On Fri, 2017-09-22 at 23:58:00 UTC, "William A. Kennington III" wrote:
> The current code checks the completion map to look for the first token
> that is complete. In some cases, a completion can come in but the token
> can still be on lease to the caller processing the completion. If this
> completed but unreleased token is the first token found in the bitmap by
> another task trying to acquire a token, then the __test_and_set_bit
> call will fail since the token will still be on lease. The acquisition
> will then fail with an EBUSY.
> 
> This patch reorganizes the acquisition code to look at the
> opal_async_token_map for an unleased token. If the token has no lease it
> must have no outstanding completions so we should never see an EBUSY,
> unless we have leased out too many tokens. Since
> opal_async_get_token_interruptible is protected by a semaphore, we will
> practically never see EBUSY anymore.
> 
> Signed-off-by: William A. Kennington III 
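
The described approach boils down to something like this (a minimal sketch
under the semaphore protection mentioned above; 'n', 'ntokens' and
'token_map' are illustrative names, not the actual opal-async.c symbols):

    /* Scan for a token with no lease. A leased token may still have an
     * unconsumed completion, so the lease map, not the completion map,
     * is the authoritative indicator of availability. */
    for (n = 0; n < ntokens; n++)
        if (!__test_and_set_bit(n, token_map))
            return n;   /* token n is now leased to the caller */
    return -EBUSY;      /* only if too many tokens are leased out */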

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/71e24d7731a2903b1ae2bba2b2971c

cheers


Re: [1/1] bpf: take advantage of stack_depth tracking in powerpc JIT

2017-11-07 Thread Michael Ellerman
On Fri, 2017-09-01 at 18:53:01 UTC, Sandipan Das wrote:
> Take advantage of stack_depth tracking, originally introduced for
> x64, in powerpc JIT as well. Round up allocated stack by 16 bytes
> to make sure it stays aligned for functions called from JITed bpf
> program.
> 
> Signed-off-by: Sandipan Das 
> Reviewed-by: Naveen N. Rao 
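
In effect (an illustrative one-liner, not the literal JIT code): instead of
always reserving the maximum eBPF stack, the frame is sized from the
verifier-tracked depth, rounded up for alignment:

    u32 stack_size = round_up(prog->aux->stack_depth, 16);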

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/ac0761ebcb08830d8f64b9181f6736

cheers


Re: [PATCH] watchdog: mpc8xxx: use the core worker function

2017-11-07 Thread Guenter Roeck
On Tue, Nov 07, 2017 at 05:23:56PM +0100, Christophe Leroy wrote:
> The watchdog core includes a worker function which pings the
> watchdog until the user app starts pinging it, and which also
> pings it if the HW requires more frequent pings.
> Use that function instead of the dedicated timer.
> In the meantime, we can allow the user to change the timeout.
> 
> Then change the timeout module parameter to use seconds and
> use the watchdog_init_timeout() core function.
> 
> On some HW (eg: the 8xx), SWCRR contains bits unrelated to the
> watchdog which have to be preserved upon write.
> 
> This driver has nothing preventing the use of the magic close, so
> enable it.
> 
> Signed-off-by: Christophe Leroy 
> ---
>  drivers/watchdog/mpc8xxx_wdt.c | 86 
> --
>  1 file changed, 40 insertions(+), 46 deletions(-)
> 
> diff --git a/drivers/watchdog/mpc8xxx_wdt.c b/drivers/watchdog/mpc8xxx_wdt.c
> index 366e5c7e650b..c2ac4b6b1253 100644
> --- a/drivers/watchdog/mpc8xxx_wdt.c
> +++ b/drivers/watchdog/mpc8xxx_wdt.c
> @@ -22,7 +22,6 @@
>  #include 
>  #include 
>  #include 
> -#include 
>  #include 
>  #include 
>  #include 
> @@ -31,10 +30,13 @@
>  #include 
>  #include 
>  
> +#define WATCHDOG_TIMEOUT 10
> +
>  struct mpc8xxx_wdt {
>   __be32 res0;
>   __be32 swcrr; /* System watchdog control register */
>  #define SWCRR_SWTC 0xffff0000 /* Software Watchdog Time Count. */
> +#define SWCRR_SWF  0x0008 /* Software Watchdog Freeze (mpc8xx). */
>  #define SWCRR_SWEN 0x0004 /* Watchdog Enable bit. */
>  #define SWCRR_SWRI 0x0002 /* Software Watchdog Reset/Interrupt Select 
> bit.*/
>  #define SWCRR_SWPR 0x0001 /* Software Watchdog Counter Prescale bit. */
> @@ -52,14 +54,15 @@ struct mpc8xxx_wdt_type {
>  struct mpc8xxx_wdt_ddata {
>   struct mpc8xxx_wdt __iomem *base;
>   struct watchdog_device wdd;
> - struct timer_list timer;
>   spinlock_t lock;
> + int prescaler;
>  };
>  
> -static u16 timeout = 0xffff;
> +static u16 timeout;
>  module_param(timeout, ushort, 0);
>  MODULE_PARM_DESC(timeout,
> - "Watchdog timeout in ticks. (0 + "Watchdog timeout in seconds. (1  
>  static bool reset = 1;
>  module_param(reset, bool, 0);
> @@ -80,31 +83,35 @@ static void mpc8xxx_wdt_keepalive(struct 
> mpc8xxx_wdt_ddata *ddata)
>   spin_unlock(&ddata->lock);
>  }
>  
> -static void mpc8xxx_wdt_timer_ping(unsigned long arg)
> -{
> - struct mpc8xxx_wdt_ddata *ddata = (void *)arg;
> -
> - mpc8xxx_wdt_keepalive(ddata);
> - /* We're pinging it twice faster than needed, just to be sure. */
> - mod_timer(&ddata->timer, jiffies + HZ * ddata->wdd.timeout / 2);
> -}
> -
>  static int mpc8xxx_wdt_start(struct watchdog_device *w)
>  {
>   struct mpc8xxx_wdt_ddata *ddata =
>   container_of(w, struct mpc8xxx_wdt_ddata, wdd);
> -
> - u32 tmp = SWCRR_SWEN | SWCRR_SWPR;
> + u32 tmp;
> + u32 timeout;
>  
>   /* Good, fire up the show */
> + tmp = in_be32(&ddata->base->swcrr);
> + tmp &= ~(SWCRR_SWTC | SWCRR_SWF | SWCRR_SWEN | SWCRR_SWRI | SWCRR_SWPR);
> + tmp |= SWCRR_SWEN | SWCRR_SWPR;
> +
>   if (reset)
>   tmp |= SWCRR_SWRI;
>  
> - tmp |= timeout << 16;
> + timeout = ddata->wdd.timeout * fsl_get_sys_freq() / ddata->prescaler;
> + tmp |= min(timeout, 0xffffU) << 16;
>  
>   out_be32(&ddata->base->swcrr, tmp);
>  
> - del_timer_sync(&ddata->timer);
> + tmp = in_be32(&ddata->base->swcrr);
> + if (!(tmp & SWCRR_SWEN))
> + return -EOPNOTSUPP;
> +
> + ddata->wdd.max_hw_heartbeat_ms = ((tmp >> 16) * ddata->prescaler) /
> +  (fsl_get_sys_freq() / 1000);
> + ddata->wdd.min_timeout = ddata->wdd.max_hw_heartbeat_ms / 1000;

That is odd. Why set the minimum timeout period to the maximum hardware
timeout, rounded down to a full second ?

Note that it is wrong to set the values for min_timeout and
max_hw_heartbeat_ms in the _start function; they should be set
in the probe function. The start function should also not update
ddata->wdd.timeout; that should happen in set_timeout if supported,
or in probe.
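
Concretely, the suggestion amounts to moving the computation into probe,
along these lines (a sketch only, reusing the patch's names; 'freq' is the
fsl_get_sys_freq() value already read in mpc8xxx_wdt_probe()):

    /* The 16-bit SWTC field bounds the hardware interval, so the
     * limits are known as soon as the prescaler and clock are. */
    ddata->wdd.max_hw_heartbeat_ms = (0xFFFFULL * ddata->prescaler) /
                                     (freq / 1000);
    ddata->wdd.min_timeout = 1;
    watchdog_init_timeout(&ddata->wdd, timeout, &ofdev->dev);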

> + if (ddata->wdd.timeout < ddata->wdd.min_timeout)
> + ddata->wdd.timeout = ddata->wdd.min_timeout;
>  
>   return 0;
>  }
> @@ -118,17 +125,8 @@ static int mpc8xxx_wdt_ping(struct watchdog_device *w)
>   return 0;
>  }
>  
> -static int mpc8xxx_wdt_stop(struct watchdog_device *w)
> -{
> - struct mpc8xxx_wdt_ddata *ddata =
> - container_of(w, struct mpc8xxx_wdt_ddata, wdd);
> -
> - mod_timer(>timer, jiffies);
> - return 0;
> -}
> -
>  static struct watchdog_info mpc8xxx_wdt_info = {
> - .options = WDIOF_KEEPALIVEPING,
> + .options = WDIOF_KEEPALIVEPING | WDIOF_MAGICCLOSE | WDIOF_SETTIMEOUT,
>   .firmware_version = 1,
>   .identity = "MPC8xxx",
>  };
> @@ 

Re: [PATCH v9 00/51] powerpc, mm: Memory Protection Keys

2017-11-07 Thread Dave Hansen
On 11/07/2017 02:39 PM, Ram Pai wrote:
> 
> As per the current semantics of sys_pkey_free(); the way I understand it,
> the calling thread is saying disassociate me from this key.

No.  It is saying: "this *process* no longer has any uses of this key,
it can be reused".


Re: [PATCH v9 00/51] powerpc, mm: Memory Protection Keys

2017-11-07 Thread Ram Pai
On Tue, Nov 07, 2017 at 08:32:16AM +0100, Florian Weimer wrote:
> * Ram Pai:
> 
> > On Mon, Nov 06, 2017 at 10:28:41PM +0100, Florian Weimer wrote:
> >> * Ram Pai:
> >> 
> >> > Testing:
> >> > ---
> >> > This patch series has passed all the protection key
> >> > tests available in the selftest directory.The
> >> > tests are updated to work on both x86 and powerpc.
> >> > The selftests have passed on x86 and powerpc hardware.
> >> 
> >> How do you deal with the key reuse problem?  Is it the same as x86-64,
> >> where it's quite easy to accidentally grant existing threads access to
> >> a just-allocated key, either due to key reuse or a changed init_pkru
> >> parameter?
> >
> > I am not sure how on x86-64, two threads get allocated the same key
> > at the same time? the key allocation is guarded under the mmap_sem
> > semaphore. So there cannot be a race where two threads get allocated
> > the same key.
> 
> The problem is a pkey_alloc/pthread_create/pkey_free/pkey_alloc
> sequence.  The pthread_create call makes the new thread inherit the
> access rights of the current thread, but then the key is deallocated.
> Reallocation of the same key will have that thread retain its access
> rights, which is IMHO not correct.

(Dave Hansen: please correct me if I miss-speak below)

As per the current semantics of sys_pkey_free(), the way I understand it,
the calling thread is saying disassociate me from this key. Other
threads continue to be associated with the key and could continue to
get key-faults, but this calling thread will not get key-faults on that
key any more.

Also the key should not get reallocated till all the threads in the process
have disassociated from the key, by calling sys_pkey_free().

From that point of view, I think there is a bug in the implementation of
pkey on x86 and now on powerpc as well.

> 
> > Can you point me to the issue, if it is already discussed somewhere?
> 
> See ‘MPK: pkey_free and key reuse’ on various lists (including
> linux-mm and linux-arch).
> 
> It has a test case attached which demonstrates the behavior.
> 
> > As far as the semantics is concerned, a key allocated in one thread's
> > context has no meaning if used in some other threads context within the
> > same process.  The app should not try to re-use a key allocated in a
> > thread's context in some other threads's context.
> 
> Uh-oh, that's not how this feature works on x86-64 at all.  There, the
> keys are a process-global resource.  Treating them per-thread
> seriously reduces their usefulness.

Sorry. I was not thinking right. Let me restate.

A key is a global resource, but the permissions on a key are
local to a thread. For example, the same key could disable
access on a page for one thread, while it could disable write
on the same page for another thread.

> 
> >> What about siglongjmp from a signal handler?
> >
> > On powerpc there is some relief.  The permissions on a key can be
> > modified from anywhere, including from the signal handler, and the
> > effect will be immediate.  You don't have to wait till the
> > signal handler returns for the key permissions to be restored.
> 
> My concern is that the signal handler knows nothing about protection
> keys, but the current x86-64 semantics will cause it to clobber the
> access rights of the current thread.
> 
> > also after return from the sigsetjmp();
> > possibly caused by siglongjmp(), the program can restore the permission
> > on any key.
> 
> So that's not really an option.
> 
> > Atleast that is my theory. Can you give me a testcase; if you have one
> > handy.
> 
> The glibc patch I posted under the ‘MPK: pkey_free and key reuse’
> thread covers this, too.

Thanks, I will try the test case with my kernel patches. But on
powerpc one can change the permissions on a key in the signal handler,
which takes effect immediately, so there should not be a bug
on powerpc.

x86 has the requirement that it has to return from the signal handler
back to the kernel in order to change the permission on a key, which
can cause issues with longjmp.
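
A sketch of what that permits on powerpc (using the glibc-style pkey_set()
wrapper; the wrapper name and the global 'key' are assumptions for this
illustration):

    static void handler(int sig)
    {
        /* On powerpc the underlying AMR update takes effect
         * immediately, with no trip back through the kernel. */
        pkey_set(key, 0);   /* restore full access rights */
    }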

RP



linux-next: manual merge of the powerpc tree with Linus' tree

2017-11-07 Thread Stephen Rothwell
Hi all,

Today's linux-next merge of the powerpc tree got a conflict in:

  arch/powerpc/mm/tlb-radix.c

between commit:

  26e53d5ebe2e ("powerpc/64s/radix: Fix preempt imbalance in TLB flush")

from Linus' tree and commit:

  dffe8449c5dd ("powerpc/64s/radix: Improve preempt handling in TLB code")

from the powerpc tree.

I fixed it up (I effectively dropped the former as it seems to be fixed in
the latter as well) and can carry the fix as necessary. This is now fixed
as far as linux-next is concerned, but any non trivial conflicts should
be mentioned to your upstream maintainer when your tree is submitted for
merging.  You may also want to consider cooperating with the maintainer
of the conflicting tree to minimise any particularly complex conflicts.

-- 
Cheers,
Stephen Rothwell


[PATCH] watchdog: mpc8xxx: use the core worker function

2017-11-07 Thread Christophe Leroy
The watchdog core includes a worker function which pings the
watchdog until the user app starts pinging it, and which also
pings it if the HW requires more frequent pings.
Use that function instead of the dedicated timer.
In the meantime, we can allow the user to change the timeout.

Then change the timeout module parameter to use seconds and
use the watchdog_init_timeout() core function.

On some HW (eg: the 8xx), SWCRR contains bits unrelated to the
watchdog which have to be preserved upon write.

This driver has nothing preventing the use of the magic close, so
enable it.
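
To put numbers on the seconds-based parameter (a worked example assuming a
66 MHz system clock and a 0x10000 prescaler; both values vary by SoC):

    /* One SWTC tick = prescaler / freq = 65536 / 66000000 ~= 0.99 ms.
     * For timeout = 10 s:
     *   SWTC = 10 * 66000000 / 65536 ~= 10071 ticks (fits in 16 bits)
     * The 16-bit field caps the raw HW interval at about
     *   65535 * 65536 / 66000000 ~= 65 s;
     * longer user timeouts rely on the core worker pinging the
     * hardware within max_hw_heartbeat_ms. */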

Signed-off-by: Christophe Leroy 
---
 drivers/watchdog/mpc8xxx_wdt.c | 86 --
 1 file changed, 40 insertions(+), 46 deletions(-)

diff --git a/drivers/watchdog/mpc8xxx_wdt.c b/drivers/watchdog/mpc8xxx_wdt.c
index 366e5c7e650b..c2ac4b6b1253 100644
--- a/drivers/watchdog/mpc8xxx_wdt.c
+++ b/drivers/watchdog/mpc8xxx_wdt.c
@@ -22,7 +22,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -31,10 +30,13 @@
 #include 
 #include 
 
+#define WATCHDOG_TIMEOUT 10
+
 struct mpc8xxx_wdt {
__be32 res0;
__be32 swcrr; /* System watchdog control register */
 #define SWCRR_SWTC 0xffff0000 /* Software Watchdog Time Count. */
+#define SWCRR_SWF  0x0008 /* Software Watchdog Freeze (mpc8xx). */
 #define SWCRR_SWEN 0x0004 /* Watchdog Enable bit. */
 #define SWCRR_SWRI 0x0002 /* Software Watchdog Reset/Interrupt Select 
bit.*/
 #define SWCRR_SWPR 0x0001 /* Software Watchdog Counter Prescale bit. */
@@ -52,14 +54,15 @@ struct mpc8xxx_wdt_type {
 struct mpc8xxx_wdt_ddata {
struct mpc8xxx_wdt __iomem *base;
struct watchdog_device wdd;
-   struct timer_list timer;
spinlock_t lock;
+   int prescaler;
 };
 
-static u16 timeout = 0xffff;
+static u16 timeout;
 module_param(timeout, ushort, 0);
 MODULE_PARM_DESC(timeout,
-   "Watchdog timeout in ticks. (0lock);
 }
 
-static void mpc8xxx_wdt_timer_ping(unsigned long arg)
-{
-   struct mpc8xxx_wdt_ddata *ddata = (void *)arg;
-
-   mpc8xxx_wdt_keepalive(ddata);
-   /* We're pinging it twice faster than needed, just to be sure. */
-   mod_timer(&ddata->timer, jiffies + HZ * ddata->wdd.timeout / 2);
-}
-
 static int mpc8xxx_wdt_start(struct watchdog_device *w)
 {
struct mpc8xxx_wdt_ddata *ddata =
container_of(w, struct mpc8xxx_wdt_ddata, wdd);
-
-   u32 tmp = SWCRR_SWEN | SWCRR_SWPR;
+   u32 tmp;
+   u32 timeout;
 
/* Good, fire up the show */
+   tmp = in_be32(&ddata->base->swcrr);
+   tmp &= ~(SWCRR_SWTC | SWCRR_SWF | SWCRR_SWEN | SWCRR_SWRI | SWCRR_SWPR);
+   tmp |= SWCRR_SWEN | SWCRR_SWPR;
+
if (reset)
tmp |= SWCRR_SWRI;
 
-   tmp |= timeout << 16;
+   timeout = ddata->wdd.timeout * fsl_get_sys_freq() / ddata->prescaler;
+   tmp |= min(timeout, 0xffffU) << 16;
 
 out_be32(&ddata->base->swcrr, tmp);
 
-   del_timer_sync(&ddata->timer);
+   tmp = in_be32(&ddata->base->swcrr);
+   if (!(tmp & SWCRR_SWEN))
+   return -EOPNOTSUPP;
+
+   ddata->wdd.max_hw_heartbeat_ms = ((tmp >> 16) * ddata->prescaler) /
+(fsl_get_sys_freq() / 1000);
+   ddata->wdd.min_timeout = ddata->wdd.max_hw_heartbeat_ms / 1000;
+   if (ddata->wdd.timeout < ddata->wdd.min_timeout)
+   ddata->wdd.timeout = ddata->wdd.min_timeout;
 
return 0;
 }
@@ -118,17 +125,8 @@ static int mpc8xxx_wdt_ping(struct watchdog_device *w)
return 0;
 }
 
-static int mpc8xxx_wdt_stop(struct watchdog_device *w)
-{
-   struct mpc8xxx_wdt_ddata *ddata =
-   container_of(w, struct mpc8xxx_wdt_ddata, wdd);
-
-   mod_timer(&ddata->timer, jiffies);
-   return 0;
-}
-
 static struct watchdog_info mpc8xxx_wdt_info = {
-   .options = WDIOF_KEEPALIVEPING,
+   .options = WDIOF_KEEPALIVEPING | WDIOF_MAGICCLOSE | WDIOF_SETTIMEOUT,
.firmware_version = 1,
.identity = "MPC8xxx",
 };
@@ -137,7 +135,6 @@ static struct watchdog_ops mpc8xxx_wdt_ops = {
.owner = THIS_MODULE,
.start = mpc8xxx_wdt_start,
.ping = mpc8xxx_wdt_ping,
-   .stop = mpc8xxx_wdt_stop,
 };
 
 static int mpc8xxx_wdt_probe(struct platform_device *ofdev)
@@ -148,7 +145,6 @@ static int mpc8xxx_wdt_probe(struct platform_device *ofdev)
struct mpc8xxx_wdt_ddata *ddata;
u32 freq = fsl_get_sys_freq();
bool enabled;
-   unsigned int timeout_sec;
 
 wdt_type = of_device_get_match_data(&ofdev->dev);
if (!wdt_type)
@@ -173,35 +169,34 @@ static int mpc8xxx_wdt_probe(struct platform_device 

Re: [PATCH v3] kernel/module_64.c: Add REL24 relocation support of livepatch symbols

2017-11-07 Thread Josh Poimboeuf
On Tue, Nov 07, 2017 at 12:31:05PM +0100, Torsten Duwe wrote:
> On Tue, Nov 07, 2017 at 07:34:29PM +1100, Michael Ellerman wrote:
> > > So, just brainstorming a bit, here are the possible solutions I can
> > > think of:
> > >
> > > a) Create a special klp stub for such calls (as in Kamalesh's patch)
> > >
> > > b) Have kpatch-build rewrite the function to insert nops after calls to
> > >previously-local functions.  This would also involve adjusting the
> > >offsets of intra-function branches and relocations which come
> > >afterwards in the same section.  And also patching up the DWARF
> > >debuginfo, if we care about that (I think we do).  And also patching
> > >up the jump tables which GCC sometimes creates for switch statements.
> > >Yuck.  I'm pretty sure this is a horrible idea.
> > 
> > It's fairly horrible. It might be *less* horrible if you generated an
> > assembly listing using the compiler, modified that, and then fed that
> > through the assembler and linker.

Ah, that would work a lot better.

> > > c) Create a new GCC flag which treats all calls as global, which can be
> > >used by kpatch-build to generate the right code (assuming this flag
> > >doesn't already exist).  This would be a good option, I think.
> > 
> > That's not impossible, but I doubt it will be all that popular with the
> > toolchain folks who'd have to implement it :)  It will also take a long
> > time to percolate out to users.
> 
> BTDT ;-)

Yeah, maybe that's not so realistic.

> > > d) Have kpatch-build do some other kind of transformation?  For example,
> > >maybe it could generate klp stubs which the callee calls into.  Each
> > >klp stub could then do a proper global call to the SHN_LIVEPATCH
> > >symbol.
> > 
> > That could work.
> Indeed. There is no reason to offload that onto the kernel module loader.

I agree that doing the same work in tooling would be better than adding
complexity to the kernel.

> > > Do I understand the problem correctly?  Do the potential solutions make
> > > sense?  Any other possible solutions I missed?
> > 
> > Possibly, I'm not really across how kpatch works in detail and what the
> > constraints are.
> > 
> > One option would be to detect any local calls made by the patched
> > function and pull those in as well - ie. make them part of the patch.
> > The pathological case could obviously end up pulling in every function
> > in the kernel, but in practice that's probably unlikely. It already
> > happens to some extent anyway via inlining.

That's definitely a possibility, but I'd rather not go there because it
increases risk and cognitive load.

> > If modifying the source is an option, a sneaky solution is to mark the
> > local functions as weak, which means the compiler/linker has to assume
> > they could become global calls.

That could work, but it means the patch author has to modify the patch.
I'd rather have tooling hide this problem somehow.

> This might also be doable with a gcc "plugin", which would not require changes
> to the compiler itself.

Hm, that's not a bad idea.

> OTOH there's no such thing as a weak static function...

Yeah.  Instead of weak, maybe just make them global (remove the
"static")?

Anyway, thanks for the ideas.  I may try them out and see what sticks.

-- 
Josh


[PATCH] fbdev: controlfb: Add missing modes to fix out of bounds access

2017-11-07 Thread Geert Uytterhoeven
Dan's static analysis says:

drivers/video/fbdev/controlfb.c:560 control_setup()
error: buffer overflow 'control_mac_modes' 20 <= 21

Indeed, control_mac_modes[] has only 20 elements, while VMODE_MAX is 22,
which may lead to an out of bounds read when parsing vmode commandline
options.
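
The overflow is easy to see in sketch form (simplified from the vmode
option parsing that the static checker flagged; not the verbatim
control_setup() code):

    int vmode = simple_strtoul(opt + 6, NULL, 0);   /* "vmode:NN" */

    if (vmode > 0 && vmode <= VMODE_MAX &&          /* VMODE_MAX == 22 */
        control_mac_modes[vmode - 1].m[1] >= 0)     /* index 21 into a
                                                       20-entry array */
        ...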

The bug was introduced in v2.4.5.6, when 2 new modes were added to
macmodes.h, but control_mac_modes[] wasn't updated:

https://kernel.opensuse.org/cgit/kernel/diff/include/video/macmodes.h?h=v2.5.2&id=29f279c764808560eaceb88fef36cbc35c529aad

Augment control_mac_modes[] with the two new video modes to fix this.

Reported-by: Dan Carpenter 
Signed-off-by: Geert Uytterhoeven 
---
Compile-tested only.

The 1152x768 mode should be OK.
Given the 1600x1024 mode has a lower dotclock (112 MHz) than the
supported 1280x960 mode, it's probably OK, too.
---
 drivers/video/fbdev/controlfb.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/video/fbdev/controlfb.h b/drivers/video/fbdev/controlfb.h
index 6026c60fc1007e00..261522fabdac89ae 100644
--- a/drivers/video/fbdev/controlfb.h
+++ b/drivers/video/fbdev/controlfb.h
@@ -141,5 +141,7 @@ static struct max_cmodes control_mac_modes[] = {
{{ 1, 2}},  /* 1152x870, 75Hz */
{{ 0, 1}},  /* 1280x960, 75Hz */
{{ 0, 1}},  /* 1280x1024, 75Hz */
+   {{ 1, 2}},  /* 1152x768, 60Hz */
+   {{ 0, 1}},  /* 1600x1024, 60Hz */
 };
 
-- 
2.7.4



Re: POWER: Unexpected fault when writing to brk-allocated memory

2017-11-07 Thread Kirill A. Shutemov
On Tue, Nov 07, 2017 at 07:15:58PM +0530, Aneesh Kumar K.V wrote:
> 
> > 
> > If it is decided to keep these kind of heuristics, can we get just a
> > small but reasonably precise description of each change to the
> > interface and ways for using the new functionality, such that would be
> > suitable for the man page? I couldn't fix powerpc because nothing
> > matches and even Aneesh and you differ on some details (MAP_FIXED
> > behaviour).
> 
> 
> I would consider MAP_FIXED as my mistake. We never discussed this explicitly
> and I kind of assumed it to behave the same way. ie, we search in lower
> address space (128TB) if the hint addr is below 128TB.
> 
> IIUC we agree on the below.
> 
> 1) MAP_FIXED allows the addr to be used, even if hint addr is below 128TB but
> hint_addr + len is > 128TB.
> 
> 2) For everything else we search in < 128TB space if hint addr is below
> 128TB
> 
> 3) We don't switch to large address space if hint_addr + len > 128TB. The
> decision to switch to large address space is primarily based on hint addr
> 
> Is there any other rule we need to outline? Or is any of the above not
> correct?

That's correct.
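
Translated into calls, the three rules read like this (a hedged userspace
sketch; the addresses and sizes are illustrative):

    #define TB (1UL << 40)
    int prot = PROT_READ | PROT_WRITE;
    int flags = MAP_PRIVATE | MAP_ANONYMOUS;

    /* Rule 1: MAP_FIXED is honoured even though addr + len > 128TB */
    mmap((void *)(127 * TB), 2 * TB, prot, flags | MAP_FIXED, -1, 0);

    /* Rule 2: hint below 128TB, so the search stays below 128TB */
    mmap((void *)(64 * TB), 1 * TB, prot, flags, -1, 0);

    /* Rule 3: only a hint above 128TB opts into the large VA space */
    mmap((void *)(256 * TB), 1 * TB, prot, flags, -1, 0);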

-- 
 Kirill A. Shutemov



Re: POWER: Unexpected fault when writing to brk-allocated memory

2017-11-07 Thread Aneesh Kumar K.V




If it is decided to keep these kind of heuristics, can we get just a
small but reasonably precise description of each change to the
interface and ways for using the new functionality, such that would be
suitable for the man page? I couldn't fix powerpc because nothing
matches and even Aneesh and you differ on some details (MAP_FIXED
behaviour).



I would consider MAP_FIXED as my mistake. We never discussed this 
explicitly and I kind of assumed it to behave the same way. ie, we 
search in lower address space (128TB) if the hint addr is below 128TB.


IIUC we agree on the below.

1) MAP_FIXED allows the addr to be used, even if hint addr is below 128TB
but hint_addr + len is > 128TB.


2) For everything else we search in < 128TB space if hint addr is below 
128TB


3) We don't switch to large address space if hint_addr + len > 128TB. 
The decision to switch to large address space is primarily based on hint 
addr


Is there any other rule we need to outline? Or is any of the above not 
correct?


-aneesh



Re: POWER: Unexpected fault when writing to brk-allocated memory

2017-11-07 Thread Nicholas Piggin
On Tue, 7 Nov 2017 15:28:25 +0300
"Kirill A. Shutemov"  wrote:

> On Tue, Nov 07, 2017 at 10:56:36PM +1100, Nicholas Piggin wrote:
> > > No, it won't. You will hit stack first.  
> > 
> > I guess so. Florian's bug didn't crash there for some reason, okay
> > but I suppose my point about brk is not exactly where the standard
> > heap is, but the pattern of allocations. An allocator that uses
> > mmap for managing its address space might do the same thing, e.g.,
> > incrementally expand existing mmaps as necessary.  
> 
> With MAP_FIXED? I don't think so.

brk()-based allocator effectively uses MAP_FIXED. If you know where
your addresses are, MAP_FIXED can be used.

But okay let's ignore MAP_FIXED as a corner case. Then what happens
with !MAP_FIXED when an allocation ends exactly at 128TB and then next
one begins at 128TB? Won't that expand the address space? Should we
ignore that corner case too?

> 
> > > > Second, the kernel can never completely solve the problem this way.
> > > > How do we know a malloc library will not ask for > 128TB addresses
> > > > and pass them to an unknowing application?
> > > 
> > > The idea is that an application can provide hint (mallopt() ?) to malloc
> > > implementation that it's ready to use the full address space. In this case, malloc
> > > can use mmap((void *) -1,...) for its allocations and get full address
> > > space this way.  
> > 
> > Point is, there's nothing stopping an allocator library or runtime
> > from asking for mmap anywhere and returning it to userspace.  
> 
> Right. Nobody would stop it from doing stupid things. There are many
> things that a library may do that application would not be happy about.

Indeed.

> 
> > Do > 128TB pointers matter so much that we should add this heuristic
> > to prevent breakage, but little enough that we can accept some rare
> > cases getting through? Genuine question.  
> 
> At the end of the day what matters is if heuristic helps prevent breakage
> of existing userspace and doesn't stay in the way of legitimate use of
> full address space.
> 
> So far, it looks okay to me.

Well that wasn't really the point of my question. Yes of course that
is important. But the question is how are these heuristics chosen and
evaluated? Why is this a good change to make?

We've decided there is some benefit from preventing 128TB pointers, but
also not enough that we have to completely prevent them accidentally
being returned. Yet it's important enough to make mmap behaviour diverge
in about 5 ways around 128TB depending on what combination of address
and length and MAP_FIXED you specify, and introducing this new way to
use the interface to get an expanded address space?

And we're doing this because we don't want to add a prctl or personality
or whatever for SAP HANA, because it was decided that would be too complex?

> 
> > > The idea was we shouldn't allow to slip above 47-bits by accidentally.
> > > 
> > > Correctly functioning program would never request addr+len above 47-bit
> > > with MAP_FIXED, unless it's ready to handle such addresses. Otherwise the
> > > request would simply fail on machine that doesn't support large VA.
> > > 
> > > In contrast, addr+len above 47-bit without MAP_FIXED will not fail on
> > > machine that doesn't support large VA, kernel will find another place
> > > under 47-bit. And I can imagine a reasonable application that does
> > > something like this.
> > > 
> > > So we cannot rely that application is ready to handle large
> > > addresses if we see addr+len without MAP_FIXED.  
> > 
> > By the same logic, a request for addr > 128TB without MAP_FIXED will
> > not fail, therefore we can't rely on that either.
> > 
> > Or an app that links to a library that attempts MAP_FIXED allocation
> > of addr + len above 128TB might use high bits of pointer returned by
> > that library because those are never satisfied today and the library
> > would fall back.  
> 
> If you want to point that it's ABI break, yes it is.
> 
> But we allow ABI break as long as nobody notices. I think it's reasonable
> to expect that nobody relies on such corner cases.

I accept the point, but I do worry mmap syscall is very fundamental,
and that it is used in a lot of ways that are difficult to foresee,
so there's a lot of room for unintended consequences. Plus the state
of the code (that's not to pick on x86, powerpc is worse) is worrying.

> 
> If we would find any piece of software affect by the change we would need
> to reconsider.
> 

Problem is that if there was an issue caused by this, it's unlikely
to be found until a long time later. By that time, hopefully there
aren't too many other apps that now rely on the behaviour if it
has to be changed.

I just don't see why that's a risk worth taking at all. I can't see how
the cost/benefit is there. I have to confess I could have been more
involved in the discussion, but is any other interface honestly too complex
to add?

If it is decided to keep these kind of heuristics, can we get just a
small but reasonably precise description of each change to the
interface and ways for using the new functionality, such that would be
suitable for the man page?

Re: POWER: Unexpected fault when writing to brk-allocated memory

2017-11-07 Thread Kirill A. Shutemov
On Tue, Nov 07, 2017 at 02:05:42PM +0100, Florian Weimer wrote:
> On 11/07/2017 12:44 PM, Kirill A. Shutemov wrote:
> > On Tue, Nov 07, 2017 at 12:26:12PM +0100, Florian Weimer wrote:
> > > On 11/07/2017 12:15 PM, Kirill A. Shutemov wrote:
> > > 
> > > > > First of all, using addr and MAP_FIXED to develop our heuristic can
> > > > > never really give unchanged ABI. It's an in-band signal. brk() is a
> > > > > good example that steadily keeps incrementing address, so depending
> > > > > on malloc usage and address space randomization, you will get a brk()
> > > > > that ends exactly at 128T, then the next one will be >
> > > > > DEFAULT_MAP_WINDOW, and it will switch you to 56 bit address space.
> > > > 
> > > > No, it won't. You will hit stack first.
> > > 
> > > That's not actually true on POWER in some cases.  See the process maps I
> > > posted here:
> > > 
> > >
> > 
> > Hm? I see that in all three cases the [stack] is the last mapping.
> > Do I miss something?
> 
> Hah, I had not noticed.  Occasionally, the order of heap and stack is
> reversed.  This happens in approximately 15% of the runs.

Heh. I guess ASLR on Power is too fancy :)

That's a strange layout. It doesn't give much (relatively speaking)
virtual address space for both stack and heap to grow.

-- 
 Kirill A. Shutemov


Re: POWER: Unexpected fault when writing to brk-allocated memory

2017-11-07 Thread Florian Weimer

On 11/07/2017 12:44 PM, Kirill A. Shutemov wrote:

On Tue, Nov 07, 2017 at 12:26:12PM +0100, Florian Weimer wrote:

On 11/07/2017 12:15 PM, Kirill A. Shutemov wrote:


First of all, using addr and MAP_FIXED to develop our heuristic can
never really give unchanged ABI. It's an in-band signal. brk() is a
good example that steadily keeps incrementing address, so depending
on malloc usage and address space randomization, you will get a brk()
that ends exactly at 128T, then the next one will be >
DEFAULT_MAP_WINDOW, and it will switch you to 56 bit address space.


No, it won't. You will hit stack first.


That's not actually true on POWER in some cases.  See the process maps I
posted here:

   


Hm? I see that in all three cases the [stack] is the last mapping.
Do I miss something?


Hah, I had not noticed.  Occasionally, the order of heap and stack is 
reversed.  This happens in approximately 15% of the runs.


See the attached example.

Thanks,
Florian
7fffacc5-7fffacc9 rw-p  00:00 0 
7fffacc9-7fffaccf r--p  fd:00 25167925   
/usr/lib/locale/en_US.utf8/LC_CTYPE
7fffaccf-7fffacd0 r--p  fd:00 25167928   
/usr/lib/locale/en_US.utf8/LC_NUMERIC
7fffacd0-7fffacd1 r--p  fd:00 16798929   
/usr/lib/locale/en_US.utf8/LC_TIME
7fffacd1-7ffface4 r--p  fd:00 25167924   
/usr/lib/locale/en_US.utf8/LC_COLLATE
7ffface4-7ffface5 r--p  fd:00 16798927   
/usr/lib/locale/en_US.utf8/LC_MONETARY
7ffface5-7ffface6 r--p  fd:00 2511   
/usr/lib/locale/en_US.utf8/LC_MESSAGES/SYS_LC_MESSAGES
7ffface6-7ffface7 r--p  fd:00 16798942   
/usr/lib/locale/en_US.utf8/LC_PAPER
7ffface7-7ffface8 r--p  fd:00 25167927   
/usr/lib/locale/en_US.utf8/LC_NAME
7ffface8-7ffface9 r--p  fd:00 16798924   
/usr/lib/locale/en_US.utf8/LC_ADDRESS
7ffface9-7fffacea r--p  fd:00 16798928   
/usr/lib/locale/en_US.utf8/LC_TELEPHONE
7fffacea-7fffaceb r--p  fd:00 16798926   
/usr/lib/locale/en_US.utf8/LC_MEASUREMENT
7fffaceb-7fffacec r--s  fd:00 8390657
/usr/lib64/gconv/gconv-modules.cache
7fffacec-7fffad0d r-xp  fd:00 8390335
/usr/lib64/libc-2.25.so
7fffad0d-7fffad0e ---p 0021 fd:00 8390335
/usr/lib64/libc-2.25.so
7fffad0e-7fffad0f r--p 0021 fd:00 8390335
/usr/lib64/libc-2.25.so
7fffad0f-7fffad10 rw-p 0022 fd:00 8390335
/usr/lib64/libc-2.25.so
7fffad10-7fffad11 r--p  fd:00 16798925   
/usr/lib/locale/en_US.utf8/LC_IDENTIFICATION
7fffad11-7fffad12 r-xp  fd:00 63543  
/usr/bin/cat
7fffad12-7fffad13 r--p  fd:00 63543  
/usr/bin/cat
7fffad13-7fffad14 rw-p 0001 fd:00 63543  
/usr/bin/cat
7fffad14-7fffad16 r-xp  00:00 0  [vdso]
7fffad16-7fffad1a r-xp  fd:00 8390328
/usr/lib64/ld-2.25.so
7fffad1a-7fffad1b r--p 0003 fd:00 8390328
/usr/lib64/ld-2.25.so
7fffad1b-7fffad1c rw-p 0004 fd:00 8390328
/usr/lib64/ld-2.25.so
7fffc2cf-7fffc2d2 rw-p  00:00 0  [stack]
7fffc8c1-7fffc8c4 rw-p  00:00 0  [heap]


Re: [PATCH v1] powerpc/pci: convert to use for_each_pci_bridge() helper

2017-11-07 Thread Andy Shevchenko
On Tue, 2017-10-31 at 20:40 +0200, Andy Shevchenko wrote:
> On Tue, 2017-10-31 at 13:33 -0500, Bjorn Helgaas wrote:
> > On Tue, Oct 31, 2017 at 10:12 AM, Andy Shevchenko
> >  wrote:
> > > On Fri, 2017-10-13 at 19:52 +0300, Andy Shevchenko wrote:
> > > > ...which makes code slightly cleaner.
> > > 
> > > +Cc: Bjorn
> > > 
> > > Perhaps it makes sense to pass this through PCI if no one objects?
> > 
> > Fine with me, but I only apply things that appear on the
> > linux-...@vger.kernel.org mailing list.
> 
> I want to wait for powerpc maintainers to Ack and then I'll resend.

Michael, Paul, Benjamin, do you have any concerns on this patch?


-- 
Andy Shevchenko 
Intel Finland Oy


Re: POWER: Unexpected fault when writing to brk-allocated memory

2017-11-07 Thread Kirill A. Shutemov
On Tue, Nov 07, 2017 at 10:56:36PM +1100, Nicholas Piggin wrote:
> > No, it won't. You will hit stack first.
> 
> I guess so. Florian's bug didn't crash there for some reason, okay.
> But I suppose my point about brk is not really about where the standard
> heap sits, but about the pattern of allocations. An allocator that uses
> mmap for managing its address space might do the same thing, e.g.,
> incrementally expand existing mmaps as necessary.

With MAP_FIXED? I don't think so.

> > > Second, the kernel can never completely solve the problem this way.
> > > How do we know a malloc library will not ask for > 128TB addresses
> > > and pass them to an unknowing application?  
> > 
> > The idea is that an application can provide a hint (mallopt() ?) to the
> > malloc implementation that it's ready to use the full address space. In
> > this case, malloc can use mmap((void *) -1, ...) for its allocations and
> > get the full address space this way.
> 
> Point is, there's nothing stopping an allocator library or runtime
> from asking for mmap anywhere and returning it to userspace.

Right. Nobody would stop it from doing stupid things. There are many
things that a library may do that an application would not be happy about.

> Do > 128TB pointers matter so much that we should add this heuristic
> to prevent breakage, but little enough that we can accept some rare
> cases getting through? Genuine question.

At the end of the day, what matters is whether the heuristic helps prevent
breakage of existing userspace and doesn't stand in the way of legitimate
use of the full address space.

So far, it looks okay to me.

> > The idea was that we shouldn't allow slipping above 47 bits accidentally.
> > 
> > A correctly functioning program would never request addr+len above 47 bits
> > with MAP_FIXED unless it's ready to handle such addresses. Otherwise the
> > request would simply fail on a machine that doesn't support large VA.
> > 
> > In contrast, addr+len above 47 bits without MAP_FIXED will not fail on a
> > machine that doesn't support large VA; the kernel will find another place
> > under 47 bits. And I can imagine a reasonable application that does
> > something like this.
> > 
> > So we cannot rely on an application being ready to handle large
> > addresses just because we see addr+len without MAP_FIXED.
> 
> By the same logic, a request for addr > 128TB without MAP_FIXED will
> not fail, therefore we can't rely on that either.
> 
> Or an app that links to a library that attempts a MAP_FIXED allocation
> of addr + len above 128TB might use the high bits of the pointer returned
> by that library, because such requests are never satisfied today and the
> library would fall back.

If you want to point out that it's an ABI break, yes it is.

But we allow ABI breaks as long as nobody notices. I think it's reasonable
to expect that nobody relies on such corner cases.

If we find any piece of software affected by the change, we will need
to reconsider.

-- 
 Kirill A. Shutemov


Re: [PATCH v2 11/18] powerpc/vas: Export HVWC to debugfs

2017-11-07 Thread Michael Ellerman
Sukadev Bhattiprolu  writes:

> diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
> b/arch/powerpc/platforms/powernv/vas-window.c
> index 23c13a7..088ce56 100644
> --- a/arch/powerpc/platforms/powernv/vas-window.c
> +++ b/arch/powerpc/platforms/powernv/vas-window.c
> @@ -145,24 +145,42 @@ static void unmap_paste_region(struct vas_window 
> *window)
>  }
>  
>  /*
> - * Unmap the MMIO regions for a window.
> + * Unmap the MMIO regions for a window. Hold the vas_mutex so we don't
> + * unmap when the window's debugfs dir is in use. This serializes close
> + * of a window even on another VAS instance but since it's not a critical
> + * path, just minimize the time we hold the mutex for now. We can add
> + * a per-instance mutex later if necessary.
>   */
>  static void unmap_winctx_mmio_bars(struct vas_window *window)
>  {
>   int len;
> + void *uwc_map;
> + void *hvwc_map;
>   u64 busaddr_start;
>  
> + mutex_lock(&vas_mutex);
> +
>   if (window->hvwc_map) {
> - get_hvwc_mmio_bar(window, &busaddr_start, &len);
> - unmap_region(window->hvwc_map, busaddr_start, len);
> + hvwc_map = window->hvwc_map;
>   window->hvwc_map = NULL;
>   }
>  
>   if (window->uwc_map) {
> - get_uwc_mmio_bar(window, &busaddr_start, &len);
> - unmap_region(window->uwc_map, busaddr_start, len);
> + uwc_map = window->uwc_map;
>   window->uwc_map = NULL;
>   }
> +
> + mutex_unlock(&vas_mutex);
> +
> + if (hvwc_map) {
> + get_hvwc_mmio_bar(window, &busaddr_start, &len);
> + unmap_region(hvwc_map, busaddr_start, len);
> + }
> +
> + if (uwc_map) {
> + get_uwc_mmio_bar(window, &busaddr_start, &len);
> + unmap_region(uwc_map, busaddr_start, len);
> + }

arch/powerpc/platforms/powernv/vas-window.c: In function 
'unmap_winctx_mmio_bars':
arch/powerpc/platforms/powernv/vas-window.c:137:2: error: 'uwc_map' may be used 
uninitialized in this function [-Werror=maybe-uninitialized]
  iounmap(addr);
  ^
arch/powerpc/platforms/powernv/vas-window.c:168:8: note: 'uwc_map' was declared 
here
  void *uwc_map;
^
arch/powerpc/platforms/powernv/vas-window.c:137:2: error: 'hvwc_map' may be 
used uninitialized in this function [-Werror=maybe-uninitialized]
  iounmap(addr);
  ^
arch/powerpc/platforms/powernv/vas-window.c:169:8: note: 'hvwc_map' was 
declared here
  void *hvwc_map;
^
cc1: all warnings being treated as errors


cheers


Re: [PATCH v2] powerpc/kernel/sysfs: Export ldbar spr to sysfs

2017-11-07 Thread Anju T Sudhakar

Hi mpe,


On Wednesday 01 November 2017 06:20 AM, Michael Ellerman wrote:

Anju T Sudhakar  writes:


Add ldbar spr to sysfs. The spr will hold thread level In-Memory Collection 
(IMC)
counter configuration data.

This is missing any justification for why we would want to expose this,
and in particular why we would make it *writable*.

cheers



Thank you for reviewing the patch.

LDBAR holds the thread-level counter configuration. Exposing this will help
us understand the current status of the thread-level counters in the system.
Primarily, bit 0 of LDBAR tells whether the counters are enabled or not,
and bit 1 tells the mode (0 - Accumulation Mode, 1 - Trace Mode).
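
A minimal userspace sketch of that bit layout (illustrative only, not from
the patch; it assumes the usual Power MSB-0 numbering, where bit 0 is the
most significant bit of the 64-bit value):

#include <stdint.h>
#include <stdio.h>

/* Decode the two LDBAR bits described above (MSB-0 numbering). */
static void decode_ldbar(uint64_t ldbar)
{
	int enabled = (ldbar >> 63) & 1;	/* bit 0: counters enabled */
	int trace   = (ldbar >> 62) & 1;	/* bit 1: 0 = accumulation, 1 = trace */

	printf("counters %s, %s mode\n",
	       enabled ? "enabled" : "disabled",
	       trace ? "trace" : "accumulation");
}

int main(void)
{
	decode_ldbar(0x8000000000000000ULL);	/* enabled, accumulation mode */
	return 0;
}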

But regarding the permission, you are right. On reassessment, I think the
permission should be read only, because it is possible that we may write
an incorrect value to LDBAR.

So I will change the permission here.



Thanks,
Anju



diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 4437c70..f8caee0 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -485,6 +485,7 @@ SYSFS_PMCSETUP(mmcra, SPRN_MMCRA);
  SYSFS_SPRSETUP(purr, SPRN_PURR);
  SYSFS_SPRSETUP(spurr, SPRN_SPURR);
  SYSFS_SPRSETUP(pir, SPRN_PIR);
+SYSFS_SPRSETUP(ldbar, SPRN_LDBAR);
  
  /*

Lets only enable read for phyp resources and
@@ -492,6 +493,7 @@ SYSFS_SPRSETUP(pir, SPRN_PIR);
Lets be conservative and default to pseries.
  */
  static DEVICE_ATTR(mmcra, 0600, show_mmcra, store_mmcra);
+static DEVICE_ATTR(ldbar, 0600, show_ldbar, store_ldbar);
  static DEVICE_ATTR(spurr, 0400, show_spurr, NULL);
  static DEVICE_ATTR(purr, 0400, show_purr, store_purr);
  static DEVICE_ATTR(pir, 0400, show_pir, NULL);
@@ -757,6 +759,9 @@ static int register_cpu_online(unsigned int cpu)
device_create_file(s, &pmc_attrs[i]);
  
  #ifdef CONFIG_PPC64

+   if (cpu_has_feature(CPU_FTR_ARCH_300))
+   device_create_file(s, &dev_attr_ldbar);
+
if (cpu_has_feature(CPU_FTR_MMCRA))
device_create_file(s, &dev_attr_mmcra);
  
@@ -842,6 +847,9 @@ static int unregister_cpu_online(unsigned int cpu)

device_remove_file(s, &pmc_attrs[i]);
  
  #ifdef CONFIG_PPC64

+   if (cpu_has_feature(CPU_FTR_ARCH_300))
+   device_remove_file(s, &dev_attr_ldbar);
+
if (cpu_has_feature(CPU_FTR_MMCRA))
device_remove_file(s, &dev_attr_mmcra);
  
--

2.7.4




Re: POWER: Unexpected fault when writing to brk-allocated memory

2017-11-07 Thread Nicholas Piggin
On Tue, 7 Nov 2017 14:15:43 +0300
"Kirill A. Shutemov"  wrote:

> On Tue, Nov 07, 2017 at 04:07:05PM +1100, Nicholas Piggin wrote:
> > C'ing everyone who was on the x86 56-bit user virtual address patch.
> > 
> > I think we need more time to discuss this behaviour, in light of the
> > regression Florian uncovered. I would propose we turn off the 56-bit
> > user virtual address support for x86 for 4.14, and powerpc would
> > follow and turn off its 512T support until we can get a better handle
> > on the problems. (Actually Florian initially hit a couple of bugs in
> > powerpc implementation, but pulling that string uncovers a whole lot
> > of difficulties.)
> > 
> > The bi-modal behavior switched based on a combination of mmap address
> > hint and MAP_FIXED just sucks. It's segregating our VA space with
> > some non-standard heuristics, and it doesn't seem to work very well.
> > 
> > What are we trying to do? Allow SAP HANA etc use huge address spaces
> > by coding to these specific mmap heuristics we're going to add,
> > rather than solving it properly in a way that requires adding a new
> > syscall or personality or prctl or sysctl. Okay, but the cost is that
> > despite best efforts, it still changes ABI behaviour for existing
> > applications and these heuristics will become baked into the ABI that
> > we will have to support. Not a good tradeoff IMO.
> > 
> > First of all, using addr and MAP_FIXED to develop our heuristic can
> > never really give unchanged ABI. It's an in-band signal. brk() is a
> > good example that steadily keeps incrementing address, so depending
> > on malloc usage and address space randomization, you will get a brk()
> > that ends exactly at 128T, then the next one will be >
> > DEFAULT_MAP_WINDOW, and it will switch you to 56 bit address space.  
> 
> No, it won't. You will hit stack first.

I guess so. Florian's bug didn't crash there for some reason, okay.
But I suppose my point about brk is not really about where the standard
heap sits, but about the pattern of allocations. An allocator that uses
mmap for managing its address space might do the same thing, e.g.,
incrementally expand existing mmaps as necessary.

> > Second, the kernel can never completely solve the problem this way.
> > How do we know a malloc library will not ask for > 128TB addresses
> > and pass them to an unknowing application?  
> 
> The idea is that an application can provide a hint (mallopt() ?) to the
> malloc implementation that it's ready to use the full address space. In
> this case, malloc can use mmap((void *) -1, ...) for its allocations and
> get the full address space this way.

Point is, there's nothing stopping an allocator library or runtime
from asking for mmap anywhere and returning it to userspace.

Do > 128TB pointers matter so much that we should add this heuristic
to prevent breakage, but little enough that we can accept some rare
cases getting through? Genuine question.

> 
> > And lastly, there are a fair few bugs and places where description
> > in changelogs and mailing lists does not match code. You don't want
> > to know the mess in powerpc, but even x86 has two I can see:
> > MAP_FIXED succeeds even when crossing 128TB addresses (where changelog
> > indicated it should not),  
> 
> Hm. I don't see where the changelog indicated that MAP_FIXED across 128TB
> shouldn't work. My intention was that it should, although I haven't stated
> it in the changelog.

To mitigate this, we are not going to allocate virtual address space
above 47-bit by default.

But userspace can ask for allocation from full address space by
specifying hint address (with or without MAP_FIXED) above 47-bits.

Yet we got a 48-bit address with a 47-bit hint address (with MAP_FIXED).

> 
> The idea was that we shouldn't allow slipping above 47 bits accidentally.
> 
> A correctly functioning program would never request addr+len above 47 bits
> with MAP_FIXED unless it's ready to handle such addresses. Otherwise the
> request would simply fail on a machine that doesn't support large VA.
> 
> In contrast, addr+len above 47 bits without MAP_FIXED will not fail on a
> machine that doesn't support large VA; the kernel will find another place
> under 47 bits. And I can imagine a reasonable application that does
> something like this.
> 
> So we cannot rely on an application being ready to handle large
> addresses just because we see addr+len without MAP_FIXED.

By the same logic, a request for addr > 128TB without MAP_FIXED will
not fail, therefore we can't rely on that either.

Or an app that links to a library that attempts a MAP_FIXED allocation
of addr + len above 128TB might use the high bits of the pointer returned
by that library, because such requests are never satisfied today and the
library would fall back.

> 
> > arch_get_unmapped_area_topdown() with an address hint is checking
> > against TASK_SIZE rather than the limited 128TB address, so it looks
> > like it won't follow the heuristics.  
> 
> You are right. This is broken. If a user requests a mapping above the vdso
> but below DEFAULT_MAP_WINDOW, it will succeed.

Re: [linux-next][0692229e] next-20171106 fails to boot on Power 7

2017-11-07 Thread Michal Hocko
On Tue 07-11-17 11:28:54, Michal Hocko wrote:
> On Tue 07-11-17 15:20:29, Abdul Haleem wrote:
> > Hi,
> > 
> > Today's next kernel fails to boot on Power 7 Machine with below errors
> > in boot log messages.
> > 
> > 'Uhuuh, elf segement at 1004 requested but the memory is
> > mapped already'
> > 
> > It was introduced with commit:
> > 
> > 0692229e : fs/binfmt_elf.c: drop MAP_FIXED usage from elf_map
> 
> Weird. Clashes shouldn't really happen. Maybe power is doing something
> different from other platforms. Could you apply the following debugging
> patch to see what is going on there?

I have checked the ppc arch_get_unmapped_area and found something I
wouldn't be surprised if we tripped over:
if (!fixed && addr) {
addr = _ALIGN_UP(addr, 1ul << pshift);
slice_dbg(" aligned addr=%lx\n", addr);
/* Ignore hint if it's too large or overlaps a VMA */
if (addr > mm->task_size - len ||
!slice_area_is_free(mm, addr, len))
addr = 0;
}

I do not pretend to understand why this is needed or what this actually
does but if the debugging check simply shows that there is no
overlapping VMA then we simply have to come up with a ppc specific code
to handle this...
-- 
Michal Hocko
SUSE Labs


Re: POWER: Unexpected fault when writing to brk-allocated memory

2017-11-07 Thread Kirill A. Shutemov
On Tue, Nov 07, 2017 at 12:26:12PM +0100, Florian Weimer wrote:
> On 11/07/2017 12:15 PM, Kirill A. Shutemov wrote:
> 
> > > First of all, using addr and MAP_FIXED to develop our heuristic can
> > > never really give unchanged ABI. It's an in-band signal. brk() is a
> > > good example that steadily keeps incrementing address, so depending
> > > on malloc usage and address space randomization, you will get a brk()
> > > that ends exactly at 128T, then the next one will be >
> > > DEFAULT_MAP_WINDOW, and it will switch you to 56 bit address space.
> > 
> > No, it won't. You will hit stack first.
> 
> That's not actually true on POWER in some cases.  See the process maps I
> posted here:
> 
>   

Hm? I see that in all three cases the [stack] is the last mapping.
Do I miss something?

-- 
 Kirill A. Shutemov


Re: [PATCH v3] kernel/module_64.c: Add REL24 relocation support of livepatch symbols

2017-11-07 Thread Torsten Duwe
On Tue, Nov 07, 2017 at 07:34:29PM +1100, Michael Ellerman wrote:
> Josh Poimboeuf  writes:
> 
> > On Tue, Oct 31, 2017 at 07:39:59PM +0100, Torsten Duwe wrote:
> >> On Tue, Oct 31, 2017 at 09:53:16PM +0530, Naveen N . Rao wrote:
> >> > On 2017/10/31 03:30PM, Torsten Duwe wrote:
> >> > > 
> >> > > Maybe I failed to express my views properly; I find the whole approach
> >> [...]
> >> > > NAK'd-by: Torsten Duwe 
> >> > 
> >> > Hmm... that wasn't evident at all given Balbir's response to your 
> >> > previous concerns and your lack of response for the same:
> >> > https://www.mail-archive.com/linuxppc-dev@lists.ozlabs.org/msg125350.html
> >> 
> >> To me it was obvious that the root cause was kpatch's current inability to
> >> deal with ppc calling conventions when copying binary functions. Hence my
> >> hint at the discussion about a possible source-level solution that would
> >> work nicely for all architectures.
> >
> > That other discussion isn't relevant.  Even if we do eventually decide
> > to go with a source-based approach, that's still a long ways off.
> 
> OK, that was my thinking, but good to have it confirmed.

It depends. We can write and compile live patching modules right away. From
my point of view it's a matter of progressively automating this task.

> > For the foreseeable future, kpatch-build is the only available safe way
> > to create live patches.  We need to figure out a way to make it work,
> > one way or another.

As stated, I disagree here, but let's leave that aside, and stick to
your ( :-) problem.

> > If I understand correctly, the main problem here is that a call to a
> > previously-local-but-now-global function is missing a needed nop
> > instruction after the call, which is needed for restoring r2 (the TOC
> > pointer).
> 
> Yes, that's the root of the problem.
Yes.

> > So, just brainstorming a bit, here are the possible solutions I can
> > think of:
> >
> > a) Create a special klp stub for such calls (as in Kamalesh's patch)
> >
> > b) Have kpatch-build rewrite the function to insert nops after calls to
> >previously-local functions.  This would also involve adjusting the
> >offsets of intra-function branches and relocations which come
> >afterwards in the same section.  And also patching up the DWARF
> >debuginfo, if we care about that (I think we do).  And also patching
> >up the jump tables which GCC sometimes creates for switch statements.
> >Yuck.  I'm pretty sure this is a horrible idea.
> 
> It's fairly horrible. It might be *less* horrible if you generated an
> assembly listing using the compiler, modified that, and then fed that
> through the assembler and linker.
> 
> > c) Create a new GCC flag which treats all calls as global, which can be
> >used by kpatch-build to generate the right code (assuming this flag
> >doesn't already exist).  This would be a good option, I think.
> 
> That's not impossible, but I doubt it will be all that popular with the
> toolchain folks who'd have to implement it :)  It will also take a long
> time to percolate out to users.

BTDT ;-)

> > d) Have kpatch-build do some other kind of transformation?  For example,
> >maybe it could generate klp stubs which the callee calls into.  Each
> >klp stub could then do a proper global call to the SHN_LIVEPATCH
> >symbol.
> 
> That could work.
Indeed. There is no reason to offload that onto the kernel module loader.

> > Do I understand the problem correctly?  Do the potential solutions make
> > sense?  Any other possible solutions I missed?
> 
> Possibly, I'm not really across how kpatch works in detail and what the
> constraints are.
> 
> One option would be to detect any local calls made by the patched
> function and pull those in as well - ie. make them part of the patch.
> The pathological case could obviously end up pulling in every function
> in the kernel, but in practice that's probably unlikely. It already
> happens to some extent anyway via inlining.
> 
> If modifying the source is an option, a sneaky solution is to mark the
> local functions as weak, which means the compiler/linker has to assume
> they could become global calls.

This might also be doable with a gcc "plugin", which would not require changes
to the compiler itself. OTOH there's no such thing as a weak static function...

Torsten



Re: POWER: Unexpected fault when writing to brk-allocated memory

2017-11-07 Thread Florian Weimer

On 11/07/2017 12:15 PM, Kirill A. Shutemov wrote:


First of all, using addr and MAP_FIXED to develop our heuristic can
never really give unchanged ABI. It's an in-band signal. brk() is a
good example that steadily keeps incrementing address, so depending
on malloc usage and address space randomization, you will get a brk()
that ends exactly at 128T, then the next one will be >
DEFAULT_MAP_WINDOW, and it will switch you to 56 bit address space.


No, it won't. You will hit stack first.


That's not actually true on POWER in some cases.  See the process maps I 
posted here:


  

Thanks,
Florian


Re: POWER: Unexpected fault when writing to brk-allocated memory

2017-11-07 Thread Kirill A. Shutemov
On Tue, Nov 07, 2017 at 09:15:21AM +0100, Florian Weimer wrote:
> MAP_FIXED is near-impossible to use correctly.  I hope you don't expect
> applications to do that.  If you want address-based opt in, it should work
> without MAP_FIXED.  Sure, in obscure cases, applications might still see
> out-of-range addresses, but I expected a full opt-out based on RLIMIT_AS
> would be sufficient for them.

Just use mmap(-1), without MAP_FIXED to get full address space.

-- 
 Kirill A. Shutemov


Re: POWER: Unexpected fault when writing to brk-allocated memory

2017-11-07 Thread Kirill A. Shutemov
On Tue, Nov 07, 2017 at 04:07:05PM +1100, Nicholas Piggin wrote:
> C'ing everyone who was on the x86 56-bit user virtual address patch.
> 
> I think we need more time to discuss this behaviour, in light of the
> regression Florian uncovered. I would propose we turn off the 56-bit
> user virtual address support for x86 for 4.14, and powerpc would
> follow and turn off its 512T support until we can get a better handle
> on the problems. (Actually Florian initially hit a couple of bugs in
> powerpc implementation, but pulling that string uncovers a whole lot
> of difficulties.)
> 
> The bi-modal behavior switched based on a combination of mmap address
> hint and MAP_FIXED just sucks. It's segregating our VA space with
> some non-standard heuristics, and it doesn't seem to work very well.
> 
> What are we trying to do? Allow SAP HANA etc use huge address spaces
> by coding to these specific mmap heuristics we're going to add,
> rather than solving it properly in a way that requires adding a new
> syscall or personality or prctl or sysctl. Okay, but the cost is that
> despite best efforts, it still changes ABI behaviour for existing
> applications and these heuristics will become baked into the ABI that
> we will have to support. Not a good tradeoff IMO.
> 
> First of all, using addr and MAP_FIXED to develop our heuristic can
> never really give unchanged ABI. It's an in-band signal. brk() is a
> good example that steadily keeps incrementing address, so depending
> on malloc usage and address space randomization, you will get a brk()
> that ends exactly at 128T, then the next one will be >
> DEFAULT_MAP_WINDOW, and it will switch you to 56 bit address space.

No, it won't. You will hit stack first.

> Second, the kernel can never completely solve the problem this way.
> How do we know a malloc library will not ask for > 128TB addresses
> and pass them to an unknowing application?

The idea is that an application can provide a hint (mallopt() ?) to the
malloc implementation that it's ready to use the full address space. In
this case, malloc can use mmap((void *) -1, ...) for its allocations and
get the full address space this way.
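
A minimal sketch of that opt-in (illustrative only, not from any patch in
this thread):

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 1UL << 20;

	/* A hint above the default window -- here the (void *)-1 form
	 * suggested above -- asks for the full address space; a NULL
	 * hint keeps the old behaviour. */
	void *p = mmap((void *)-1, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("mapped at %p\n", p);
	return 0;
}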

> And lastly, there are a fair few bugs and places where description
> in changelogs and mailing lists does not match code. You don't want
> to know the mess in powerpc, but even x86 has two I can see:
> MAP_FIXED succeeds even when crossing 128TB addresses (where changelog
> indicated it should not),

Hm. I don't see where the changelog indicated that MAP_FIXED across 128TB
shouldn't work. My intention was that it should, although I haven't stated
it in the changelog.

The idea was that we shouldn't allow slipping above 47 bits accidentally.

A correctly functioning program would never request addr+len above 47 bits
with MAP_FIXED unless it's ready to handle such addresses. Otherwise the
request would simply fail on a machine that doesn't support large VA.

In contrast, addr+len above 47 bits without MAP_FIXED will not fail on a
machine that doesn't support large VA; the kernel will find another place
under 47 bits. And I can imagine a reasonable application that does
something like this.

So we cannot rely on an application being ready to handle large
addresses just because we see addr+len without MAP_FIXED.

> arch_get_unmapped_area_topdown() with an address hint is checking
> against TASK_SIZE rather than the limited 128TB address, so it looks
> like it won't follow the heuristics.

You are right. This is broken. If a user requests a mapping above the vdso
but below DEFAULT_MAP_WINDOW, it will succeed.

I'll send a patch to fix this. But it doesn't look like a show-stopper to me.

Re-checking things for this reply I found actual bug, see:

http://lkml.kernel.org/r/20171107103804.47341-1-kirill.shute...@linux.intel.com

> So unless everyone else thinks I'm crazy and disagrees, I'd ask for
> a bit more time to make sure we get this interface right. I would
> hope for something like prctl PR_SET_MM which can be used to set
> our user virtual address bits on a fine grained basis. Maybe a
> sysctl, maybe a personality. Something out-of-band. I don't want to
> get too far into that discussion yet. First we need to agree whether
> or not the code in the tree today is a problem.

Well, we've discussed all the options you are proposing before.
Linus wanted a minimalistic interface, so we took this path for now.
We can always add more ways to get access to the full address space later.

-- 
 Kirill A. Shutemov


Re: [linux-next][0692229e] next-20171106 fails to boot on Power 7

2017-11-07 Thread Michal Hocko
On Tue 07-11-17 15:20:29, Abdul Haleem wrote:
> Hi,
> 
> Today's next kernel fails to boot on Power 7 Machine with below errors
> in boot log messages.
> 
> 'Uhuuh, elf segement at 1004 requested but the memory is
> mapped already'
> 
> It was introduced with commit:
> 
> 0692229e : fs/binfmt_elf.c: drop MAP_FIXED usage from elf_map

Weird. Clashes shouldn't really happen. Maybe power is doing something
different from other platforms. Could you apply the following debugging
patch to see what is going on there?
---
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 0abc30d681ae..f098aaf60039 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -361,8 +361,28 @@ static unsigned long elf_vm_mmap(struct file *filep, 
unsigned long addr,
return map_addr;
 
if ((type & MAP_FIXED) && map_addr != addr) {
-   pr_info("Uhuuh, elf segement at %p requested but the memory is 
mapped already\n",
-   (void*)addr);
+   struct mm_struct *mm = current->mm;
+   struct vm_area_struct *vma;
+   pr_info("Uhuuh, elf segement at %p requested but the memory is 
mapped already, got %p\n",
+   (void*)addr, (void*)map_addr);
+   down_read(&mm->mmap_sem);
+   vma = find_vma(mm, addr);
+   if (vma) {
+   const char *name;
+   pr_info("Clashing vma [%lx, %lx] flags:%lx", 
vma->vm_start, vma->vm_end, vma->vm_flags);
+   name = arch_vma_name(vma);
+   if (!name) {
+   if (vma->vm_start <= mm->brk &&
+   vma->vm_end >= mm->start_brk)
+   name = "[heap]";
+   else if (vma->vm_start <= 
vma->vm_mm->start_stack &&
+   vma->vm_end >= 
vma->vm_mm->start_stack)
+   name = "[stack]";
+   }
+   pr_cont(" name:%s\n", name);
+   } else
+   pr_info("Uhm, no clashing VMA\n");
+   up_read(&mm->mmap_sem);
return -EAGAIN;
}
 

-- 
Michal Hocko
SUSE Labs


Re: POWER: Unexpected fault when writing to brk-allocated memory

2017-11-07 Thread Nicholas Piggin
On Tue, 7 Nov 2017 09:15:21 +0100
Florian Weimer  wrote:

> On 11/07/2017 06:07 AM, Nicholas Piggin wrote:
> 
> > First of all, using addr and MAP_FIXED to develop our heuristic can
> > never really give unchanged ABI. It's an in-band signal. brk() is a
> > good example that steadily keeps incrementing address, so depending
> > on malloc usage and address space randomization, you will get a brk()
> > that ends exactly at 128T, then the next one will be >
> > DEFAULT_MAP_WINDOW, and it will switch you to 56 bit address space.  
> 
> Note that this brk phenomenon is only a concern for some currently 
> obscure process memory layouts where the heap ends up at the top of the 
> address space.  Usually, there is something above it which eliminates 
> the possibility that it can cross into the 128 TiB wilderness.  So the 
> brk problem only happens on some architectures (e.g., not x86-64), and 
> only with strange ways of running programs (explicitly ld.so invocation 
> and likely static PIE, too).

That's true, but there was an ABI change and the result is that it
changed behaviour.

And actually, if it were not for a powerpc bug that caused a segfault,
the allocation would have worked, and you probably would never have
filed the bug. However, the program would have been given a pointer >
128TB, which is what we were trying to avoid -- if your app also put
metadata in the top of its pointers, it would have broken.
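
For concreteness, a sketch (illustrative, not from the thread) of the
pointer-metadata pattern that such an address would break, assuming the
app treats everything above bit 47 as free for tags:

#include <stdint.h>

#define PTR_BITS 48	/* app's assumption: VAs never exceed 48 bits */

/* Stash a small tag in the high bits of a pointer. */
static inline void *tag_ptr(void *p, uint16_t tag)
{
	return (void *)((uintptr_t)p | ((uintptr_t)tag << PTR_BITS));
}

/* Strip the tag to recover the real pointer. A kernel-supplied address
 * above 128TB already has these bits set, silently breaking the scheme. */
static inline void *untag_ptr(void *p)
{
	return (void *)((uintptr_t)p & (((uintptr_t)1 << PTR_BITS) - 1));
}

int main(void)
{
	int x;
	void *t = tag_ptr(&x, 0x5a);

	return untag_ptr(t) == (void *)&x ? 0 : 1;
}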

So, are there any obscure apps that could break? I don't know. Maybe it's
quite unlikely. Is that enough to go ahead with changing behaviour?

I'm not going to put up a big fight about this if others feel it's
not a problem. I raise it because, in debugging this, I saw the changed
behaviour, saw that the implementations differ (x86, powerpc hash, and
powerpc radix all have different edge case behaviour), that none of them
seems to conform exactly to the heuristics described in the changelogs,
and that there are no man page updates that I can see. That raises some
red flags.

So I'm calling for one last opportunity to 1) agree on the desired
behaviour, and 2) ensure implementations are conforming.

> > So unless everyone else thinks I'm crazy and disagrees, I'd ask for
> > a bit more time to make sure we get this interface right. I would
> > hope for something like prctl PR_SET_MM which can be used to set
> > our user virtual address bits on a fine grained basis. Maybe a
> > sysctl, maybe a personality. Something out-of-band. I don't want to
> > get too far into that discussion yet. First we need to agree whether
> > or not the code in the tree today is a problem.  
> 
> There is certainly more demand for similar functionality, like creating 
> mappings below 2 GB/4 GB/32 GB, and probably other bit patterns. 
> Hotspot would use this to place the heap with compressed oops, instead 
> of manually hunting for a suitable place for the mapping.  (Essentially, 
> 32-bit pointers on 64-bit architectures for sufficiently small heap 
> sizes.)  It would perhaps be possible to use the hints address as a 
> source of the bit count, for full flexibility.  And the mapping should 
> be placed into the upper half of the selected window if possible.
> 
> MAP_FIXED is near-impossible to use correctly.  I hope you don't expect 
> applications to do that.  If you want address-based opt in, it should 
> work without MAP_FIXED.  Sure, in obscure cases, applications might 
> still see out-of-range addresses, but I expected a full opt-out based on 
> RLIMIT_AS would be sufficient for them.

All good points, and no I don't think MAP_FIXED is a substitute.

I don't want to get too far ahead of ourselves, but I don't see why
a new interface with reasonably flexible and extensible ways to
specify VA behaviour is a bad idea. We already went through something
similar with ADDR_COMPAT_LAYOUT, and some various address space
randomization options, and now this, and they're all different...
As you noted, for many years now, with compressed pointers and data
in pointers, address space bits have *mattered* to applications and
we can't change that.

Surely we're going to have to solve it *properly* sooner or later, so
why not now? We could keep the existing heuristic *and* add nicer
interfaces, but the >128TB support seems like a good place to do it,
because very few apps will have to change code, and we can leave
behaviour exactly unchanged for the 99.9% that will never need so
much memory.

Thanks,
Nick


Re: [PATCH v3] kernel/module_64.c: Add REL24 relocation support of livepatch symbols

2017-11-07 Thread Michael Ellerman
Josh Poimboeuf  writes:

> On Tue, Oct 31, 2017 at 07:39:59PM +0100, Torsten Duwe wrote:
>> On Tue, Oct 31, 2017 at 09:53:16PM +0530, Naveen N . Rao wrote:
>> > On 2017/10/31 03:30PM, Torsten Duwe wrote:
>> > > 
>> > > Maybe I failed to express my views properly; I find the whole approach
>> [...]
>> > > NAK'd-by: Torsten Duwe 
>> > 
>> > Hmm... that wasn't evident at all given Balbir's response to your 
>> > previous concerns and your lack of response for the same:
>> > https://www.mail-archive.com/linuxppc-dev@lists.ozlabs.org/msg125350.html
>> 
>> To me it was obvious that the root cause was kpatch's current inability to
>> deal with ppc calling conventions when copying binary functions. Hence my
>> hint at the discussion about a possible source-level solution that would
>> work nicely for all architectures.
>
> That other discussion isn't relevant.  Even if we do eventually decide
> to go with a source-based approach, that's still a long ways off.

OK, that was my thinking, but good to have it confirmed.

> For the foreseeable future, kpatch-build is the only available safe way
> to create live patches.  We need to figure out a way to make it work,
> one way or another.
>
> If I understand correctly, the main problem here is that a call to a
> previously-local-but-now-global function is missing a needed nop
> instruction after the call, which is needed for restoring r2 (the TOC
> pointer).

Yes, that's the root of the problem.

> So, just brainstorming a bit, here are the possible solutions I can
> think of:
>
> a) Create a special klp stub for such calls (as in Kamalesh's patch)
>
> b) Have kpatch-build rewrite the function to insert nops after calls to
>previously-local functions.  This would also involve adjusting the
>offsets of intra-function branches and relocations which come
>afterwards in the same section.  And also patching up the DWARF
>debuginfo, if we care about that (I think we do).  And also patching
>up the jump tables which GCC sometimes creates for switch statements.
>Yuck.  I'm pretty sure this is a horrible idea.

It's fairly horrible. It might be *less* horrible if you generated an
assembly listing using the compiler, modified that, and then fed that
through the assembler and linker.

> c) Create a new GCC flag which treats all calls as global, which can be
>used by kpatch-build to generate the right code (assuming this flag
>doesn't already exist).  This would be a good option, I think.

That's not impossible, but I doubt it will be all that popular with the
toolchain folks who'd have to implement it :)  It will also take a long
time to percolate out to users.

> d) Have kpatch-build do some other kind of transformation?  For example,
>maybe it could generate klp stubs which the callee calls into.  Each
>klp stub could then do a proper global call to the SHN_LIVEPATCH
>symbol.

That could work.

> Do I understand the problem correctly?  Do the potential solutions make
> sense?  Any other possible solutions I missed?

Possibly, I'm not really across how kpatch works in detail and what the
constraints are.

One option would be to detect any local calls made by the patched
function and pull those in as well - ie. make them part of the patch.
The pathological case could obviously end up pulling in every function
in the kernel, but in practice that's probably unlikely. It already
happens to some extent anyway via inlining.

If modifying the source is an option, a sneaky solution is to mark the
local functions as weak, which means the compiler/linker has to assume
they could become global calls.
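
A sketch of that trick (illustrative, with made-up names): dropping static
and marking the function weak forces the compiler to emit the global-call
sequence, nop slot included, because the definition may be overridden at
link time.

/* Before: a call the compiler knows is local, so no TOC-restore nop
 * is emitted after the bl:
 *
 *	static int helper(int x) { return x + 1; }
 */

/* After: weak, so calls to it must be treated as potentially global. */
int __attribute__((weak)) helper(int x)
{
	return x + 1;
}

int caller(int x)
{
	return helper(x);	/* now compiled as a global call */
}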

cheers


Re: POWER: Unexpected fault when writing to brk-allocated memory

2017-11-07 Thread Florian Weimer

On 11/07/2017 06:07 AM, Nicholas Piggin wrote:


First of all, using addr and MAP_FIXED to develop our heuristic can
never really give unchanged ABI. It's an in-band signal. brk() is a
good example that steadily keeps incrementing address, so depending
on malloc usage and address space randomization, you will get a brk()
that ends exactly at 128T, then the next one will be >
DEFAULT_MAP_WINDOW, and it will switch you to 56 bit address space.


Note that this brk phenomenon is only a concern for some currently 
obscure process memory layouts where the heap ends up at the top of the 
address space.  Usually, there is something above it which eliminates 
the possibility that it can cross into the 128 TiB wilderness.  So the 
brk problem only happens on some architectures (e.g., not x86-64), and 
only with strange ways of running programs (explicitly ld.so invocation 
and likely static PIE, too).



So unless everyone else thinks I'm crazy and disagrees, I'd ask for
a bit more time to make sure we get this interface right. I would
hope for something like prctl PR_SET_MM which can be used to set
our user virtual address bits on a fine grained basis. Maybe a
sysctl, maybe a personality. Something out-of-band. I don't want to 
get too far into that discussion yet. First we need to agree whether
or not the code in the tree today is a problem.


There is certainly more demand for similar functionality, like creating 
mappings below 2 GB/4 GB/32 GB, and probably other bit patterns. 
Hotspot would use this to place the heap with compressed oops, instead 
of manually hunting for a suitable place for the mapping.  (Essentially, 
32-bit pointers on 64-bit architectures for sufficiently small heap 
sizes.)  It would perhaps be possible to use the hints address as a 
source of the bit count, for full flexibility.  And the mapping should 
be placed into the upper half of the selected window if possible.
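
As a hedged illustration of the manual hunting mentioned above (not
Hotspot's actual code; the addresses and sizes are made up):

#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 1UL << 26;			/* 64 MiB heap, illustrative */
	void *hint = (void *)0x80000000UL;	/* try for a spot below 4 GiB */

	void *p = mmap(hint, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	/* The hint is only a hint: the caller still has to verify the
	 * result is usable for compressed oops and retry elsewhere if not. */
	if ((uintptr_t)p + len > (1UL << 32))
		fprintf(stderr, "landed above 4 GiB, would need to retry\n");

	printf("heap candidate at %p\n", p);
	return 0;
}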


MAP_FIXED is near-impossible to use correctly.  I hope you don't expect 
applications to do that.  If you want address-based opt in, it should 
work without MAP_FIXED.  Sure, in obscure cases, applications might 
still see out-of-range addresses, but I expected a full opt-out based on 
RLIMIT_AS would be sufficient for them.


Thanks,
Florian


[PATCH v2 7/7] powerpc/64s/radix: Improve TLB flushing for page table freeing

2017-11-07 Thread Nicholas Piggin
Unmaps that free page tables always flush the entire PID, which is
sub-optimal. Provide TLB range flushing with an additional PWC flush
that can be used for va range invalidations that also need a PWC flush.

 Time to munmap N pages of memory including last level page table
 teardown (after mmap, touch), local invalidate:
 N   1   2  4  8 16 32 64
 vanilla  3.2us  3.3us  3.4us  3.6us  4.1us  5.2us  7.2us
 patched  1.4us  1.5us  1.7us  1.9us  2.6us  3.7us  6.2us

 Global invalidate:
 N   1   2  4  8 16  32 64
 vanilla  2.2us  2.3us  2.4us  2.6us  3.2us   4.1us  6.2us
 patched  2.1us  2.5us  3.4us  5.2us  8.7us  15.7us  6.2us

Local invalidates get much better across the board. Global ones have
the same issue, where multiple tlbies for a va flush do get slower than
the single tlbie needed to invalidate the PID. None of this testing
captures the TLB benefits of avoiding killing everything.

Global gets worse, but it is brought into line with the global invalidate
for munmap()s that do not free page tables.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/mm/tlb-radix.c | 90 ++---
 1 file changed, 61 insertions(+), 29 deletions(-)

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 5842c98fbe48..078f7da11ce1 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -39,6 +39,20 @@ static inline void __tlbiel_pid(unsigned long pid, int set,
trace_tlbie(0, 1, rb, rs, ric, prs, r);
 }
 
+static inline void __tlbie_pid(unsigned long pid, unsigned long ric)
+{
+   unsigned long rb,rs,prs,r;
+
+   rb = PPC_BIT(53); /* IS = 1 */
+   rs = pid << PPC_BITLSHIFT(31);
+   prs = 1; /* process scoped */
+   r = 1;   /* raidx format */
+
+   asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+: : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : 
"memory");
+   trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
 /*
  * We use 128 set in radix mode and 256 set in hpt mode.
  */
@@ -70,18 +84,9 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned 
long ric)
 
 static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
 {
-   unsigned long rb,rs,prs,r;
-
-   rb = PPC_BIT(53); /* IS = 1 */
-   rs = pid << PPC_BITLSHIFT(31);
-   prs = 1; /* process scoped */
-   r = 1;   /* raidx format */
-
asm volatile("ptesync": : :"memory");
-   asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
-: : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : 
"memory");
+   __tlbie_pid(pid, ric);
asm volatile("eieio; tlbsync; ptesync": : :"memory");
-   trace_tlbie(0, 0, rb, rs, ric, prs, r);
 }
 
 static inline void __tlbiel_va(unsigned long va, unsigned long pid,
@@ -123,9 +128,11 @@ static inline void _tlbiel_va(unsigned long va, unsigned 
long pid,
 
 static inline void _tlbiel_va_range(unsigned long start, unsigned long end,
unsigned long pid, unsigned long page_size,
-   unsigned long psize)
+   unsigned long psize, bool also_pwc)
 {
asm volatile("ptesync": : :"memory");
+   if (also_pwc)
+   __tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
__tlbiel_va_range(start, end, pid, page_size, psize);
asm volatile("ptesync": : :"memory");
 }
@@ -169,9 +176,11 @@ static inline void _tlbie_va(unsigned long va, unsigned 
long pid,
 
 static inline void _tlbie_va_range(unsigned long start, unsigned long end,
unsigned long pid, unsigned long page_size,
-   unsigned long psize)
+   unsigned long psize, bool also_pwc)
 {
asm volatile("ptesync": : :"memory");
+   if (also_pwc)
+   __tlbie_pid(pid, RIC_FLUSH_PWC);
__tlbie_va_range(start, end, pid, page_size, psize);
asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
@@ -411,13 +420,15 @@ static int radix_get_mmu_psize(int page_size)
return psize;
 }
 
+static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned 
long start,
+ unsigned long end, int psize);
+
 void radix__tlb_flush(struct mmu_gather *tlb)
 {
int psize = 0;
struct mm_struct *mm = tlb->mm;
int page_size = tlb->page_size;
 
-   psize = radix_get_mmu_psize(page_size);
/*
 * if page size is not something we understand, do a full mm flush
 *
@@ -425,17 +436,28 @@ void radix__tlb_flush(struct mmu_gather *tlb)
 * that flushes the process table entry cache upon process teardown.
 * See the comment for radix in arch_exit_mmap().
 */
-   if (psize != -1 && !tlb->fullmm && !tlb->need_flush_all)
-   radix__flush_tlb_range_psize(mm, tlb->start, tlb->end, 

[PATCH v2 6/7] powerpc/64s/radix: Introduce local single page ceiling for TLB range flush

2017-11-07 Thread Nicholas Piggin
The single page flush ceiling is the cut-off point at which we switch
from invalidating individual pages, to invalidating the entire process
address space in response to a range flush.

Introduce a local variant of this heuristic because local and global
tlbie have significantly different properties:
- Local tlbiel requires 128 instructions to invalidate a PID, global
  tlbie only 1 instruction.
- Global tlbie instructions are expensive broadcast operations.

The local ceiling has been made much higher, 2x the number of
instructions required to invalidate the entire PID (i.e., 256 pages).

 Time to mprotect N pages of memory (after mmap, touch), local invalidate:
 N   32 34  64 128 256 512
 vanilla  7.4us  9.0us  14.6us  26.4us  50.2us  98.3us
 patched  7.4us  7.8us  13.8us  26.4us  51.9us  98.3us

The behaviour of both is identical at N=32 and N=512. In between,
the vanilla kernel does a PID invalidate and the patched kernel does
a va range invalidate.

At N=128, these require the same number of tlbiel instructions (a full
PID invalidate is 128 tlbiels, one per set), so the patched version can
be seen to be cheaper when < 128, and more expensive when > 128. However
this does not capture well the cost of the invalidated TLB entries.

The additional cost at 256 pages does not seem prohibitive. It may
be the case that increasing the limit further would continue to be
beneficial to avoid invalidating all of the process's TLB entries.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/mm/tlb-radix.c | 23 +++
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 277497be7aaf..5842c98fbe48 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -325,6 +325,7 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
  * individual page flushes to full-pid flushes.
  */
 static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = 
POWER9_TLB_SETS_RADIX * 2;
 
 void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 unsigned long end)
@@ -347,8 +348,15 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, 
unsigned long start,
return;
 
preempt_disable();
-   local = mm_is_thread_local(mm);
-   full = (end == TLB_FLUSH_ALL || nr_pages > 
tlb_single_page_flush_ceiling);
+   if (mm_is_thread_local(mm)) {
+   local = true;
+   full = (end == TLB_FLUSH_ALL ||
+   nr_pages > tlb_local_single_page_flush_ceiling);
+   } else {
+   local = false;
+   full = (end == TLB_FLUSH_ALL ||
+   nr_pages > tlb_single_page_flush_ceiling);
+   }
 
if (full) {
if (local)
@@ -441,8 +449,15 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, 
unsigned long start,
return;
 
preempt_disable();
-   local = mm_is_thread_local(mm);
-   full = (end == TLB_FLUSH_ALL || nr_pages > 
tlb_single_page_flush_ceiling);
+   if (mm_is_thread_local(mm)) {
+   local = true;
+   full = (end == TLB_FLUSH_ALL ||
+   nr_pages > tlb_local_single_page_flush_ceiling);
+   } else {
+   local = false;
+   full = (end == TLB_FLUSH_ALL ||
+   nr_pages > tlb_single_page_flush_ceiling);
+   }
 
if (full) {
if (local)
-- 
2.15.0



[PATCH v2 5/7] powerpc/64s/radix: Optimize flush_tlb_range

2017-11-07 Thread Nicholas Piggin
Currently for radix, flush_tlb_range flushes the entire PID, because
the Linux mm code does not tell us about page size here for THP vs
regular pages. This is quite sub-optimal for small mremap / mprotect
/ change_protection.

So implement va range flushes with two flush passes, one for each
page size (regular and THP). The second flush has an order of magnitude
fewer tlbie instructions than the first, so it is a relatively small
additional cost.

There is still room for improvement here with some changes to generic
APIs, particularly if there are mostly THP pages to be invalidated,
the small page flushes could be reduced.

Time to mprotect 1 page of memory (after mmap, touch):
        local   global
vanilla 2.9us   1.8us
patched 1.2us   1.6us

Time to mprotect 30 pages of memory (after mmap, touch):
        local   global
vanilla 8.2us   7.2us
patched 6.9us   17.9us

Time to mprotect 34 pages of memory (after mmap, touch):
        local   global
vanilla 9.1us   8.0us
patched 9.0us   8.0us

34 pages is the point at which the invalidation switches from va
flushes to the entire PID, which tlbie can do in a single instruction.
This is why, in the case of 30 pages, the new code runs slower for this
test. This is a deliberate tradeoff, already present in the unmap and
THP promotion code: the benefit comes from avoiding flushing the entire
TLB for this PID on all threads in the system.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/mm/tlb-radix.c | 139 
 1 file changed, 101 insertions(+), 38 deletions(-)

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 645a35b7bc9d..277497be7aaf 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -100,6 +100,17 @@ static inline void __tlbiel_va(unsigned long va, unsigned 
long pid,
trace_tlbie(0, 1, rb, rs, ric, prs, r);
 }
 
+static inline void __tlbiel_va_range(unsigned long start, unsigned long end,
+   unsigned long pid, unsigned long page_size,
+   unsigned long psize)
+{
+   unsigned long addr;
+   unsigned long ap = mmu_get_ap(psize);
+
+   for (addr = start; addr < end; addr += page_size)
+   __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+}
+
 static inline void _tlbiel_va(unsigned long va, unsigned long pid,
  unsigned long psize, unsigned long ric)
 {
@@ -114,12 +125,8 @@ static inline void _tlbiel_va_range(unsigned long start, 
unsigned long end,
unsigned long pid, unsigned long page_size,
unsigned long psize)
 {
-   unsigned long addr;
-   unsigned long ap = mmu_get_ap(psize);
-
asm volatile("ptesync": : :"memory");
-   for (addr = start; addr < end; addr += page_size)
-   __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+   __tlbiel_va_range(start, end, pid, page_size, psize);
asm volatile("ptesync": : :"memory");
 }
 
@@ -139,6 +146,17 @@ static inline void __tlbie_va(unsigned long va, unsigned 
long pid,
trace_tlbie(0, 0, rb, rs, ric, prs, r);
 }
 
+static inline void __tlbie_va_range(unsigned long start, unsigned long end,
+   unsigned long pid, unsigned long page_size,
+   unsigned long psize)
+{
+   unsigned long addr;
+   unsigned long ap = mmu_get_ap(psize);
+
+   for (addr = start; addr < end; addr += page_size)
+   __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+}
+
 static inline void _tlbie_va(unsigned long va, unsigned long pid,
  unsigned long psize, unsigned long ric)
 {
@@ -153,12 +171,8 @@ static inline void _tlbie_va_range(unsigned long start, 
unsigned long end,
unsigned long pid, unsigned long page_size,
unsigned long psize)
 {
-   unsigned long addr;
-   unsigned long ap = mmu_get_ap(psize);
-
asm volatile("ptesync": : :"memory");
-   for (addr = start; addr < end; addr += page_size)
-   __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+   __tlbie_va_range(start, end, pid, page_size, psize);
asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
@@ -299,17 +313,78 @@ void radix__flush_tlb_kernel_range(unsigned long start, 
unsigned long end)
 }
 EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
 
+#define TLB_FLUSH_ALL -1UL
+
 /*
- * Currently, for range flushing, we just do a full mm flush. Because
- * we use this in code path where we don' track the page size.
+ * Number of pages above which we invalidate the entire PID rather than
+ * flush individual pages, for local and global flushes respectively.
+ *
+ * tlbie goes out to the interconnect and individual ops are more costly.
+ * It also does not iterate over sets like the local tlbiel variant when
+ * invalidating a full PID, so it has a far lower threshold to change from
+ * individual page 

[PATCH v2 4/7] powerpc/64s/radix: Implement _tlbie(l)_va_range flush functions

2017-11-07 Thread Nicholas Piggin
Move the barriers and range iteration down into the _tlbie* level,
which improves readability.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/mm/tlb-radix.c | 71 ++---
 1 file changed, 41 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 49e71c68f5b1..645a35b7bc9d 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -85,7 +85,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned 
long ric)
 }
 
 static inline void __tlbiel_va(unsigned long va, unsigned long pid,
- unsigned long ap, unsigned long ric)
+  unsigned long ap, unsigned long ric)
 {
unsigned long rb,rs,prs,r;
 
@@ -101,13 +101,28 @@ static inline void __tlbiel_va(unsigned long va, unsigned 
long pid,
 }
 
 static inline void _tlbiel_va(unsigned long va, unsigned long pid,
- unsigned long ap, unsigned long ric)
+ unsigned long psize, unsigned long ric)
 {
+   unsigned long ap = mmu_get_ap(psize);
+
asm volatile("ptesync": : :"memory");
__tlbiel_va(va, pid, ap, ric);
asm volatile("ptesync": : :"memory");
 }
 
+static inline void _tlbiel_va_range(unsigned long start, unsigned long end,
+   unsigned long pid, unsigned long page_size,
+   unsigned long psize)
+{
+   unsigned long addr;
+   unsigned long ap = mmu_get_ap(psize);
+
+   asm volatile("ptesync": : :"memory");
+   for (addr = start; addr < end; addr += page_size)
+   __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+   asm volatile("ptesync": : :"memory");
+}
+
 static inline void __tlbie_va(unsigned long va, unsigned long pid,
 unsigned long ap, unsigned long ric)
 {
@@ -125,13 +140,27 @@ static inline void __tlbie_va(unsigned long va, unsigned 
long pid,
 }
 
 static inline void _tlbie_va(unsigned long va, unsigned long pid,
-unsigned long ap, unsigned long ric)
+ unsigned long psize, unsigned long ric)
 {
+   unsigned long ap = mmu_get_ap(psize);
+
asm volatile("ptesync": : :"memory");
__tlbie_va(va, pid, ap, ric);
asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
+static inline void _tlbie_va_range(unsigned long start, unsigned long end,
+   unsigned long pid, unsigned long page_size,
+   unsigned long psize)
+{
+   unsigned long addr;
+   unsigned long ap = mmu_get_ap(psize);
+
+   asm volatile("ptesync": : :"memory");
+   for (addr = start; addr < end; addr += page_size)
+   __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+   asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
 
 /*
  * Base TLB flushing operations:
@@ -174,12 +203,11 @@ void radix__local_flush_tlb_page_psize(struct mm_struct 
*mm, unsigned long vmadd
   int psize)
 {
unsigned long pid;
-   unsigned long ap = mmu_get_ap(psize);
 
preempt_disable();
pid = mm->context.id;
if (pid != MMU_NO_CONTEXT)
-   _tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
+   _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
preempt_enable();
 }
 
@@ -239,16 +267,15 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, 
unsigned long vmaddr,
 int psize)
 {
unsigned long pid;
-   unsigned long ap = mmu_get_ap(psize);
 
pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
return;
preempt_disable();
if (!mm_is_thread_local(mm))
-   _tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
+   _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
else
-   _tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
+   _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
preempt_enable();
 }
 
@@ -335,9 +362,7 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, 
unsigned long start,
  unsigned long end, int psize)
 {
unsigned long pid;
-   unsigned long addr;
bool local;
-   unsigned long ap = mmu_get_ap(psize);
unsigned long page_size = 1UL << mmu_psize_defs[psize].shift;
 
 
@@ -356,17 +381,10 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, 
unsigned long start,
_tlbie_pid(pid, RIC_FLUSH_TLB);
 
} else {
-   asm volatile("ptesync": : :"memory");
-   for (addr = start; addr < end; addr += page_size) {
-   if (local)
-   __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
-   else
-   __tlbie_va(addr, pid, ap, 

[PATCH v2 3/7] powerpc/64s/radix: optimize TLB range flush barriers

2017-11-07 Thread Nicholas Piggin
Short range flushes issue a sequence of tlbie(l) instructions for
individual effective addresses. These do not all require individual
barrier sequences, only one covering all tlbie(l) instructions.

Commit f7327e0ba3 ("powerpc/mm/radix: Remove unnecessary ptesync")
made a similar optimization for tlbiel for PID flushing.

For tlbie, the ISA says:

The tlbsync instruction provides an ordering function for the
effects of all tlbie instructions executed by the thread executing
the tlbsync instruction, with respect to the memory barrier
created by a subsequent ptesync instruction executed by the same
thread.

Time to munmap 30 pages of memory (after mmap, touch):
 local   global
vanilla  10.9us  22.3us
patched   3.4us  14.4us

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/mm/tlb-radix.c | 41 -
 1 file changed, 32 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 6e77ed2d7c6c..49e71c68f5b1 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -84,7 +84,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned 
long ric)
trace_tlbie(0, 0, rb, rs, ric, prs, r);
 }
 
-static inline void _tlbiel_va(unsigned long va, unsigned long pid,
+static inline void __tlbiel_va(unsigned long va, unsigned long pid,
  unsigned long ap, unsigned long ric)
 {
unsigned long rb,rs,prs,r;
@@ -95,14 +95,20 @@ static inline void _tlbiel_va(unsigned long va, unsigned 
long pid,
prs = 1; /* process scoped */
r = 1;   /* raidx format */
 
-   asm volatile("ptesync": : :"memory");
asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
 : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : 
"memory");
-   asm volatile("ptesync": : :"memory");
trace_tlbie(0, 1, rb, rs, ric, prs, r);
 }
 
-static inline void _tlbie_va(unsigned long va, unsigned long pid,
+static inline void _tlbiel_va(unsigned long va, unsigned long pid,
+ unsigned long ap, unsigned long ric)
+{
+   asm volatile("ptesync": : :"memory");
+   __tlbiel_va(va, pid, ap, ric);
+   asm volatile("ptesync": : :"memory");
+}
+
+static inline void __tlbie_va(unsigned long va, unsigned long pid,
 unsigned long ap, unsigned long ric)
 {
unsigned long rb,rs,prs,r;
@@ -113,13 +119,20 @@ static inline void _tlbie_va(unsigned long va, unsigned 
long pid,
prs = 1; /* process scoped */
r = 1;   /* raidx format */
 
-   asm volatile("ptesync": : :"memory");
asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
 : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : 
"memory");
-   asm volatile("eieio; tlbsync; ptesync": : :"memory");
trace_tlbie(0, 0, rb, rs, ric, prs, r);
 }
 
+static inline void _tlbie_va(unsigned long va, unsigned long pid,
+unsigned long ap, unsigned long ric)
+{
+   asm volatile("ptesync": : :"memory");
+   __tlbie_va(va, pid, ap, ric);
+   asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+
 /*
  * Base TLB flushing operations:
  *
@@ -341,13 +354,19 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, 
unsigned long start,
_tlbiel_pid(pid, RIC_FLUSH_TLB);
else
_tlbie_pid(pid, RIC_FLUSH_TLB);
+
} else {
+   asm volatile("ptesync": : :"memory");
for (addr = start; addr < end; addr += page_size) {
if (local)
-   _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+   __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
else
-   _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+   __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
}
+   if (local)
+   asm volatile("ptesync": : :"memory");
+   else
+   asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
 
preempt_enable();
@@ -380,6 +399,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, 
unsigned long addr)
_tlbie_pid(pid, RIC_FLUSH_PWC);
 
/* Then iterate the pages */
+   asm volatile("ptesync": : :"memory");
end = addr + HPAGE_PMD_SIZE;
for (; addr < end; addr += PAGE_SIZE) {
if (local)
@@ -387,7 +407,10 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, 
unsigned long addr)
else
_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
}
-
+   if (local)
+   asm volatile("ptesync": : :"memory");
+   else
+   asm volatile("eieio; tlbsync; ptesync": : :"memory");
preempt_enable();
 }
 #endif /*