[PATCH 18/18] powerpc/fault: Use analyse_instr() to check for store with updates to sp

2019-11-25 Thread Jordan Niethe
A user-mode access to an address a long way below the stack pointer is
only valid if the instruction is one that would update the stack pointer
to the address accessed. This is currently checked by looking directly
at the instruction's op-code, which does not take prefixed instructions
into account. Instead of decoding the instruction ourselves, use
analyse_instr() to determine whether this is a store instruction that
will update the stack pointer.

Something to note is that there are currently no store-with-update
prefixed instructions, and none are planned; prefixed update-form loads
and stores do not exist. So this patch is probably not strictly needed,
but it may still be preferable to use analyse_instr() rather than open
coding the test.
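
For illustration, here is roughly how an analysed `stwu r1,-16(r1)'
(encoding 0x9421fff0) satisfies the new check. The field names come from
the sstep API, but the concrete encoding and the call below are the
editor's example, not part of the patch:

	struct instruction_op op;

	/* stwu r1,-16(r1): a store using update addressing on r1 */
	analyse_instr(&op, regs, 0x9421fff0, 0);

	/* Now GETTYPE(op.type) == STORE, op.type has UPDATE set and
	 * op.update_reg == 1, so store_updates_sp(&op) returns true. */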

Signed-off-by: Jordan Niethe 
---
 arch/powerpc/mm/fault.c | 39 +++
 1 file changed, 11 insertions(+), 28 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index b5047f9b5dec..cb78b3ca1800 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -41,37 +41,17 @@
 #include 
 #include 
 #include 
+#include <asm/sstep.h>
 
 /*
  * Check whether the instruction inst is a store using
  * an update addressing form which will update r1.
  */
-static bool store_updates_sp(unsigned int inst)
+static bool store_updates_sp(struct instruction_op *op)
 {
-   /* check for 1 in the rA field */
-   if (((inst >> 16) & 0x1f) != 1)
-   return false;
-   /* check major opcode */
-   switch (inst >> 26) {
-   case OP_STWU:
-   case OP_STBU:
-   case OP_STHU:
-   case OP_STFSU:
-   case OP_STFDU:
-   return true;
-   case OP_STD:/* std or stdu */
-   return (inst & 3) == 1;
-   case OP_31:
-   /* check minor opcode */
-   switch ((inst >> 1) & 0x3ff) {
-   case OP_31_XOP_STDUX:
-   case OP_31_XOP_STWUX:
-   case OP_31_XOP_STBUX:
-   case OP_31_XOP_STHUX:
-   case OP_31_XOP_STFSUX:
-   case OP_31_XOP_STFDUX:
+   if (GETTYPE(op->type) == STORE) {
+   if ((op->type & UPDATE) && (op->update_reg == 1))
return true;
-   }
}
return false;
 }
@@ -278,14 +258,17 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
 
if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
access_ok(nip, sizeof(*nip))) {
-   unsigned int inst;
+   unsigned int inst, sufx;
+   struct instruction_op op;
int res;
 
pagefault_disable();
-   res = __get_user_inatomic(inst, nip);
+   res = __get_user_instr_inatomic(inst, sufx, nip);
pagefault_enable();
-   if (!res)
-   return !store_updates_sp(inst);
+   if (!res) {
+   analyse_instr(&op, regs, inst, sufx);
+   return !store_updates_sp(&op);
+   }
*must_retry = true;
}
return true;
-- 
2.20.1



[PATCH 17/18] powerpc: Add prefix support to mce_find_instr_ea_and_pfn()

2019-11-25 Thread Jordan Niethe
mce_find_instr_ea_and_phys() analyses an instruction to determine the
effective address that caused the machine check. Update it to load and
pass the suffix to analyse_instr() for prefixed instructions.

Signed-off-by: Jordan Niethe 
---
 arch/powerpc/kernel/mce_power.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index d862bb549158..68e81fcbdf07 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -365,7 +365,7 @@ static int mce_find_instr_ea_and_phys(struct pt_regs *regs, uint64_t *addr,
 * in real-mode is tricky and can lead to recursive
 * faults
 */
-   int instr;
+   int instr, sufx = 0;
unsigned long pfn, instr_addr;
struct instruction_op op;
struct pt_regs tmp = *regs;
@@ -374,7 +374,9 @@ static int mce_find_instr_ea_and_phys(struct pt_regs *regs, uint64_t *addr,
if (pfn != ULONG_MAX) {
instr_addr = (pfn << PAGE_SHIFT) + (regs->nip & ~PAGE_MASK);
instr = *(unsigned int *)(instr_addr);
-   if (!analyse_instr(&op, &tmp, instr, 0)) {
+   if (IS_PREFIX(instr))
+   sufx = *(unsigned int *)(instr_addr + 4);
+   if (!analyse_instr(&op, &tmp, instr, sufx)) {
pfn = addr_to_pfn(regs, op.ea);
*addr = op.ea;
*phys_addr = (pfn << PAGE_SHIFT);
-- 
2.20.1



[PATCH 16/18] powerpc/hw_breakpoints: Initial support for prefixed instructions

2019-11-25 Thread Jordan Niethe
Currently when getting an instruction to emulate in
hw_breakpoint_handler() we do not load the suffix of a prefixed
instruction. Ensure we load the suffix if the instruction we need to
emulate is a prefixed instruction.

Signed-off-by: Jordan Niethe 
---
 arch/powerpc/kernel/hw_breakpoint.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index f4530961998c..f7e1af8b9eae 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -240,15 +240,15 @@ dar_range_overlaps(unsigned long dar, int size, struct arch_hw_breakpoint *info)
 static bool stepping_handler(struct pt_regs *regs, struct perf_event *bp,
 struct arch_hw_breakpoint *info)
 {
-   unsigned int instr = 0;
+   unsigned int instr = 0, sufx = 0;
int ret, type, size;
struct instruction_op op;
unsigned long addr = info->address;
 
-   if (__get_user_inatomic(instr, (unsigned int *)regs->nip))
+   if (__get_user_instr_inatomic(instr, sufx, (unsigned int *)regs->nip))
goto fail;
 
-   ret = analyse_instr(&op, regs, instr, 0);
+   ret = analyse_instr(&op, regs, instr, sufx);
type = GETTYPE(op.type);
size = GETSIZE(op.type);
 
@@ -272,7 +272,7 @@ static bool stepping_handler(struct pt_regs *regs, struct perf_event *bp,
return false;
}
 
-   if (!emulate_step(regs, instr, 0))
+   if (!emulate_step(regs, instr, sufx))
goto fail;
 
return true;
-- 
2.20.1



[PATCH 15/18] powerpc/uprobes: Add support for prefixed instructions

2019-11-25 Thread Jordan Niethe
Uprobes can execute instructions out of line. Increase the size of the
buffer used for this so that it works for prefixed instructions. Take
into account the length of prefixed instructions when fixing up the nip.

Signed-off-by: Jordan Niethe 
---
 arch/powerpc/include/asm/uprobes.h | 18 ++
 arch/powerpc/kernel/uprobes.c  |  4 ++--
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/uprobes.h b/arch/powerpc/include/asm/uprobes.h
index 2bbdf27d09b5..5b5e8a3d2f55 100644
--- a/arch/powerpc/include/asm/uprobes.h
+++ b/arch/powerpc/include/asm/uprobes.h
@@ -14,18 +14,28 @@
 
 typedef ppc_opcode_t uprobe_opcode_t;
 
+/*
+ * We have to ensure we have enough space for prefixed instructions, which
+ * are double the size of a word instruction, i.e. 8 bytes. However,
+ * sometimes it is simpler to treat a prefixed instruction like 2 word
+ * instructions.
+ */
 #define MAX_UINSN_BYTES	4
-#define UPROBE_XOL_SLOT_BYTES  (MAX_UINSN_BYTES)
+#define UPROBE_XOL_SLOT_BYTES  (2 * MAX_UINSN_BYTES)
 
 /* The following alias is needed for reference from arch-agnostic code */
 #define UPROBE_SWBP_INSN   BREAKPOINT_INSTRUCTION
 #define UPROBE_SWBP_INSN_SIZE  4 /* swbp insn size in bytes */
 
 struct arch_uprobe {
+/*
+ * Ensure there is enough space for prefixed instructions. Prefixed
+ * instructions must not cross 64-byte boundaries.
+ */
union {
-   u32 insn;
-   u32 ixol;
-   };
+   uprobe_opcode_t insn[2];
+   uprobe_opcode_t ixol[2];
+   } __aligned(64);
 };
 
 struct arch_uprobe_task {
diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c
index ab1077dc6148..cfcea6946f8b 100644
--- a/arch/powerpc/kernel/uprobes.c
+++ b/arch/powerpc/kernel/uprobes.c
@@ -111,7 +111,7 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
 * support doesn't exist and have to fix-up the next instruction
 * to be executed.
 */
-   regs->nip = utask->vaddr + MAX_UINSN_BYTES;
+   regs->nip = utask->vaddr + ((IS_PREFIX(auprobe->insn[0])) ? 8 : 4);
 
user_disable_single_step(current);
return 0;
@@ -173,7 +173,7 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
 * emulate_step() returns 1 if the insn was successfully emulated.
 * For all other cases, we need to single-step in hardware.
 */
-   ret = emulate_step(regs, auprobe->insn, 0);
+   ret = emulate_step(regs, auprobe->insn[0], auprobe->insn[1]);
if (ret > 0)
return true;
 
-- 
2.20.1



[PATCH 14/18] powerpc/kprobes: Support kprobes on prefixed instructions

2019-11-25 Thread Jordan Niethe
A prefixed instruction is composed of a word prefix followed by a word
suffix. It does not make sense to be able to have a kprobe on the suffix
of a prefixed instruction, so make this impossible.

Kprobes work by replacing an instruction with a trap and saving that
instruction to be single stepped out of place later. Currently there is
not enough space allocated to keep a prefixed instruction for single
stepping. Increase the amount of space allocated for holding the
instruction copy.

kprobe_post_handler() expects all instructions to be 4 bytes long which
means that it does not function correctly for prefixed instructions.
Add checks for prefixed instructions which will use a length of 8 bytes
instead.

For optprobes we normally patch in a load of the probed instruction
into r4 before calling emulate_step(). Now make space and patch in
loading the suffix into r5 as well.
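
As a sketch of the length handling described above (editor's
illustration, assuming the IS_PREFIX() macro introduced earlier in the
series; the corresponding kprobe_post_handler() hunk is not shown in
this excerpt):

	/* Step past the probed instruction: 8 bytes if it was prefixed,
	 * otherwise the usual 4. */
	int len = IS_PREFIX(p->ainsn.insn[0]) ? 8 : 4;

	regs->nip = (unsigned long)p->addr + len;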

Signed-off-by: Jordan Niethe 
---
 arch/powerpc/include/asm/kprobes.h   |  5 +--
 arch/powerpc/kernel/kprobes.c| 46 +---
 arch/powerpc/kernel/optprobes.c  | 31 +++
 arch/powerpc/kernel/optprobes_head.S |  6 
 4 files changed, 62 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h
index 66b3f2983b22..1f03a1cacb1e 100644
--- a/arch/powerpc/include/asm/kprobes.h
+++ b/arch/powerpc/include/asm/kprobes.h
@@ -38,12 +38,13 @@ extern kprobe_opcode_t optprobe_template_entry[];
 extern kprobe_opcode_t optprobe_template_op_address[];
 extern kprobe_opcode_t optprobe_template_call_handler[];
 extern kprobe_opcode_t optprobe_template_insn[];
+extern kprobe_opcode_t optprobe_template_sufx[];
 extern kprobe_opcode_t optprobe_template_call_emulate[];
 extern kprobe_opcode_t optprobe_template_ret[];
 extern kprobe_opcode_t optprobe_template_end[];
 
-/* Fixed instruction size for powerpc */
-#define MAX_INSN_SIZE  1
+/* Prefixed instructions are two words */
+#define MAX_INSN_SIZE  2
 #define MAX_OPTIMIZED_LENGTH   sizeof(kprobe_opcode_t) /* 4 bytes */
 #define MAX_OPTINSN_SIZE	(optprobe_template_end - optprobe_template_entry)
 #define RELATIVEJUMP_SIZE  sizeof(kprobe_opcode_t) /* 4 bytes */
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 7303fe3856cc..aa15b3480385 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -104,17 +104,30 @@ kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset)
 
 int arch_prepare_kprobe(struct kprobe *p)
 {
+   int len;
int ret = 0;
+   struct kprobe *prev;
kprobe_opcode_t insn = *p->addr;
+   kprobe_opcode_t prfx = *(p->addr - 1);
 
+   preempt_disable();
if ((unsigned long)p->addr & 0x03) {
printk("Attempt to register kprobe at an unaligned address\n");
ret = -EINVAL;
} else if (IS_MTMSRD(insn) || IS_RFID(insn) || IS_RFI(insn)) {
printk("Cannot register a kprobe on rfi/rfid or mtmsr[d]\n");
ret = -EINVAL;
+   } else if (IS_PREFIX(prfx)) {
+   printk("Cannot register a kprobe on the second word of prefixed 
instruction\n");
+   ret = -EINVAL;
+   }
+   prev = get_kprobe(p->addr - 1);
+   if (prev && IS_PREFIX(*prev->ainsn.insn)) {
+   printk("Cannot register a kprobe on the second word of prefixed 
instruction\n");
+   ret = -EINVAL;
}
 
+
/* insn must be on a special executable page on ppc64.  This is
 * not explicitly required on ppc32 (right now), but it doesn't hurt */
if (!ret) {
@@ -124,14 +137,18 @@ int arch_prepare_kprobe(struct kprobe *p)
}
 
if (!ret) {
-   memcpy(p->ainsn.insn, p->addr,
-   MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+   if (IS_PREFIX(insn))
+   len = MAX_INSN_SIZE * sizeof(kprobe_opcode_t);
+   else
+   len = sizeof(kprobe_opcode_t);
+   memcpy(p->ainsn.insn, p->addr, len);
p->opcode = *p->addr;
flush_icache_range((unsigned long)p->ainsn.insn,
(unsigned long)p->ainsn.insn + sizeof(kprobe_opcode_t));
}
 
p->ainsn.boostable = 0;
+   preempt_enable_no_resched();
return ret;
 }
 NOKPROBE_SYMBOL(arch_prepare_kprobe);
@@ -216,10 +233,11 @@ NOKPROBE_SYMBOL(arch_prepare_kretprobe);
 static int try_to_emulate(struct kprobe *p, struct pt_regs *regs)
 {
int ret;
-   unsigned int insn = *p->ainsn.insn;
+   unsigned int insn = p->ainsn.insn[0];
+   unsigned int sufx = p->ainsn.insn[1];
 
/* regs->nip is also adjusted if emulate_step returns 1 */
-   ret = emulate_step(regs, insn, 0);
+   ret = emulate_step(regs, insn, sufx);
if (ret > 0) {
/*
 * Once this instruction has been boos

[PATCH 13/18] powerpc/xmon: Dump prefixed instructions

2019-11-25 Thread Jordan Niethe
Currently when xmon is dumping instructions it reads a word at a time
and then prints that instruction (either as a hex number or by
disassembling it). For prefixed instructions it would be nice to show
the prefix and suffix together. Use read_instr() so that if a prefix is
encountered its suffix is loaded too. Then print these in the form:
prefix:suffix
Xmon uses the disassembly routines from GNU binutils. These do not yet
support prefixed instructions, so prefixed instructions are not
disassembled for now.

Signed-off-by: Jordan Niethe 
---
 arch/powerpc/xmon/xmon.c | 50 +++-
 1 file changed, 39 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 93259a06eadc..dc8b1c7b3e1b 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -2900,6 +2900,21 @@ prdump(unsigned long adrs, long ndump)
}
 }
 
+static bool instrs_are_equal(unsigned long insta, unsigned long sufxa,
+unsigned long instb, unsigned long sufxb)
+{
+   if (insta != instb)
+   return false;
+
+   if (!IS_PREFIX(insta) && !IS_PREFIX(instb))
+   return true;
+
+   if (IS_PREFIX(insta) && IS_PREFIX(instb))
+   return sufxa == sufxb;
+
+   return false;
+}
+
 typedef int (*instruction_dump_func)(unsigned long inst, unsigned long addr);
 
 static int
@@ -2908,12 +2923,11 @@ generic_inst_dump(unsigned long adr, long count, int praddr,
 {
int nr, dotted;
unsigned long first_adr;
-   unsigned int inst, last_inst = 0;
-   unsigned char val[4];
+   unsigned int inst, sufx, last_inst = 0, last_sufx = 0;
 
dotted = 0;
-   for (first_adr = adr; count > 0; --count, adr += 4) {
-   nr = mread(adr, val, 4);
+   for (first_adr = adr; count > 0; --count, adr += nr) {
+   nr = read_instr(adr, &inst, &sufx);
if (nr == 0) {
if (praddr) {
const char *x = fault_chars[fault_type];
@@ -2921,8 +2935,9 @@ generic_inst_dump(unsigned long adr, long count, int praddr,
}
break;
}
-   inst = GETWORD(val);
-   if (adr > first_adr && inst == last_inst) {
+   if (adr > first_adr && instrs_are_equal(inst, sufx,
+   last_inst,
+   last_sufx)) {
if (!dotted) {
printf(" ...\n");
dotted = 1;
@@ -2931,11 +2946,24 @@ generic_inst_dump(unsigned long adr, long count, int praddr,
}
dotted = 0;
last_inst = inst;
-   if (praddr)
-   printf(REG"  %.8x", adr, inst);
-   printf("\t");
-   dump_func(inst, adr);
-   printf("\n");
+   last_sufx = sufx;
+   if (IS_PREFIX(inst)) {
+   if (praddr)
+   printf(REG"  %.8x:%.8x", adr, inst, sufx);
+   printf("\t");
+   /*
+* Just use this until binutils ppc disassembly
+* prints prefixed instructions.
+*/
+   printf("%.8x:%.8x", inst, sufx);
+   printf("\n");
+   } else {
+   if (praddr)
+   printf(REG"  %.8x", adr, inst);
+   printf("\t");
+   dump_func(inst, adr);
+   printf("\n");
+   }
}
return adr - first_adr;
 }
-- 
2.20.1



[PATCH 12/18] powerpc/xmon: Add initial support for prefixed instructions

2019-11-25 Thread Jordan Niethe
A prefixed instruction is composed of a word prefix and a word suffix.
It does not make sense to be able to have a breakpoint on the suffix of
a prefixed instruction, so make this impossible.

When leaving xmon_core() we check whether we are currently at a
breakpoint. If we are, the breakpoint needs to be stepped past.
Initially emulate_step() is tried, but if this fails then we need
to execute the saved instruction out of line. The NIP is set to the
address of bpt::instr[] for the current breakpoint.  bpt::instr[]
contains the instruction replaced by the breakpoint, followed by a trap
instruction.  After bpt::instr[0] is executed and we hit the trap we
enter back into xmon_bpt(). We know that if we got here and the offset
indicates we are at bpt::instr[1] then we have just executed out of line,
so we can put the NIP back to the instruction after the breakpoint
location and continue on.

Adding prefixed instructions complicates this, as bpt::instr[1] needs
to be used to hold the suffix. To deal with this make bpt::instr[] big
enough for three words: bpt::instr[2] contains the trap, and in the case
of word instructions bpt::instr[1] is padded with a nop.

No support is added for disassembling prefixed instructions.
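
The resulting bpt::instr[] layout is (editor's illustration of the
scheme just described):

	/*
	 *                  instr[0]    instr[1]    instr[2]
	 * word insn:      [ insn   ]  [ nop    ]  [ trap ]
	 * prefixed insn:  [ prefix ]  [ suffix ]  [ trap ]
	 */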

Signed-off-by: Jordan Niethe 
---
 arch/powerpc/xmon/xmon.c | 82 ++--
 1 file changed, 71 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index f47bd843dc52..93259a06eadc 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -97,7 +97,8 @@ static long *xmon_fault_jmp[NR_CPUS];
 /* Breakpoint stuff */
 struct bpt {
unsigned long   address;
-	unsigned int	instr[2];
+	/* Prefixed instructions can not cross 64-byte boundaries */
+	unsigned int	instr[3] __aligned(64);
 	atomic_t	ref_count;
int enabled;
unsigned long   pad;
@@ -113,6 +114,7 @@ static struct bpt bpts[NBPTS];
 static struct bpt dabr;
 static struct bpt *iabr;
 static unsigned bpinstr = 0x7fe00008;	/* trap */
+static unsigned nopinstr = 0x60000000;	/* nop */
 
 #define BP_NUM(bp) ((bp) - bpts + 1)
 
@@ -120,6 +122,7 @@ static unsigned bpinstr = 0x7fe00008;	/* trap */
 static int cmds(struct pt_regs *);
 static int mread(unsigned long, void *, int);
 static int mwrite(unsigned long, void *, int);
+static int read_instr(unsigned long, unsigned int *, unsigned int *);
 static int handle_fault(struct pt_regs *);
 static void byterev(unsigned char *, int);
 static void memex(void);
@@ -705,7 +708,8 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
if ((regs->msr & (MSR_IR|MSR_PR|MSR_64BIT)) == (MSR_IR|MSR_64BIT)) {
bp = at_breakpoint(regs->nip);
if (bp != NULL) {
-   int stepped = emulate_step(regs, bp->instr[0], 0);
+   int stepped = emulate_step(regs, bp->instr[0],
+  bp->instr[1]);
if (stepped == 0) {
regs->nip = (unsigned long) &bp->instr[0];
atomic_inc(&bp->ref_count);
@@ -760,8 +764,8 @@ static int xmon_bpt(struct pt_regs *regs)
 
/* Are we at the trap at bp->instr[1] for some bp? */
bp = in_breakpoint_table(regs->nip, &offset);
-   if (bp != NULL && offset == 4) {
-   regs->nip = bp->address + 4;
+   if (bp != NULL && (offset == 4 || offset == 8)) {
+   regs->nip = bp->address + offset;
atomic_dec(&bp->ref_count);
return 1;
}
@@ -863,7 +867,8 @@ static struct bpt *in_breakpoint_table(unsigned long nip, unsigned long *offp)
return NULL;
off %= sizeof(struct bpt);
if (off != offsetof(struct bpt, instr[0])
-   && off != offsetof(struct bpt, instr[1]))
+   && off != offsetof(struct bpt, instr[1])
+   && off != offsetof(struct bpt, instr[2]))
return NULL;
*offp = off - offsetof(struct bpt, instr[0]);
return (struct bpt *) (nip - off);
@@ -880,9 +885,18 @@ static struct bpt *new_breakpoint(unsigned long a)
 
for (bp = bpts; bp < &bpts[NBPTS]; ++bp) {
if (!bp->enabled && atomic_read(&bp->ref_count) == 0) {
+   /*
+* Prefixed instructions are two words, but regular
+* instructions are only one. Use a nop to pad out the
+* regular instructions so that we can place the trap
+* at the same place. For prefixed instructions the nop
+* will get overwritten during insert_bpts().
+*/
bp->address = a;
-   bp->instr[1] = bpinstr;
+   bp->instr[1] = nopinstr;
store_inst(&bp->instr[1]);
+ 

[PATCH 11/18] powerpc/traps: Check for prefixed instructions in facility_unavailable_exception()

2019-11-25 Thread Jordan Niethe
If prefixed instructions are made unavailable by the [H]FSCR, attempting
to use them will cause a facility unavailable exception. Add "PREFIX" to
the facility_strings[].

Currently there are no prefixed instructions that are actually emulated
by emulate_instruction() within facility_unavailable_exception().
However, when caused by a prefixed instruction the SRR1 PREFIXED bit is
set. Prepare for dealing with emulated prefixed instructions by checking
for this bit.
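
The check the message anticipates would look something like this
(editor's sketch only, not part of the diff below; SRR1_PREFIXED is
added earlier in the series, and on Book3S the SRR1 bits are visible in
regs->msr on exception entry):

	/* after successfully emulating the faulting instruction */
	regs->nip += (regs->msr & SRR1_PREFIXED) ? 8 : 4;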

Signed-off-by: Jordan Niethe 
---
 arch/powerpc/kernel/traps.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 8e26f464..92057830b9b6 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1726,6 +1726,7 @@ void facility_unavailable_exception(struct pt_regs *regs)
[FSCR_TAR_LG] = "TAR",
[FSCR_MSGP_LG] = "MSGP",
[FSCR_SCV_LG] = "SCV",
+   [FSCR_PREFIX_LG] = "PREFIX",
};
char *facility = "unknown";
u64 value;
-- 
2.20.1



[PATCH 10/18] powerpc: Support prefixed instructions in alignment handler

2019-11-25 Thread Jordan Niethe
Alignment interrupts can be caused by prefixed instructions accessing
memory. In the alignment handler the instruction that caused the
exception is loaded and an attempt is made to emulate it. If the
instruction is a prefixed instruction, load the prefix and suffix and
emulate both. After emulating, increment the NIP by 8.

Prefixed instructions are not permitted to cross 64-byte boundaries. If
they do, the alignment interrupt is invoked with the SRR1 BOUNDARY bit
set. If this occurs send a SIGBUS to the offending process if in user
mode. If in kernel mode call bad_page_fault().

Signed-off-by: Jordan Niethe 
---
 arch/powerpc/kernel/align.c |  8 +---
 arch/powerpc/kernel/traps.c | 17 -
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
index 245e79792a01..53493404c25c 100644
--- a/arch/powerpc/kernel/align.c
+++ b/arch/powerpc/kernel/align.c
@@ -293,7 +293,7 @@ static int emulate_spe(struct pt_regs *regs, unsigned int reg,
 
 int fix_alignment(struct pt_regs *regs)
 {
-   unsigned int instr;
+   unsigned int instr, sufx;
struct instruction_op op;
int r, type;
 
@@ -303,13 +303,15 @@ int fix_alignment(struct pt_regs *regs)
 */
CHECK_FULL_REGS(regs);
 
-   if (unlikely(__get_user(instr, (unsigned int __user *)regs->nip)))
+   if (unlikely(__get_user_instr(instr, sufx,
+(unsigned int __user *)regs->nip)))
return -EFAULT;
if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE)) {
/* We don't handle PPC little-endian any more... */
if (cpu_has_feature(CPU_FTR_PPC_LE))
return -EIO;
instr = swab32(instr);
+   sufx = swab32(sufx);
}
 
 #ifdef CONFIG_SPE
@@ -334,7 +336,7 @@ int fix_alignment(struct pt_regs *regs)
if ((instr & 0xfc0006fe) == (PPC_INST_COPY & 0xfc0006fe))
return -EIO;
 
-   r = analyse_instr(&op, regs, instr, 0);
+   r = analyse_instr(&op, regs, instr, sufx);
if (r < 0)
return -EINVAL;
 
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 014ff0701f24..8e26f464 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -583,6 +583,8 @@ static inline int check_io_access(struct pt_regs *regs)
 #define REASON_ILLEGAL		(ESR_PIL | ESR_PUO)
 #define REASON_PRIVILEGED	ESR_PPR
 #define REASON_TRAP		ESR_PTR
+#define REASON_PREFIXED		0
+#define REASON_BOUNDARY		0
 
 /* single-step stuff */
 #define single_stepping(regs)  (current->thread.debug.dbcr0 & DBCR0_IC)
@@ -597,6 +599,8 @@ static inline int check_io_access(struct pt_regs *regs)
 #define REASON_ILLEGAL		SRR1_PROGILL
 #define REASON_PRIVILEGED	SRR1_PROGPRIV
 #define REASON_TRAP		SRR1_PROGTRAP
+#define REASON_PREFIXED		SRR1_PREFIXED
+#define REASON_BOUNDARY		SRR1_BOUNDARY
 
 #define single_stepping(regs)  ((regs)->msr & MSR_SE)
 #define clear_single_step(regs)((regs)->msr &= ~MSR_SE)
@@ -1593,11 +1597,20 @@ void alignment_exception(struct pt_regs *regs)
 {
enum ctx_state prev_state = exception_enter();
int sig, code, fixed = 0;
+   unsigned long  reason;
 
/* We restore the interrupt state now */
if (!arch_irq_disabled_regs(regs))
local_irq_enable();
 
+   reason = get_reason(regs);
+
+   if (reason & REASON_BOUNDARY) {
+   sig = SIGBUS;
+   code = BUS_ADRALN;
+   goto bad;
+   }
+
if (tm_abort_check(regs, TM_CAUSE_ALIGNMENT | TM_CAUSE_PERSISTENT))
goto bail;
 
@@ -1606,7 +1619,8 @@ void alignment_exception(struct pt_regs *regs)
fixed = fix_alignment(regs);
 
if (fixed == 1) {
-   regs->nip += 4; /* skip over emulated instruction */
+   /* skip over emulated instruction */
+   regs->nip += (reason & REASON_PREFIXED) ? 8 : 4;
emulate_single_step(regs);
goto bail;
}
@@ -1619,6 +1633,7 @@ void alignment_exception(struct pt_regs *regs)
sig = SIGBUS;
code = BUS_ADRALN;
}
+bad:
if (user_mode(regs))
_exception(sig, regs, code, regs->dar);
else
-- 
2.20.1



[PATCH 09/18] powerpc sstep: Add support for prefixed fixed-point arithmetic

2019-11-25 Thread Jordan Niethe
This adds emulation support for the following prefixed Fixed-Point
Arithmetic instructions:
  * Prefixed Add Immediate (paddi)

Signed-off-by: Jordan Niethe 
---
 arch/powerpc/lib/sstep.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 9ae8d177b67f..1bb0c79cb774 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -2776,6 +2776,10 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
break;
op->ea = mlsd_8lsd_ea(instr, sufx, regs);
switch (sufxopcode) {
+   case 14:/* paddi */
+   op->type = COMPUTE | PREFIXED;
+   op->val = op->ea;
+   goto compute_done;
case 32:/* plwz */
op->type = MKOP(LOAD, PREFIXED, 4);
break;
-- 
2.20.1



[PATCH 08/18] powerpc sstep: Add support for prefixed VSX load/stores

2019-11-25 Thread Jordan Niethe
This adds emulation support for the following prefixed VSX load/stores:
  * Prefixed Load VSX Scalar Doubleword (plxsd)
  * Prefixed Load VSX Scalar Single-Precision (plxssp)
  * Prefixed Load VSX Vector [0|1]  (plxv, plxv0, plxv1)
  * Prefixed Store VSX Scalar Doubleword (pstxsd)
  * Prefixed Store VSX Scalar Single-Precision (pstxssp)
  * Prefixed Store VSX Vector [0|1] (pstxv, pstxv0, pstxv1)

Signed-off-by: Jordan Niethe 
---
 arch/powerpc/lib/sstep.c | 42 
 1 file changed, 42 insertions(+)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 9113b9a21ae9..9ae8d177b67f 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -2713,6 +2713,48 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
case 41:/* plwa */
op->type = MKOP(LOAD, PREFIXED | SIGNEXT, 4);
break;
+   case 42:/* plxsd */
+   op->reg = rd + 32;
+   op->type = MKOP(LOAD_VSX, PREFIXED, 8);
+   op->element_size = 8;
+   op->vsx_flags = VSX_CHECK_VEC;
+   break;
+   case 43:/* plxssp */
+   op->reg = rd + 32;
+   op->type = MKOP(LOAD_VSX, PREFIXED, 4);
+   op->element_size = 8;
+   op->vsx_flags = VSX_FPCONV | VSX_CHECK_VEC;
+   break;
+   case 46:/* pstxsd */
+   op->reg = rd + 32;
+   op->type = MKOP(STORE_VSX, PREFIXED, 8);
+   op->element_size = 8;
+   op->vsx_flags = VSX_CHECK_VEC;
+   break;
+   case 47:/* pstxssp */
+   op->reg = rd + 32;
+   op->type = MKOP(STORE_VSX, PREFIXED, 4);
+   op->element_size = 8;
+   op->vsx_flags = VSX_FPCONV | VSX_CHECK_VEC;
+   break;
+   case 51:/* plxv1 */
+   op->reg += 32;
+
+   /* fallthru */
+   case 50:/* plxv0 */
+   op->type = MKOP(LOAD_VSX, PREFIXED, 16);
+   op->element_size = 16;
+   op->vsx_flags = VSX_CHECK_VEC;
+   break;
+   case 55:/* pstxv1 */
+   op->reg = rd + 32;
+
+   /* fallthru */
+   case 54:/* pstxv0 */
+   op->type = MKOP(STORE_VSX, PREFIXED, 16);
+   op->element_size = 16;
+   op->vsx_flags = VSX_CHECK_VEC;
+   break;
case 56:/* plq */
op->type = MKOP(LOAD, PREFIXED, 16);
break;
-- 
2.20.1



[PATCH 07/18] powerpc sstep: Add support for prefixed floating-point load/stores

2019-11-25 Thread Jordan Niethe
This adds emulation support for the following prefixed floating-point
load/stores:
  * Prefixed Load Floating-Point Single (plfs)
  * Prefixed Load Floating-Point Double (plfd)
  * Prefixed Store Floating-Point Single (pstfs)
  * Prefixed Store Floating-Point Double (pstfd)

Signed-off-by: Jordan Niethe 
---
 arch/powerpc/lib/sstep.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 4f5ad1f602d8..9113b9a21ae9 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -2755,6 +2755,18 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
case 44:/* psth */
op->type = MKOP(STORE, PREFIXED, 2);
break;
+   case 48:/* plfs */
+   op->type = MKOP(LOAD_FP, PREFIXED | FPCONV, 4);
+   break;
+   case 50:/* plfd */
+   op->type = MKOP(LOAD_FP, PREFIXED, 8);
+   break;
+   case 52:/* pstfs */
+   op->type = MKOP(STORE_FP, PREFIXED | FPCONV, 4);
+   break;
+   case 54:/* pstfd */
+   op->type = MKOP(STORE_FP, PREFIXED, 8);
+   break;
}
break;
case 3: /* Type 11 Modified Register-to-Register */
-- 
2.20.1



[PATCH 06/18] powerpc sstep: Add support for prefixed integer load/stores

2019-11-25 Thread Jordan Niethe
This adds emulation support for the following prefixed integer
load/stores:
  * Prefixed Load Byte and Zero (plbz)
  * Prefixed Load Halfword and Zero (plhz)
  * Prefixed Load Halfword Algebraic (plha)
  * Prefixed Load Word and Zero (plwz)
  * Prefixed Load Word Algebraic (plwa)
  * Prefixed Load Doubleword (pld)
  * Prefixed Store Byte (pstb)
  * Prefixed Store Halfword (psth)
  * Prefixed Store Word (pstw)
  * Prefixed Store Doubleword (pstd)
  * Prefixed Load Quadword (plq)
  * Prefixed Store Quadword (pstq)
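
All of these share an effective-address calculation that sign-extends a
34-bit displacement built from prefix and suffix bits. A compact
equivalent of the two-step dd/ea sequence in mlsd_8lsd_ea() below
(editor's sketch, assuming 64-bit longs):

	/* Sign-extend a 34-bit displacement: shift it up so that bit 33
	 * becomes the sign bit, then arithmetic-shift back down. */
	static inline long sign_extend34(unsigned long d)
	{
		return ((long)(d << 30)) >> 30;
	}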

Signed-off-by: Jordan Niethe 
---
 arch/powerpc/lib/sstep.c | 110 +++
 1 file changed, 110 insertions(+)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index ade3f5eba2e5..4f5ad1f602d8 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -187,6 +187,43 @@ static nokprobe_inline unsigned long xform_ea(unsigned int instr,
return ea;
 }
 
+/*
+ * Calculate effective address for a MLS:D-form / 8LS:D-form prefixed instruction
+ */
+static nokprobe_inline unsigned long mlsd_8lsd_ea(unsigned int instr,
+ unsigned int sufx,
+ const struct pt_regs *regs)
+{
+   int ra, prefix_r;
+   unsigned int  dd;
+   unsigned long ea, d0, d1, d;
+
+   prefix_r = instr & (1ul << 20);
+   ra = (sufx >> 16) & 0x1f;
+
+   d0 = instr & 0x3ffff;
+   d1 = sufx & 0xffff;
+   d = (d0 << 16) | d1;
+
+   /*
+* sign extend a 34 bit number
+*/
+   dd = (unsigned int) (d >> 2);
+   ea = (signed int) dd;
+   ea = (ea << 2) | (d & 0x3);
+
+   if (!prefix_r && ra)
+   ea += regs->gpr[ra];
+   else if (!prefix_r && !ra)
+   ; /* Leave ea as is */
+   else if (prefix_r && !ra)
+   ea += regs->nip;
+   else if (prefix_r && ra)
+   ; /* Invalid form. Should already be checked for by caller! */
+
+   return ea;
+}
+
 /*
  * Return the largest power of 2, not greater than sizeof(unsigned long),
  * such that x is a multiple of it.
@@ -1166,6 +1203,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
  unsigned int instr, unsigned int sufx)
 {
unsigned int opcode, ra, rb, rc, rd, spr, u;
+   unsigned int sufxopcode, prefixtype, prefix_r;
unsigned long int imm;
unsigned long int val, val2;
unsigned int mb, me, sh;
@@ -2652,6 +2690,78 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 
}
 
+/*
+ * Prefixed instructions
+ */
+   switch (opcode) {
+   case 1:
+   prefix_r = instr & (1ul << 20);
+   ra = (sufx >> 16) & 0x1f;
+   op->update_reg = ra;
+   rd = (sufx >> 21) & 0x1f;
+   op->reg = rd;
+   op->val = regs->gpr[rd];
+
+   sufxopcode = sufx >> 26;
+   prefixtype = (instr >> 24) & 0x3;
+   switch (prefixtype) {
+   case 0: /* Type 00  Eight-Byte Load/Store */
+   if (prefix_r && ra)
+   break;
+   op->ea = mlsd_8lsd_ea(instr, sufx, regs);
+   switch (sufxopcode) {
+   case 41:/* plwa */
+   op->type = MKOP(LOAD, PREFIXED | SIGNEXT, 4);
+   break;
+   case 56:/* plq */
+   op->type = MKOP(LOAD, PREFIXED, 16);
+   break;
+   case 57:/* pld */
+   op->type = MKOP(LOAD, PREFIXED | SIGNEXT, 8);
+   break;
+   case 60:/* pstq */
+   op->type = MKOP(STORE, PREFIXED, 16);
+   break;
+   case 61:/* pstd */
+   op->type = MKOP(STORE, PREFIXED | SIGNEXT, 8);
+   break;
+   }
+   break;
+   case 1: /* Type 01 Modified Register-to-Register */
+   break;
+   case 2: /* Type 10 Modified Load/Store */
+   if (prefix_r && ra)
+   break;
+   op->ea = mlsd_8lsd_ea(instr, sufx, regs);
+   switch (sufxopcode) {
+   case 32:/* plwz */
+   op->type = MKOP(LOAD, PREFIXED, 4);
+   break;
+   case 34:/* plbz */
+   op->type = MKOP(LOAD, PREFIXED, 1);
+   break;
+   case 36:/* pstw */
+   op->type = MKOP(STORE, PREFIXED, 4);
+ 

[PATCH 05/18] powerpc sstep: Prepare to support prefixed instructions

2019-11-25 Thread Jordan Niethe
Currently all instructions are a single word long. A future ISA version
will include prefixed instructions which have a double word length. The
functions used for analysing and emulating instructions need to be
modified so that they can handle these new instruction types.

A prefixed instruction is a word prefix followed by a word suffix. All
prefixes uniquely have the primary op-code 1. Suffixes may be valid word
instructions or instructions that only exist as suffixes.

In handling prefixed instructions it will be convenient to treat the
suffix and prefix as separate words. To facilitate this, modify
analyse_instr() and emulate_step() to take a suffix as a parameter. For
word instructions it does not matter what is passed in here - it will
be ignored.

We also define a new flag, PREFIXED, to be used in instruction_op::type.
When emulating an analysed instruction this flag indicates whether the
NIP should be advanced by one word or two.
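
In practice a caller stepping past an emulated instruction can then do
(editor's sketch; the alignment handler patch later in the series
applies the same idea):

	regs->nip += (op->type & PREFIXED) ? 8 : 4;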

The callers of analyse_instr() and emulate_step() will need their own
changes to be able to support prefixed instructions. For now modify them
to pass in 0 as a suffix.

Note that at this point no prefixed instructions are emulated or
analysed - this is just making it possible to do so.

Signed-off-by: Jordan Niethe 
---
 arch/powerpc/include/asm/ppc-opcode.h |  3 +++
 arch/powerpc/include/asm/sstep.h  |  8 +--
 arch/powerpc/include/asm/uaccess.h| 30 +++
 arch/powerpc/kernel/align.c   |  2 +-
 arch/powerpc/kernel/hw_breakpoint.c   |  4 ++--
 arch/powerpc/kernel/kprobes.c |  2 +-
 arch/powerpc/kernel/mce_power.c   |  2 +-
 arch/powerpc/kernel/optprobes.c   |  2 +-
 arch/powerpc/kernel/uprobes.c |  2 +-
 arch/powerpc/kvm/emulate_loadstore.c  |  2 +-
 arch/powerpc/lib/sstep.c  | 12 ++-
 arch/powerpc/lib/test_emulate_step.c  | 30 +--
 arch/powerpc/xmon/xmon.c  |  4 ++--
 13 files changed, 71 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index c1df75edde44..a1dfa4bdd22f 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -377,6 +377,9 @@
 #define PPC_INST_VCMPEQUD  0x10c7
 #define PPC_INST_VCMPEQUB  0x1006
 
+/* macro to check if a word is a prefix */
+#define IS_PREFIX(x) (((x) >> 26) == 1)
+
 /* macros to insert fields into opcodes */
 #define ___PPC_RA(a)   (((a) & 0x1f) << 16)
 #define ___PPC_RB(b)   (((b) & 0x1f) << 11)
diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h
index 769f055509c9..6d4cb602e231 100644
--- a/arch/powerpc/include/asm/sstep.h
+++ b/arch/powerpc/include/asm/sstep.h
@@ -89,6 +89,9 @@ enum instruction_type {
 #define VSX_LDLEFT 4   /* load VSX register from left */
 #define VSX_CHECK_VEC  8   /* check MSR_VEC not MSR_VSX for reg >= 32 */
 
+/* Prefixed flag, ORed in with type */
+#define PREFIXED   0x800
+
 /* Size field in type word */
 #define SIZE(n)((n) << 12)
 #define GETSIZE(w) ((w) >> 12)
@@ -132,7 +135,7 @@ union vsx_reg {
  * otherwise.
  */
 extern int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
-unsigned int instr);
+unsigned int instr, unsigned int sufx);
 
 /*
  * Emulate an instruction that can be executed just by updating
@@ -149,7 +152,8 @@ void emulate_update_regs(struct pt_regs *reg, struct instruction_op *op);
  * 0 if it could not be emulated, or -1 for an instruction that
  * should not be emulated (rfid, mtmsrd clearing MSR_RI, etc.).
  */
-extern int emulate_step(struct pt_regs *regs, unsigned int instr);
+extern int emulate_step(struct pt_regs *regs, unsigned int instr,
+   unsigned int sufx);
 
 /*
  * Emulate a load or store instruction by reading/writing the
diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index 15002b51ff18..bc585399e0c7 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -423,4 +423,34 @@ extern long __copy_from_user_flushcache(void *dst, const void __user *src,
 extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
   size_t len);
 
+/*
+ * When reading an instruction, if it is a prefix, the suffix needs to be
+ * loaded as well.
+ */
+#define __get_user_instr(x, y, ptr)\
+({ \
+   long __gui_ret = 0; \
+   y = 0;  \
+   __gui_ret = __get_user(x, ptr); \
+   if (!__gui_ret) {   \
+   if (IS_PREFIX(x))   \
+   __gui_ret = __get_user(y, ptr + 1); \
+   }  

[PATCH 04/18] powerpc: Rename Bit 35 of SRR1 to indicate new purpose

2019-11-25 Thread Jordan Niethe
Bit 35 of SRR1 is called SRR1_ISI_N_OR_G. This name comes from it being
used to indicate that an ISI was due to the access being no-exec or
guarded. A future ISA version adds another purpose: the bit is also set
if there is an access to a cache-inhibited location by a prefixed
instruction. Rename SRR1_ISI_N_OR_G to SRR1_ISI_N_G_OR_CIP to reflect
this new role.

Signed-off-by: Jordan Niethe 
---
 arch/powerpc/include/asm/reg.h  | 2 +-
 arch/powerpc/kvm/book3s_hv_nested.c | 2 +-
 arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 0a6d39fb4769..d3d8212603cb 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -750,7 +750,7 @@
 #define SPRN_SRR0  0x01A   /* Save/Restore Register 0 */
 #define SPRN_SRR1  0x01B   /* Save/Restore Register 1 */
 #define   SRR1_ISI_NOPT		0x40000000 /* ISI: Not found in hash */
-#define   SRR1_ISI_N_OR_G	0x10000000 /* ISI: Access is no-exec or G */
+#define   SRR1_ISI_N_G_OR_CIP	0x10000000 /* ISI: Access is no-exec or G or CI for a prefixed instruction */
 #define   SRR1_ISI_PROT		0x08000000 /* ISI: Other protection fault */
 #define   SRR1_WAKEMASK		0x00380000 /* reason for wakeup */
 #define   SRR1_WAKEMASK_P8	0x003c0000 /* reason for wakeup on POWER8 and 9 */
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index cdf30c6eaf54..32798ee76f27 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -1169,7 +1169,7 @@ static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu,
} else if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
/* Can we execute? */
if (!gpte_p->may_execute) {
-   flags |= SRR1_ISI_N_OR_G;
+   flags |= SRR1_ISI_N_G_OR_CIP;
goto forward_to_l1;
}
} else {
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 220305454c23..b53a9f1c1a46 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -1260,7 +1260,7 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
status &= ~DSISR_NOHPTE;/* DSISR_NOHPTE == SRR1_ISI_NOPT */
if (!data) {
if (gr & (HPTE_R_N | HPTE_R_G))
-   return status | SRR1_ISI_N_OR_G;
+   return status | SRR1_ISI_N_G_OR_CIP;
if (!hpte_read_permission(pp, slb_v & key))
return status | SRR1_ISI_PROT;
} else if (status & DSISR_ISSTORE) {
-- 
2.20.1



[PATCH 03/18] powerpc: Add PREFIXED SRR1 bit for future ISA version

2019-11-25 Thread Jordan Niethe
Add the bit definition for exceptions caused by prefixed instructions.

Signed-off-by: Jordan Niethe 
---
 arch/powerpc/include/asm/reg.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 6f9fcc3d4c82..0a6d39fb4769 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -778,6 +778,7 @@
 
 #define   SRR1_MCE_MCP		0x00080000 /* Machine check signal caused interrupt */
 #define   SRR1_BOUNDARY		0x10000000 /* Prefixed instruction crosses 64-byte boundary */
+#define   SRR1_PREFIXED		0x20000000 /* Exception caused by prefixed instruction */
 
 #define SPRN_HSRR0 0x13A   /* Save/Restore Register 0 */
 #define SPRN_HSRR1 0x13B   /* Save/Restore Register 1 */
-- 
2.20.1



[PATCH 01/18] powerpc: Enable Prefixed Instructions

2019-11-25 Thread Jordan Niethe
From: Alistair Popple 

Prefixed instructions have their own FSCR bit which needs to be enabled
via a CPU feature. The kernel will save the FSCR for problem state but
it needs to be enabled initially.

Signed-off-by: Alistair Popple 
---
 arch/powerpc/include/asm/reg.h|  3 +++
 arch/powerpc/kernel/dt_cpu_ftrs.c | 23 +++
 2 files changed, 26 insertions(+)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 0b7900f194c8..521ecbe35507 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -397,6 +397,7 @@
 #define SPRN_RWMR  0x375   /* Region-Weighting Mode Register */
 
 /* HFSCR and FSCR bit numbers are the same */
+#define FSCR_PREFIX_LG 13  /* Enable Prefix Instructions */
 #define FSCR_SCV_LG12  /* Enable System Call Vectored */
 #define FSCR_MSGP_LG   10  /* Enable MSGP */
 #define FSCR_TAR_LG8   /* Enable Target Address Register */
@@ -408,11 +409,13 @@
 #define FSCR_VECVSX_LG 1   /* Enable VMX/VSX  */
 #define FSCR_FP_LG 0   /* Enable Floating Point */
 #define SPRN_FSCR  0x099   /* Facility Status & Control Register */
+#define   FSCR_PREFIX  __MASK(FSCR_PREFIX_LG)
 #define   FSCR_SCV __MASK(FSCR_SCV_LG)
 #define   FSCR_TAR __MASK(FSCR_TAR_LG)
 #define   FSCR_EBB __MASK(FSCR_EBB_LG)
 #define   FSCR_DSCR__MASK(FSCR_DSCR_LG)
 #define SPRN_HFSCR 0xbe/* HV=1 Facility Status & Control Register */
+#define   HFSCR_PREFIX __MASK(FSCR_PREFIX_LG)
 #define   HFSCR_MSGP   __MASK(FSCR_MSGP_LG)
 #define   HFSCR_TAR__MASK(FSCR_TAR_LG)
 #define   HFSCR_EBB__MASK(FSCR_EBB_LG)
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
index 180b3a5d1001..f5ca7dd8fbaf 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -553,6 +553,28 @@ static int __init feat_enable_large_ci(struct dt_cpu_feature *f)
return 1;
 }
 
+static int __init feat_enable_prefix(struct dt_cpu_feature *f)
+{
+   u64 fscr, hfscr;
+
+   if (f->usable_privilege & USABLE_HV) {
+   hfscr = mfspr(SPRN_HFSCR);
+   hfscr |= HFSCR_PREFIX;
+   mtspr(SPRN_HFSCR, hfscr);
+   }
+
+   if (f->usable_privilege & USABLE_OS) {
+   fscr = mfspr(SPRN_FSCR);
+   fscr |= FSCR_PREFIX;
+   mtspr(SPRN_FSCR, fscr);
+
+   if (f->usable_privilege & USABLE_PR)
+   current->thread.fscr |= FSCR_PREFIX;
+   }
+
+   return 1;
+}
+
 struct dt_cpu_feature_match {
const char *name;
int (*enable)(struct dt_cpu_feature *f);
@@ -626,6 +648,7 @@ static struct dt_cpu_feature_match __initdata
{"vector-binary128", feat_enable, 0},
{"vector-binary16", feat_enable, 0},
{"wait-v3", feat_enable, 0},
+   {"prefix-instructions", feat_enable_prefix, 0},
 };
 
 static bool __initdata using_dt_cpu_ftrs;
-- 
2.20.1



[PATCH 00/18] Initial Prefixed Instruction support

2019-11-25 Thread Jordan Niethe
A future revision of the ISA will introduce prefixed instructions. A
prefixed instruction is composed of a 4-byte prefix followed by a
4-byte suffix.

All prefixes have the major opcode 1. A prefix will never be a valid
word instruction. A suffix may be an existing word instruction or a new
instruction.

The new instruction formats are:
* Eight-Byte Load/Store Instructions
* Eight-Byte Register-to-Register Instructions
* Modified Load/Store Instructions
* Modified Register-to-Register Instructions

This series enables prefixed instructions and extends the instruction
emulation to support them. Then the places where prefixed instructions
might need to be emulated are updated.

A future series will add prefixed instruction support to guests running
in KVM.

Alistair Popple (1):
  powerpc: Enable Prefixed Instructions

Jordan Niethe (17):
  powerpc: Add BOUNDARY SRR1 bit for future ISA version
  powerpc: Add PREFIXED SRR1 bit for future ISA version
  powerpc: Rename Bit 35 of SRR1 to indicate new purpose
  powerpc sstep: Prepare to support prefixed instructions
  powerpc sstep: Add support for prefixed integer load/stores
  powerpc sstep: Add support for prefixed floating-point load/stores
  powerpc sstep: Add support for prefixed VSX load/stores
  powerpc sstep: Add support for prefixed fixed-point arithmetic
  powerpc: Support prefixed instructions in alignment handler
  powerpc/traps: Check for prefixed instructions in
facility_unavailable_exception()
  powerpc/xmon: Add initial support for prefixed instructions
  powerpc/xmon: Dump prefixed instructions
  powerpc/kprobes: Support kprobes on prefixed instructions
  powerpc/uprobes: Add support for prefixed instructions
  powerpc/hw_breakpoints: Initial support for prefixed instructions
  powerpc: Add prefix support to mce_find_instr_ea_and_pfn()
  powerpc/fault: Use analyse_instr() to check for store with updates to
sp

 arch/powerpc/include/asm/kprobes.h|   5 +-
 arch/powerpc/include/asm/ppc-opcode.h |   3 +
 arch/powerpc/include/asm/reg.h|   7 +-
 arch/powerpc/include/asm/sstep.h  |   8 +-
 arch/powerpc/include/asm/uaccess.h|  30 +
 arch/powerpc/include/asm/uprobes.h|  18 ++-
 arch/powerpc/kernel/align.c   |   8 +-
 arch/powerpc/kernel/dt_cpu_ftrs.c |  23 
 arch/powerpc/kernel/hw_breakpoint.c   |   8 +-
 arch/powerpc/kernel/kprobes.c |  46 +--
 arch/powerpc/kernel/mce_power.c   |   6 +-
 arch/powerpc/kernel/optprobes.c   |  31 +++--
 arch/powerpc/kernel/optprobes_head.S  |   6 +
 arch/powerpc/kernel/traps.c   |  18 ++-
 arch/powerpc/kernel/uprobes.c |   4 +-
 arch/powerpc/kvm/book3s_hv_nested.c   |   2 +-
 arch/powerpc/kvm/book3s_hv_rm_mmu.c   |   2 +-
 arch/powerpc/kvm/emulate_loadstore.c  |   2 +-
 arch/powerpc/lib/sstep.c  | 180 +-
 arch/powerpc/lib/test_emulate_step.c  |  30 ++---
 arch/powerpc/mm/fault.c   |  39 ++
 arch/powerpc/xmon/xmon.c  | 132 +++
 22 files changed, 490 insertions(+), 118 deletions(-)

-- 
2.20.1



[PATCH 02/18] powerpc: Add BOUNDARY SRR1 bit for future ISA version

2019-11-25 Thread Jordan Niethe
Add the bit definition for when the cause of an alignment exception is a
prefixed instruction that crosses a 64-byte boundary.

Signed-off-by: Jordan Niethe 
---
 arch/powerpc/include/asm/reg.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 521ecbe35507..6f9fcc3d4c82 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -777,6 +777,7 @@
 #define   SRR1_PROGADDR		0x00010000 /* SRR0 contains subsequent addr */
 
 #define   SRR1_MCE_MCP		0x00080000 /* Machine check signal caused interrupt */
+#define   SRR1_BOUNDARY		0x10000000 /* Prefixed instruction crosses 64-byte boundary */
 
 #define SPRN_HSRR0 0x13A   /* Save/Restore Register 0 */
 #define SPRN_HSRR1 0x13B   /* Save/Restore Register 1 */
-- 
2.20.1



Re: [PATCH v2] powerpc/kernel/sysfs: Add PMU_SYSFS config option to enable PMU SPRs sysfs file creation

2019-11-25 Thread Nageswara R Sastry

Kajol Jain  wrote on 13/11/2019 09:40:56 PM:

> From: Kajol Jain 
> To: linuxppc-dev@lists.ozlabs.org, m...@ellerman.id.au
> Cc: kj...@linux.ibm.com, a...@linux.vnet.ibm.com,
ma...@linux.vnet.ibm.com
> Date: 26/11/2019 10:16 AM
> Subject: [EXTERNAL] [PATCH v2] powerpc/kernel/sysfs: Add PMU_SYSFS
> config option to enable PMU SPRs sysfs file creation
>
> Many of the performance monitoring unit (PMU) SPRs are
> exposed in the sysfs. "perf" API is the primary interface to program
> PMU and collect counter data in the system. So expose these
> PMU SPRs in the absence of CONFIG_PERF_EVENTS.
>
> Patch adds a new CONFIG option 'CONFIG_PMU_SYSFS'. The new config
> option used in kernel/sysfs.c for PMU SPRs sysfs file creation and
> this new option is enabled only if 'CONFIG_PERF_EVENTS' option is
> disabled.
>
> Tested this patch with enable/disable CONFIG_PERF_EVENTS option
> in powernv and pseries machines.
> Also did compilation testing for different architecture include:
> x86, mips, mips64, alpha, arm. And with book3s_32.config option.
>
> Signed-off-by: Kajol Jain 
> ---
>  arch/powerpc/kernel/sysfs.c| 21 +
>  arch/powerpc/platforms/Kconfig.cputype |  8 
>  2 files changed, 29 insertions(+)

Tested-by: Nageswara R Sastry 
Tested using the following different scenarios:
1. CONFIG_PERF_EVENT - enabled, CONFIG_PMU_SYSFS - disabled,
RESULT: no sysfs files (mmcr*, pmc*) under /sys/bus/cpu/devices/cpu?/
2. CONFIG_PERF_EVENT - disabled, CONFIG_PMU_SYSFS - enabled,
RESULT: sysfs files (mmcr*, pmc*) present under /sys/bus/cpu/devices/cpu?/
3. CONFIG_PERF_EVENT - disabled, CONFIG_PMU_SYSFS - disabled,
RESULT: not possible, one of the two config options needs to be enabled.
4. CONFIG_PERF_EVENT - enabled, CONFIG_PMU_SYSFS - enabled,
RESULT: not possible, only one of the two config options can be enabled.

>
> ---
> Changelog:
> v1 -> v2
> - Added new config option 'PMU_SYSFS' for PMU SPR's creation
>   rather than using PERF_EVENTS config option directly and make
>   sure SPR's file creation only if 'CONFIG_PERF_EVENTS' disabled.
> ---
> diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
> index 80a676da11cb..b7c01f1ef236 100644
> --- a/arch/powerpc/kernel/sysfs.c
> +++ b/arch/powerpc/kernel/sysfs.c
> @@ -457,16 +457,21 @@ static ssize_t __used \
>
>  #if defined(CONFIG_PPC64)
>  #define HAS_PPC_PMC_CLASSIC   1
> +#ifdef CONFIG_PMU_SYSFS
>  #define HAS_PPC_PMC_IBM  1
> +#endif
>  #define HAS_PPC_PMC_PA6T   1
>  #elif defined(CONFIG_PPC_BOOK3S_32)
>  #define HAS_PPC_PMC_CLASSIC   1
> +#ifdef CONFIG_PMU_SYSFS
>  #define HAS_PPC_PMC_IBM  1
>  #define HAS_PPC_PMC_G4  1
>  #endif
> +#endif
>
>
>  #ifdef HAS_PPC_PMC_CLASSIC
> +#ifdef CONFIG_PMU_SYSFS
>  SYSFS_PMCSETUP(mmcr0, SPRN_MMCR0);
>  SYSFS_PMCSETUP(mmcr1, SPRN_MMCR1);
>  SYSFS_PMCSETUP(pmc1, SPRN_PMC1);
> @@ -485,6 +490,10 @@ SYSFS_PMCSETUP(pmc7, SPRN_PMC7);
>  SYSFS_PMCSETUP(pmc8, SPRN_PMC8);
>
>  SYSFS_PMCSETUP(mmcra, SPRN_MMCRA);
> +#endif /* CONFIG_PPC64 */
> +#endif /* CONFIG_PMU_SYSFS */
> +
> +#ifdef CONFIG_PPC64
>  SYSFS_SPRSETUP(purr, SPRN_PURR);
>  SYSFS_SPRSETUP(spurr, SPRN_SPURR);
>  SYSFS_SPRSETUP(pir, SPRN_PIR);
> @@ -495,7 +504,9 @@ SYSFS_SPRSETUP(tscr, SPRN_TSCR);
>enable write when needed with a separate function.
>Lets be conservative and default to pseries.
>  */
> +#ifdef CONFIG_PMU_SYSFS
>  static DEVICE_ATTR(mmcra, 0600, show_mmcra, store_mmcra);
> +#endif /* CONFIG_PMU_SYSFS */
>  static DEVICE_ATTR(spurr, 0400, show_spurr, NULL);
>  static DEVICE_ATTR(purr, 0400, show_purr, store_purr);
>  static DEVICE_ATTR(pir, 0400, show_pir, NULL);
> @@ -606,12 +617,14 @@ static void sysfs_create_dscr_default(void)
>  #endif /* CONFIG_PPC64 */
>
>  #ifdef HAS_PPC_PMC_PA6T
> +#ifdef CONFIG_PMU_SYSFS
>  SYSFS_PMCSETUP(pa6t_pmc0, SPRN_PA6T_PMC0);
>  SYSFS_PMCSETUP(pa6t_pmc1, SPRN_PA6T_PMC1);
>  SYSFS_PMCSETUP(pa6t_pmc2, SPRN_PA6T_PMC2);
>  SYSFS_PMCSETUP(pa6t_pmc3, SPRN_PA6T_PMC3);
>  SYSFS_PMCSETUP(pa6t_pmc4, SPRN_PA6T_PMC4);
>  SYSFS_PMCSETUP(pa6t_pmc5, SPRN_PA6T_PMC5);
> +#endif /* CONFIG_PMU_SYSFS */
>  #ifdef CONFIG_DEBUG_MISC
>  SYSFS_SPRSETUP(hid0, SPRN_HID0);
>  SYSFS_SPRSETUP(hid1, SPRN_HID1);
> @@ -644,6 +657,7 @@ SYSFS_SPRSETUP(tsr3, SPRN_PA6T_TSR3);
>  #endif /* CONFIG_DEBUG_MISC */
>  #endif /* HAS_PPC_PMC_PA6T */
>
> +#ifdef CONFIG_PMU_SYSFS
>  #ifdef HAS_PPC_PMC_IBM
>  static struct device_attribute ibm_common_attrs[] = {
> __ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0),
> @@ -671,9 +685,11 @@ static struct device_attribute classic_pmc_attrs[] =
{
> __ATTR(pmc8, 0600, show_pmc8, store_pmc8),
>  #endif
>  };
> +#endif /* CONFIG_PMU_SYSFS */
>
>  #ifdef HAS_PPC_PMC_PA6T
>  static struct device_attribute pa6t_attrs[] = {
> +#ifdef CONFIG_PMU_SYSFS
> __ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0),
> __ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1),
> __ATTR(pmc0, 0600, show_pa6t_pmc0, store_pa6t_pmc0),
> @@ -682,6 +698,7 @@ static struct device_attribute 

Re: [PATCH][v2] powerpc: Set right value of Speculation_Store_Bypass in /proc/<pid>/status

2019-11-25 Thread Michael Ellerman
Gustavo Walbon  writes:
> The issue showed the status of Speculation_Store_Bypass in
> /proc/<pid>/status as `unknown` on PowerPC systems.
>
> This patch fixes the checking of the mitigation status of speculation,
> so it can be reported as "not vulnerable", "globally mitigated" or
> "vulnerable".
>
> Link: https://github.com/linuxppc/issues/issues/255
>
> Changelog:
> Rebase on v5.4-rc8
>
> Signed-off-by: Gustavo Walbon 
> ---
>  arch/powerpc/kernel/security.c | 25 -
>  1 file changed, 24 insertions(+), 1 deletion(-)

On further thoughts I don't think this logic (which I suggested) is
right >:(

I commented on the issue:

  I think my original suggestion on this was wrong.
  
  Our mitigation is not global, ie. it's a barrier that must be used in
  the right location. We have kernel code to insert the barrier on
  kernel entry/exit, but that doesn't protect userspace against itself
  (ie. sandboxes).
  
  There's no way to express that with the current values as far as I can
  see.
  
  I think all we can do for now is:
  
  if stf_enabled_flush_types == STF_BARRIER_NONE:
return PR_SPEC_NOT_AFFECTED // "not vulnerable"
  else
return PR_SPEC_ENABLE // "vulnerable"
  
  To express the situation properly we'd need another value, something
  like PR_SPEC_MITIGATION_AVAILABLE (??) which says that there is a
  mitigation available but it must be used. That still has the problem
  that it doesn't tell userspace what the mitigation is, userspace would
  have to know.

cheers

> diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c
> index 7d4b2080a658..04e566026bbc 100644
> --- a/arch/powerpc/kernel/security.c
> +++ b/arch/powerpc/kernel/security.c
> @@ -14,7 +14,7 @@
>  #include 
>  #include 
>  #include 
> -
> +#include 
>  
>  u64 powerpc_security_features __read_mostly = SEC_FTR_DEFAULT;
>  
> @@ -344,6 +344,29 @@ ssize_t cpu_show_spec_store_bypass(struct device *dev, 
> struct device_attribute *
>   return sprintf(buf, "Vulnerable\n");
>  }
>  
> +static int ssb_prctl_get(struct task_struct *task)
> +{
> + if (stf_barrier) {
> + if (stf_enabled_flush_types == STF_BARRIER_NONE)
> + return PR_SPEC_NOT_AFFECTED;
> + else
> + return PR_SPEC_DISABLE;
> + } else
> + return PR_SPEC_DISABLE_NOEXEC;
> +
> + return -EINVAL;
> +}
> +
> +int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
> +{
> + switch (which) {
> + case PR_SPEC_STORE_BYPASS:
> + return ssb_prctl_get(task);
> + default:
> + return -ENODEV;
> + }
> +}
> +
>  #ifdef CONFIG_DEBUG_FS
>  static int stf_barrier_set(void *data, u64 val)
>  {
> -- 
> 2.19.1


[PATCH v2] of: unittest: fix memory leak in attach_node_and_children

2019-11-25 Thread Erhard Furtner
In attach_node_and_children() memory is allocated for full_name via
kasprintf(). If the condition of the first if statement is met, the
function returns early without freeing that memory. Add a kfree() to
fix the leak.
This has been detected with kmemleak:
Link: https://bugzilla.kernel.org/show_bug.cgi?id=205327

It looks like the leak was introduced by this commit:
Fixes: 5babefb7f7ab ("of: unittest: allow base devicetree to have symbol 
metadata")

Signed-off-by: Erhard Furtner 
Reviewed-by: Michael Ellerman 
Reviewed-by: Tyrel Datwyler 
---
Changes in v2:
  - Make the commit message more clearer.

 drivers/of/unittest.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c
index 92e895d86458..ca7823eef2b4 100644
--- a/drivers/of/unittest.c
+++ b/drivers/of/unittest.c
@@ -1146,8 +1146,10 @@ static void attach_node_and_children(struct device_node *np)
full_name = kasprintf(GFP_KERNEL, "%pOF", np);
 
if (!strcmp(full_name, "/__local_fixups__") ||
-   !strcmp(full_name, "/__fixups__"))
+   !strcmp(full_name, "/__fixups__")) {
+   kfree(full_name);
return;
+   }
 
dup = of_find_node_by_path(full_name);
kfree(full_name);
-- 
2.23.0



Re: [PATCH v2] powerpc: Fix Kconfig indentation

2019-11-25 Thread Michael Ellerman
On Thu, 2019-11-21 at 03:21:01 UTC, Krzysztof Kozlowski wrote:
> Adjust indentation from spaces to tab (+optional two spaces) as in
> coding style with command like:
>   $ sed -e 's/^/\t/' -i */Kconfig
> 
> Signed-off-by: Krzysztof Kozlowski 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/5f017a56aa5da7f646a858475d57730cd155c9f1

cheers


Re: [PATCH v5 1/3] powerpc: Don't add -mabi= flags when building with Clang

2019-11-25 Thread Michael Ellerman
On Tue, 2019-11-19 at 04:57:10 UTC, Nathan Chancellor wrote:
> When building pseries_defconfig, compiling vdso32 errors out:
> 
>   error: unknown target ABI 'elfv1'
> 
> This happens because -m32 in clang changes the target to 32-bit,
> which does not allow the ABI to be changed, as the setABI virtual
> function is not overridden:
> 
> https://github.com/llvm/llvm-project/blob/llvmorg-9.0.0/clang/include/clang/Basic/TargetInfo.h#L1073-L1078
> 
> https://github.com/llvm/llvm-project/blob/llvmorg-9.0.0/clang/lib/Basic/Targets/PPC.h#L327-L365
> 
> Commit 4dc831aa8813 ("powerpc: Fix compiling a BE kernel with a
> powerpc64le toolchain") added these flags to fix building big endian
> kernels with a little endian GCC.
> 
> Clang doesn't need -mabi because the target triple controls the default
> value. -mlittle-endian and -mbig-endian manipulate the triple into
> either powerpc64-* or powerpc64le-*, which properly sets the default
> ABI:
> 
> https://github.com/llvm/llvm-project/blob/llvmorg-9.0.0/clang/lib/Driver/Driver.cpp#L450-L463
> 
> https://github.com/llvm/llvm-project/blob/llvmorg-9.0.0/llvm/lib/Support/Triple.cpp#L1432-L1516
> 
> https://github.com/llvm/llvm-project/blob/llvmorg-9.0.0/clang/lib/Basic/Targets/PPC.h#L377-L383
> 
> Adding a debug print out in the PPC64TargetInfo constructor after line
> 383 above shows this:
> 
> $ echo | ./clang -E --target=powerpc64-linux -mbig-endian -o /dev/null -
> Default ABI: elfv1
> 
> $ echo | ./clang -E --target=powerpc64-linux -mlittle-endian -o /dev/null -
> Default ABI: elfv2
> 
> $ echo | ./clang -E --target=powerpc64le-linux -mbig-endian -o /dev/null -
> Default ABI: elfv1
> 
> $ echo | ./clang -E --target=powerpc64le-linux -mlittle-endian -o /dev/null -
> Default ABI: elfv2
> 
> Don't specify -mabi when building with clang, to avoid the build error
> with -m32 without changing any code generation.
> 
> -mcall-aixdesc is not an implemented flag in clang so it can be
> safely excluded as well, see commit 238abecde8ad ("powerpc: Don't
> use gcc specific options on clang").
> 
> pseries_defconfig successfully builds after this patch and
> powernv_defconfig and ppc44x_defconfig don't regress.
> 
> Link: https://github.com/ClangBuiltLinux/linux/issues/240
> Reviewed-by: Daniel Axtens 
> Signed-off-by: Nathan Chancellor 

Series applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/465bfd9c44dea6b55962b5788a23ac87a467c923

cheers


Re: [PATCH v1 1/4] powerpc/fixmap: don't clear fixmap area in paging_init()

2019-11-25 Thread Michael Ellerman
On Thu, 2019-09-12 at 13:49:41 UTC, Christophe Leroy wrote:
> fixmap is intended to map things permanently like the IMMR region on
> FSL SOC (8xx, 83xx, ...), so don't clear it in paging_init().
> 
> Signed-off-by: Christophe Leroy 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/f2bb86937d86ebcb0e52f95b6d19aba1d850e601

cheers


[PATCH net v2 3/4] ibmvnic: Bound waits for device queries

2019-11-25 Thread Thomas Falcon
Create a wrapper for wait_for_completion calls with additional
driver checks to ensure that the driver does not wait on a
disabled device. In those cases, or if the device does not respond
within an extended amount of time, this gives the driver an
opportunity to recover.

Signed-off-by: Thomas Falcon 
---
 drivers/net/ethernet/ibm/ibmvnic.c | 112 -
 1 file changed, 97 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index 78a3ef70f1ef..4504f96ee07d 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -159,6 +159,40 @@ static long h_reg_sub_crq(unsigned long unit_address, 
unsigned long token,
return rc;
 }
 
+/**
+ * ibmvnic_wait_for_completion - Check device state and wait for completion
+ * @adapter: private device data
+ * @comp_done: completion structure to wait for
+ * @timeout: time to wait in milliseconds
+ *
+ * Wait for a completion signal or until the timeout limit is reached
+ * while checking that the device is still active.
+ */
+static int ibmvnic_wait_for_completion(struct ibmvnic_adapter *adapter,
+  struct completion *comp_done,
+  unsigned long timeout)
+{
+   struct net_device *netdev;
+   unsigned long div_timeout;
+   u8 retry;
+
+   netdev = adapter->netdev;
+   retry = 5;
+   div_timeout = msecs_to_jiffies(timeout / retry);
+   while (true) {
+   if (!adapter->crq.active) {
+   netdev_err(netdev, "Device down!\n");
+   return -ENODEV;
+   }
+   if (!retry--)
+   break;
+   if (wait_for_completion_timeout(comp_done, div_timeout))
+   return 0;
+   }
+   netdev_err(netdev, "Operation timed out.\n");
+   return -ETIMEDOUT;
+}
+
 static int alloc_long_term_buff(struct ibmvnic_adapter *adapter,
struct ibmvnic_long_term_buff *ltb, int size)
 {
@@ -183,7 +217,15 @@ static int alloc_long_term_buff(struct ibmvnic_adapter 
*adapter,
dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr);
return rc;
}
-   wait_for_completion(&adapter->fw_done);
+
+   rc = ibmvnic_wait_for_completion(adapter, &adapter->fw_done, 10000);
+   if (rc) {
+   dev_err(dev,
+   "Long term map request aborted or timed out,rc = %d\n",
+   rc);
+   dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr);
+   return rc;
+   }
 
if (adapter->fw_done_rc) {
dev_err(dev, "Couldn't map long term buffer,rc = %d\n",
@@ -211,6 +253,7 @@ static void free_long_term_buff(struct ibmvnic_adapter 
*adapter,
 static int reset_long_term_buff(struct ibmvnic_adapter *adapter,
struct ibmvnic_long_term_buff *ltb)
 {
+   struct device *dev = &adapter->vdev->dev;
int rc;
 
memset(ltb->buff, 0, ltb->size);
@@ -219,10 +262,16 @@ static int reset_long_term_buff(struct ibmvnic_adapter 
*adapter,
rc = send_request_map(adapter, ltb->addr, ltb->size, ltb->map_id);
if (rc)
return rc;
-   wait_for_completion(&adapter->fw_done);
+
+   rc = ibmvnic_wait_for_completion(adapter, &adapter->fw_done, 10000);
+   if (rc) {
+   dev_info(dev,
+"Reset failed, long term map request timed out or 
aborted\n");
+   return rc;
+   }
 
if (adapter->fw_done_rc) {
-   dev_info(&adapter->vdev->dev,
+   dev_info(dev,
 "Reset failed, attempting to free and reallocate 
buffer\n");
free_long_term_buff(adapter, ltb);
return alloc_long_term_buff(adapter, ltb, ltb->size);
@@ -949,7 +998,12 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter *adapter)
rc = ibmvnic_send_crq(adapter, &crq);
if (rc)
return rc;
-   wait_for_completion(&adapter->fw_done);
+
+   rc = ibmvnic_wait_for_completion(adapter, &adapter->fw_done, 10000);
+   if (rc) {
+   dev_err(dev, "Could not retrieve VPD size, rc = %d\n", rc);
+   return rc;
+   }
 
if (!adapter->vpd->len)
return -ENODATA;
@@ -987,7 +1041,14 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter 
*adapter)
adapter->vpd->buff = NULL;
return rc;
}
-   wait_for_completion(&adapter->fw_done);
+
+   rc = ibmvnic_wait_for_completion(adapter, &adapter->fw_done, 10000);
+   if (rc) {
+   dev_err(dev, "Unable to retrieve VPD, rc = %d\n", rc);
+   kfree(adapter->vpd->buff);
+   adapter->vpd->buff = NULL;
+   return rc;
+   }
 
return 0;
 }
@@ -169

[PATCH net v2 4/4] ibmvnic: Serialize device queries

2019-11-25 Thread Thomas Falcon
Provide some serialization for device CRQ commands
and queries to ensure that the shared variable used for
storing return codes is properly synchronized.

Signed-off-by: Thomas Falcon 
---
 drivers/net/ethernet/ibm/ibmvnic.c | 54 ++
 drivers/net/ethernet/ibm/ibmvnic.h |  2 ++
 2 files changed, 51 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index 4504f96ee07d..42e15b31a5ff 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -210,11 +210,14 @@ static int alloc_long_term_buff(struct ibmvnic_adapter 
*adapter,
ltb->map_id = adapter->map_id;
adapter->map_id++;
 
+   mutex_lock(&adapter->fw_lock);
+   adapter->fw_done_rc = 0;
reinit_completion(&adapter->fw_done);
rc = send_request_map(adapter, ltb->addr,
  ltb->size, ltb->map_id);
if (rc) {
dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr);
+   mutex_unlock(&adapter->fw_lock);
return rc;
}
 
@@ -224,6 +227,7 @@ static int alloc_long_term_buff(struct ibmvnic_adapter 
*adapter,
"Long term map request aborted or timed out,rc = %d\n",
rc);
dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr);
+   mutex_unlock(&adapter->fw_lock);
return rc;
}
 
@@ -231,8 +235,10 @@ static int alloc_long_term_buff(struct ibmvnic_adapter 
*adapter,
dev_err(dev, "Couldn't map long term buffer,rc = %d\n",
adapter->fw_done_rc);
dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr);
+   mutex_unlock(&adapter->fw_lock);
return -1;
}
+   mutex_unlock(&adapter->fw_lock);
return 0;
 }
 
@@ -258,15 +264,21 @@ static int reset_long_term_buff(struct ibmvnic_adapter 
*adapter,
 
memset(ltb->buff, 0, ltb->size);
 
+   mutex_lock(&adapter->fw_lock);
+   adapter->fw_done_rc = 0;
+
reinit_completion(&adapter->fw_done);
rc = send_request_map(adapter, ltb->addr, ltb->size, ltb->map_id);
-   if (rc)
+   if (rc) {
+   mutex_unlock(&adapter->fw_lock);
return rc;
+   }
 
rc = ibmvnic_wait_for_completion(adapter, &adapter->fw_done, 10000);
if (rc) {
dev_info(dev,
 "Reset failed, long term map request timed out or 
aborted\n");
+   mutex_unlock(&adapter->fw_lock);
return rc;
}
 
@@ -274,8 +286,10 @@ static int reset_long_term_buff(struct ibmvnic_adapter 
*adapter,
dev_info(dev,
 "Reset failed, attempting to free and reallocate 
buffer\n");
free_long_term_buff(adapter, ltb);
+   mutex_unlock(&adapter->fw_lock);
return alloc_long_term_buff(adapter, ltb, ltb->size);
}
+   mutex_unlock(&adapter->fw_lock);
return 0;
 }
 
@@ -992,18 +1006,25 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter 
*adapter)
if (adapter->vpd->buff)
len = adapter->vpd->len;
 
+   mutex_lock(&adapter->fw_lock);
+   adapter->fw_done_rc = 0;
reinit_completion(&adapter->fw_done);
+
crq.get_vpd_size.first = IBMVNIC_CRQ_CMD;
crq.get_vpd_size.cmd = GET_VPD_SIZE;
rc = ibmvnic_send_crq(adapter, &crq);
-   if (rc)
+   if (rc) {
+   mutex_unlock(&adapter->fw_lock);
return rc;
+   }
 
rc = ibmvnic_wait_for_completion(adapter, &adapter->fw_done, 10000);
if (rc) {
dev_err(dev, "Could not retrieve VPD size, rc = %d\n", rc);
+   mutex_unlock(&adapter->fw_lock);
return rc;
}
+   mutex_unlock(&adapter->fw_lock);
 
if (!adapter->vpd->len)
return -ENODATA;
@@ -1030,7 +1051,10 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter 
*adapter)
return -ENOMEM;
}
 
+   mutex_lock(&adapter->fw_lock);
+   adapter->fw_done_rc = 0;
reinit_completion(&adapter->fw_done);
+
crq.get_vpd.first = IBMVNIC_CRQ_CMD;
crq.get_vpd.cmd = GET_VPD;
crq.get_vpd.ioba = cpu_to_be32(adapter->vpd->dma_addr);
@@ -1039,6 +1063,7 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter 
*adapter)
if (rc) {
kfree(adapter->vpd->buff);
adapter->vpd->buff = NULL;
+   mutex_unlock(&adapter->fw_lock);
return rc;
}
 
@@ -1047,9 +1072,11 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter 
*adapter)
dev_err(dev, "Unable to retrieve VPD, rc = %d\n", rc);
kfree(adapter->vpd->buff);
adapter->vpd->buff = NULL;
+   mutex_unlock(&adapter->fw_lock);
retu

[PATCH net v2 1/4] ibmvnic: Fix completion structure initialization

2019-11-25 Thread Thomas Falcon
Fix multiple calls to init_completion for device completion
structures. init_completion() re-initializes the completion's wait
queue as well as its done counter, which is not safe if another
thread may still be waiting on it. Instead, initialize the
completions once during device probe and reinitialize them with
reinit_completion(), which only resets the done counter, before
each new use.

Signed-off-by: Thomas Falcon 
---
 drivers/net/ethernet/ibm/ibmvnic.c | 19 +++
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index f59d9a8e35e2..48225297a5e2 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -176,7 +176,7 @@ static int alloc_long_term_buff(struct ibmvnic_adapter 
*adapter,
ltb->map_id = adapter->map_id;
adapter->map_id++;
 
-   init_completion(&adapter->fw_done);
+   reinit_completion(&adapter->fw_done);
rc = send_request_map(adapter, ltb->addr,
  ltb->size, ltb->map_id);
if (rc) {
@@ -215,7 +215,7 @@ static int reset_long_term_buff(struct ibmvnic_adapter 
*adapter,
 
memset(ltb->buff, 0, ltb->size);
 
-   init_completion(&adapter->fw_done);
+   reinit_completion(&adapter->fw_done);
rc = send_request_map(adapter, ltb->addr, ltb->size, ltb->map_id);
if (rc)
return rc;
@@ -943,7 +943,7 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter *adapter)
if (adapter->vpd->buff)
len = adapter->vpd->len;
 
-   init_completion(&adapter->fw_done);
+   reinit_completion(&adapter->fw_done);
crq.get_vpd_size.first = IBMVNIC_CRQ_CMD;
crq.get_vpd_size.cmd = GET_VPD_SIZE;
rc = ibmvnic_send_crq(adapter, &crq);
@@ -1689,7 +1689,7 @@ static int __ibmvnic_set_mac(struct net_device *netdev, 
u8 *dev_addr)
crq.change_mac_addr.cmd = CHANGE_MAC_ADDR;
ether_addr_copy(&crq.change_mac_addr.mac_addr[0], dev_addr);
 
-   init_completion(&adapter->fw_done);
+   reinit_completion(&adapter->fw_done);
rc = ibmvnic_send_crq(adapter, &crq);
if (rc) {
rc = -EIO;
@@ -2316,7 +2316,7 @@ static int wait_for_reset(struct ibmvnic_adapter *adapter)
adapter->fallback.rx_entries = adapter->req_rx_add_entries_per_subcrq;
adapter->fallback.tx_entries = adapter->req_tx_entries_per_subcrq;
 
-   init_completion(&adapter->reset_done);
+   reinit_completion(&adapter->reset_done);
adapter->wait_for_reset = true;
rc = ibmvnic_reset(adapter, VNIC_RESET_CHANGE_PARAM);
if (rc)
@@ -2332,7 +2332,7 @@ static int wait_for_reset(struct ibmvnic_adapter *adapter)
adapter->desired.rx_entries = adapter->fallback.rx_entries;
adapter->desired.tx_entries = adapter->fallback.tx_entries;
 
-   init_completion(&adapter->reset_done);
+   reinit_completion(&adapter->reset_done);
adapter->wait_for_reset = true;
rc = ibmvnic_reset(adapter, VNIC_RESET_CHANGE_PARAM);
if (rc)
@@ -2603,7 +2603,7 @@ static void ibmvnic_get_ethtool_stats(struct net_device 
*dev,
cpu_to_be32(sizeof(struct ibmvnic_statistics));
 
/* Wait for data to be written */
-   init_completion(&adapter->stats_done);
+   reinit_completion(&adapter->stats_done);
rc = ibmvnic_send_crq(adapter, &crq);
if (rc)
return;
@@ -4403,7 +4403,7 @@ static int send_query_phys_parms(struct ibmvnic_adapter 
*adapter)
memset(&crq, 0, sizeof(crq));
crq.query_phys_parms.first = IBMVNIC_CRQ_CMD;
crq.query_phys_parms.cmd = QUERY_PHYS_PARMS;
-   init_completion(&adapter->fw_done);
+   reinit_completion(&adapter->fw_done);
rc = ibmvnic_send_crq(adapter, &crq);
if (rc)
return rc;
@@ -4955,6 +4955,9 @@ static int ibmvnic_probe(struct vio_dev *dev, const 
struct vio_device_id *id)
INIT_LIST_HEAD(&adapter->rwi_list);
spin_lock_init(&adapter->rwi_lock);
init_completion(&adapter->init_done);
+   init_completion(&adapter->fw_done);
+   init_completion(&adapter->reset_done);
+   init_completion(&adapter->stats_done);
clear_bit(0, &adapter->resetting);
 
do {
-- 
2.12.3



[PATCH net v2 2/4] ibmvnic: Terminate waiting device threads after loss of service

2019-11-25 Thread Thomas Falcon
If we receive a notification that the device has been deactivated
or removed, force a completion of all waiting threads.

Signed-off-by: Thomas Falcon 
---
 drivers/net/ethernet/ibm/ibmvnic.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index 48225297a5e2..78a3ef70f1ef 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -4500,6 +4500,15 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq,
case IBMVNIC_CRQ_XPORT_EVENT:
netif_carrier_off(netdev);
adapter->crq.active = false;
+   /* terminate any thread waiting for a response
+* from the device
+*/
+   if (!completion_done(&adapter->fw_done)) {
+   adapter->fw_done_rc = -EIO;
+   complete(&adapter->fw_done);
+   }
+   if (!completion_done(&adapter->stats_done))
+   complete(&adapter->stats_done);
if (test_bit(0, &adapter->resetting))
adapter->force_reset_recovery = true;
if (gen_crq->cmd == IBMVNIC_PARTITION_MIGRATED) {
-- 
2.12.3



[PATCH net v2 0/4] ibmvnic: Harden device commands and queries

2019-11-25 Thread Thomas Falcon
This patch series fixes some shortcomings with the current
VNIC device command implementation. The first patch fixes
the initialization of driver completion structures used
for device commands. Additionally, all waits for device
commands are bounded with a timeout in the event that the
device does not respond or becomes inoperable. Finally,
serialize queries to retain the integrity of device return
codes.
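
Pieced together from the patches below, a device query then follows
this overall pattern (a rough sketch, not a literal excerpt):

  mutex_lock(&adapter->fw_lock);
  adapter->fw_done_rc = 0;
  reinit_completion(&adapter->fw_done);

  rc = ibmvnic_send_crq(adapter, &crq);
  if (!rc)
          /* bounded: -ENODEV if the device goes down, -ETIMEDOUT if
           * it never answers; on success adapter->fw_done_rc holds
           * the device's return code
           */
          rc = ibmvnic_wait_for_completion(adapter,
                                           &adapter->fw_done, 10000);
  mutex_unlock(&adapter->fw_lock);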

Changes in v2:

 - included header comment for ibmvnic_wait_for_completion
 - removed open-coded loop in patch 3/4, suggested by Jakub
 - ibmvnic_wait_for_completion accepts timeout value in milliseconds
   instead of jiffies
 - timeout calculations cleaned up and completed before wait loop
 - included missing mutex_destroy calls, suggested by Jakub
 - included comment before mutex declaration

Thomas Falcon (4):
  ibmvnic: Fix completion structure initialization
  ibmvnic: Terminate waiting device threads after loss of service
  ibmvnic: Bound waits for device queries
  ibmvnic: Serialize device queries

 drivers/net/ethernet/ibm/ibmvnic.c | 192 +++--
 drivers/net/ethernet/ibm/ibmvnic.h |   2 +
 2 files changed, 167 insertions(+), 27 deletions(-)

-- 
2.12.3



[PATCH v2 15/19] media/v4l2-core: pin_user_pages (FOLL_PIN) and put_user_page() conversion

2019-11-25 Thread John Hubbard
1. Change v4l2 from get_user_pages() to pin_user_pages().

2. Because all FOLL_PIN-acquired pages must be released via
put_user_page(), also convert the put_page() call over to
put_user_pages_dirty_lock().

Acked-by: Hans Verkuil 
Cc: Ira Weiny 
Signed-off-by: John Hubbard 
---
 drivers/media/v4l2-core/videobuf-dma-sg.c | 11 ---
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c 
b/drivers/media/v4l2-core/videobuf-dma-sg.c
index 28262190c3ab..162a2633b1e3 100644
--- a/drivers/media/v4l2-core/videobuf-dma-sg.c
+++ b/drivers/media/v4l2-core/videobuf-dma-sg.c
@@ -183,12 +183,12 @@ static int videobuf_dma_init_user_locked(struct 
videobuf_dmabuf *dma,
dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n",
data, size, dma->nr_pages);
 
-   err = get_user_pages(data & PAGE_MASK, dma->nr_pages,
+   err = pin_user_pages(data & PAGE_MASK, dma->nr_pages,
 flags | FOLL_LONGTERM, dma->pages, NULL);
 
if (err != dma->nr_pages) {
dma->nr_pages = (err >= 0) ? err : 0;
-   dprintk(1, "get_user_pages: err=%d [%d]\n", err,
+   dprintk(1, "pin_user_pages: err=%d [%d]\n", err,
dma->nr_pages);
return err < 0 ? err : -EINVAL;
}
@@ -349,11 +349,8 @@ int videobuf_dma_free(struct videobuf_dmabuf *dma)
BUG_ON(dma->sglen);
 
if (dma->pages) {
-   for (i = 0; i < dma->nr_pages; i++) {
-   if (dma->direction == DMA_FROM_DEVICE)
-   set_page_dirty_lock(dma->pages[i]);
-   put_page(dma->pages[i]);
-   }
+   put_user_pages_dirty_lock(dma->pages, dma->nr_pages,
+ dma->direction == DMA_FROM_DEVICE);
kfree(dma->pages);
dma->pages = NULL;
}
-- 
2.24.0



[PATCH v2 17/19] powerpc: book3s64: convert to pin_user_pages() and put_user_page()

2019-11-25 Thread John Hubbard
1. Convert from get_user_pages() to pin_user_pages().

2. As required by pin_user_pages(), release these pages via
put_user_page(). In this case, do so via put_user_pages_dirty_lock().

That has the side effect of calling set_page_dirty_lock(), instead
of set_page_dirty(). This is probably more accurate.

As Christoph Hellwig put it, "set_page_dirty() is only safe if we are
dealing with a file backed page where we have reference on the inode it
hangs off." [1]

[1] https://lore.kernel.org/r/20190723153640.gb...@lst.de

Cc: Jan Kara 
Signed-off-by: John Hubbard 
---
 arch/powerpc/mm/book3s64/iommu_api.c | 12 +---
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/iommu_api.c 
b/arch/powerpc/mm/book3s64/iommu_api.c
index 56cc84520577..fc1670a6fc3c 100644
--- a/arch/powerpc/mm/book3s64/iommu_api.c
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -103,7 +103,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, 
unsigned long ua,
for (entry = 0; entry < entries; entry += chunk) {
unsigned long n = min(entries - entry, chunk);
 
-   ret = get_user_pages(ua + (entry << PAGE_SHIFT), n,
+   ret = pin_user_pages(ua + (entry << PAGE_SHIFT), n,
FOLL_WRITE | FOLL_LONGTERM,
mem->hpages + entry, NULL);
if (ret == n) {
@@ -167,9 +167,8 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, 
unsigned long ua,
return 0;
 
 free_exit:
-   /* free the reference taken */
-   for (i = 0; i < pinned; i++)
-   put_page(mem->hpages[i]);
+   /* free the references taken */
+   put_user_pages(mem->hpages, pinned);
 
vfree(mem->hpas);
kfree(mem);
@@ -212,10 +211,9 @@ static void mm_iommu_unpin(struct 
mm_iommu_table_group_mem_t *mem)
if (!page)
continue;
 
-   if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY)
-   SetPageDirty(page);
+   put_user_pages_dirty_lock(&page, 1,
+   mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY);
 
-   put_page(page);
mem->hpas[i] = 0;
}
 }
-- 
2.24.0



[PATCH v2 19/19] mm, tree-wide: rename put_user_page*() to unpin_user_page*()

2019-11-25 Thread John Hubbard
In order to provide a clearer, more symmetric API for pinning
and unpinning DMA pages. This way, pin_user_pages*() calls
match up with unpin_user_pages*() calls, and the API is a lot
closer to being self-explanatory.
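
That is, an acquire/release cycle now reads (sketch):

  nr = pin_user_pages(start, nr_pages, gup_flags, pages, NULL);
  /* ... hardware does DMA to/from the pinned pages ... */
  unpin_user_pages(pages, nr);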

Reviewed-by: Jan Kara 
Signed-off-by: John Hubbard 
---
 Documentation/core-api/pin_user_pages.rst   |  2 +-
 arch/powerpc/mm/book3s64/iommu_api.c|  4 +--
 drivers/gpu/drm/via/via_dmablit.c   |  4 +--
 drivers/infiniband/core/umem.c  |  2 +-
 drivers/infiniband/hw/hfi1/user_pages.c |  2 +-
 drivers/infiniband/hw/mthca/mthca_memfree.c |  6 ++--
 drivers/infiniband/hw/qib/qib_user_pages.c  |  2 +-
 drivers/infiniband/hw/qib/qib_user_sdma.c   |  6 ++--
 drivers/infiniband/hw/usnic/usnic_uiom.c|  2 +-
 drivers/infiniband/sw/siw/siw_mem.c |  2 +-
 drivers/media/v4l2-core/videobuf-dma-sg.c   |  4 +--
 drivers/platform/goldfish/goldfish_pipe.c   |  4 +--
 drivers/vfio/vfio_iommu_type1.c |  2 +-
 fs/io_uring.c   |  4 +--
 include/linux/mm.h  | 26 -
 mm/gup.c| 32 ++---
 mm/process_vm_access.c  |  4 +--
 net/xdp/xdp_umem.c  |  2 +-
 18 files changed, 55 insertions(+), 55 deletions(-)

diff --git a/Documentation/core-api/pin_user_pages.rst 
b/Documentation/core-api/pin_user_pages.rst
index 4f26637a5005..bba96428ade7 100644
--- a/Documentation/core-api/pin_user_pages.rst
+++ b/Documentation/core-api/pin_user_pages.rst
@@ -220,7 +220,7 @@ since the system was booted, via two new /proc/vmstat 
entries: ::
 /proc/vmstat/nr_foll_pin_requested
 
 Those are both going to show zero, unless CONFIG_DEBUG_VM is set. This is
-because there is a noticeable performance drop in put_user_page(), when they
+because there is a noticeable performance drop in unpin_user_page(), when they
 are activated.
 
 References
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c 
b/arch/powerpc/mm/book3s64/iommu_api.c
index fc1670a6fc3c..b965a0dfd4a2 100644
--- a/arch/powerpc/mm/book3s64/iommu_api.c
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -168,7 +168,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, 
unsigned long ua,
 
 free_exit:
/* free the references taken */
-   put_user_pages(mem->hpages, pinned);
+   unpin_user_pages(mem->hpages, pinned);
 
vfree(mem->hpas);
kfree(mem);
@@ -211,7 +211,7 @@ static void mm_iommu_unpin(struct 
mm_iommu_table_group_mem_t *mem)
if (!page)
continue;
 
-   put_user_pages_dirty_lock(&page, 1,
+   unpin_user_pages_dirty_lock(&page, 1,
mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY);
 
mem->hpas[i] = 0;
diff --git a/drivers/gpu/drm/via/via_dmablit.c 
b/drivers/gpu/drm/via/via_dmablit.c
index 37c5e572993a..719d036c9384 100644
--- a/drivers/gpu/drm/via/via_dmablit.c
+++ b/drivers/gpu/drm/via/via_dmablit.c
@@ -188,8 +188,8 @@ via_free_sg_info(struct pci_dev *pdev, drm_via_sg_info_t 
*vsg)
kfree(vsg->desc_pages);
/* fall through */
case dr_via_pages_locked:
-   put_user_pages_dirty_lock(vsg->pages, vsg->num_pages,
- (vsg->direction == DMA_FROM_DEVICE));
+   unpin_user_pages_dirty_lock(vsg->pages, vsg->num_pages,
+  (vsg->direction == DMA_FROM_DEVICE));
/* fall through */
case dr_via_pages_alloc:
vfree(vsg->pages);
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 39a1542e6707..663b5c785716 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -54,7 +54,7 @@ static void __ib_umem_release(struct ib_device *dev, struct 
ib_umem *umem, int d
 
for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
page = sg_page_iter_page(&sg_iter);
-   put_user_pages_dirty_lock(&page, 1, umem->writable && dirty);
+   unpin_user_pages_dirty_lock(&page, 1, umem->writable && dirty);
}
 
sg_free_table(&umem->sg_head);
diff --git a/drivers/infiniband/hw/hfi1/user_pages.c 
b/drivers/infiniband/hw/hfi1/user_pages.c
index 9a94761765c0..3b505006c0a6 100644
--- a/drivers/infiniband/hw/hfi1/user_pages.c
+++ b/drivers/infiniband/hw/hfi1/user_pages.c
@@ -118,7 +118,7 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned 
long vaddr, size_t np
 void hfi1_release_user_pages(struct mm_struct *mm, struct page **p,
 size_t npages, bool dirty)
 {
-   put_user_pages_dirty_lock(p, npages, dirty);
+   unpin_user_pages_dirty_lock(p, npages, dirty);
 
if (mm) { /* during close after signal, mm can be NULL */
atomic64_sub(npages, &mm->pinned_vm);
diff --git a/drivers/infiniband/hw/mthca/mthca_mem

[PATCH v2 12/19] fs/io_uring: set FOLL_PIN via pin_user_pages()

2019-11-25 Thread John Hubbard
Convert fs/io_uring to use the new pin_user_pages() call, which sets
FOLL_PIN. Setting FOLL_PIN is now required for code that requires
tracking of pinned pages, and therefore for any code that calls
put_user_page().

In partial anticipation of this work, the io_uring code was already
calling put_user_page() instead of put_page(). Therefore, in order to
convert from the get_user_pages()/put_page() model, to the
pin_user_pages()/put_user_page() model, the only change required
here is to change get_user_pages() to pin_user_pages().

Reviewed-by: Jens Axboe 
Reviewed-by: Jan Kara 
Signed-off-by: John Hubbard 
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index e5bff60f61d6..869191d8f8d4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4239,7 +4239,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx 
*ctx, void __user *arg,
 
ret = 0;
down_read(¤t->mm->mmap_sem);
-   pret = get_user_pages(ubuf, nr_pages,
+   pret = pin_user_pages(ubuf, nr_pages,
  FOLL_WRITE | FOLL_LONGTERM,
  pages, vmas);
if (pret == nr_pages) {
-- 
2.24.0



[PATCH v2 18/19] mm/gup_benchmark: use proper FOLL_WRITE flags instead of hard-coding "1"

2019-11-25 Thread John Hubbard
Fix the gup benchmark flags to use the symbolic FOLL_WRITE,
instead of a hard-coded "1" value.

Also, clean up the filtering of gup flags a little, by just doing
it once before issuing any of the get_user_pages*() calls. This
makes it harder to overlook, instead of having little "gup_flags & 1"
phrases in the function calls.

Reviewed-by: Ira Weiny 
Signed-off-by: John Hubbard 
---
 mm/gup_benchmark.c | 9 ++---
 tools/testing/selftests/vm/gup_benchmark.c | 6 +-
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index 7dd602d7f8db..7fc44d25eca7 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -48,18 +48,21 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
nr = (next - addr) / PAGE_SIZE;
}
 
+   /* Filter out most gup flags: only allow a tiny subset here: */
+   gup->flags &= FOLL_WRITE;
+
switch (cmd) {
case GUP_FAST_BENCHMARK:
-   nr = get_user_pages_fast(addr, nr, gup->flags & 1,
+   nr = get_user_pages_fast(addr, nr, gup->flags,
 pages + i);
break;
case GUP_LONGTERM_BENCHMARK:
nr = get_user_pages(addr, nr,
-   (gup->flags & 1) | FOLL_LONGTERM,
+   gup->flags | FOLL_LONGTERM,
pages + i, NULL);
break;
case GUP_BENCHMARK:
-   nr = get_user_pages(addr, nr, gup->flags & 1, pages + i,
+   nr = get_user_pages(addr, nr, gup->flags, pages + i,
NULL);
break;
default:
diff --git a/tools/testing/selftests/vm/gup_benchmark.c 
b/tools/testing/selftests/vm/gup_benchmark.c
index 485cf06ef013..389327e9b30a 100644
--- a/tools/testing/selftests/vm/gup_benchmark.c
+++ b/tools/testing/selftests/vm/gup_benchmark.c
@@ -18,6 +18,9 @@
 #define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark)
 #define GUP_BENCHMARK  _IOWR('g', 3, struct gup_benchmark)
 
+/* Just the flags we need, copied from mm.h: */
+#define FOLL_WRITE 0x01 /* check pte is writable */
+
 struct gup_benchmark {
__u64 get_delta_usec;
__u64 put_delta_usec;
@@ -85,7 +88,8 @@ int main(int argc, char **argv)
}
 
gup.nr_pages_per_call = nr_pages;
-   gup.flags = write;
+   if (write)
+   gup.flags |= FOLL_WRITE;
 
fd = open("/sys/kernel/debug/gup_benchmark", O_RDWR);
if (fd == -1)
-- 
2.24.0



[PATCH v2 16/19] vfio, mm: pin_user_pages (FOLL_PIN) and put_user_page() conversion

2019-11-25 Thread John Hubbard
1. Change vfio from get_user_pages_remote(), to
pin_user_pages_remote().

2. Because all FOLL_PIN-acquired pages must be released via
put_user_page(), also convert the put_page() call over to
put_user_pages_dirty_lock().

Note that this effectively changes the code's behavior in
vfio_iommu_type1.c: put_pfn(): it now ultimately calls
set_page_dirty_lock(), instead of set_page_dirty(). This is
probably more accurate.

As Christoph Hellwig put it, "set_page_dirty() is only safe if we are
dealing with a file backed page where we have reference on the inode it
hangs off." [1]

[1] https://lore.kernel.org/r/20190723153640.gb...@lst.de

Tested-by: Alex Williamson 
Acked-by: Alex Williamson 
Signed-off-by: John Hubbard 
---
 drivers/vfio/vfio_iommu_type1.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index b800fc9a0251..18bfc2fc8e6d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -309,9 +309,8 @@ static int put_pfn(unsigned long pfn, int prot)
 {
if (!is_invalid_reserved_pfn(pfn)) {
struct page *page = pfn_to_page(pfn);
-   if (prot & IOMMU_WRITE)
-   SetPageDirty(page);
-   put_page(page);
+
+   put_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE);
return 1;
}
return 0;
@@ -329,7 +328,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned 
long vaddr,
flags |= FOLL_WRITE;
 
down_read(&mm->mmap_sem);
-   ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags | FOLL_LONGTERM,
+   ret = pin_user_pages_remote(NULL, mm, vaddr, 1, flags | FOLL_LONGTERM,
page, NULL, NULL);
if (ret == 1) {
*pfn = page_to_pfn(page[0]);
-- 
2.24.0



[PATCH v2 14/19] media/v4l2-core: set pages dirty upon releasing DMA buffers

2019-11-25 Thread John Hubbard
After DMA is complete, and the device and CPU caches are synchronized,
it's still required to mark the CPU pages as dirty, if the data was
coming from the device. However, this driver was just issuing a
bare put_page() call, without any set_page_dirty*() call.

Fix the problem, by calling set_page_dirty_lock() if the CPU pages
were potentially receiving data from the device.

Reviewed-by: Christoph Hellwig 
Acked-by: Hans Verkuil 
Cc: Mauro Carvalho Chehab 
Cc: 
Signed-off-by: John Hubbard 
---
 drivers/media/v4l2-core/videobuf-dma-sg.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c 
b/drivers/media/v4l2-core/videobuf-dma-sg.c
index 66a6c6c236a7..28262190c3ab 100644
--- a/drivers/media/v4l2-core/videobuf-dma-sg.c
+++ b/drivers/media/v4l2-core/videobuf-dma-sg.c
@@ -349,8 +349,11 @@ int videobuf_dma_free(struct videobuf_dmabuf *dma)
BUG_ON(dma->sglen);
 
if (dma->pages) {
-   for (i = 0; i < dma->nr_pages; i++)
+   for (i = 0; i < dma->nr_pages; i++) {
+   if (dma->direction == DMA_FROM_DEVICE)
+   set_page_dirty_lock(dma->pages[i]);
put_page(dma->pages[i]);
+   }
kfree(dma->pages);
dma->pages = NULL;
}
-- 
2.24.0



[PATCH v2 09/19] IB/{core, hw, umem}: set FOLL_PIN via pin_user_pages*(), fix up ODP

2019-11-25 Thread John Hubbard
Convert infiniband to use the new pin_user_pages*() calls.

Also, revert earlier changes to Infiniband ODP that had it using
put_user_page(). ODP is "Case 3" in
Documentation/core-api/pin_user_pages.rst, which is to say, normal
get_user_pages() and put_page() is the API to use there.

The new pin_user_pages*() calls replace corresponding get_user_pages*()
calls, and set the FOLL_PIN flag. The FOLL_PIN flag requires that the
caller must return the pages via put_user_page*() calls, but infiniband
was already doing that as part of an earlier commit.

Reviewed-by: Jason Gunthorpe 
Signed-off-by: John Hubbard 
---
 drivers/infiniband/core/umem.c  |  2 +-
 drivers/infiniband/core/umem_odp.c  | 13 ++---
 drivers/infiniband/hw/hfi1/user_pages.c |  2 +-
 drivers/infiniband/hw/mthca/mthca_memfree.c |  2 +-
 drivers/infiniband/hw/qib/qib_user_pages.c  |  2 +-
 drivers/infiniband/hw/qib/qib_user_sdma.c   |  2 +-
 drivers/infiniband/hw/usnic/usnic_uiom.c|  2 +-
 drivers/infiniband/sw/siw/siw_mem.c |  2 +-
 8 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 7a3b99597ead..39a1542e6707 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -267,7 +267,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, 
unsigned long addr,
 
while (npages) {
down_read(&mm->mmap_sem);
-   ret = get_user_pages(cur_base,
+   ret = pin_user_pages(cur_base,
 min_t(unsigned long, npages,
   PAGE_SIZE / sizeof (struct page *)),
 gup_flags | FOLL_LONGTERM,
diff --git a/drivers/infiniband/core/umem_odp.c 
b/drivers/infiniband/core/umem_odp.c
index e42d44e501fd..abc3bb6578cc 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -308,9 +308,8 @@ EXPORT_SYMBOL(ib_umem_odp_release);
  * The function returns -EFAULT if the DMA mapping operation fails. It returns
  * -EAGAIN if a concurrent invalidation prevents us from updating the page.
  *
- * The page is released via put_user_page even if the operation failed. For
- * on-demand pinning, the page is released whenever it isn't stored in the
- * umem.
+ * The page is released via put_page even if the operation failed. For 
on-demand
+ * pinning, the page is released whenever it isn't stored in the umem.
  */
 static int ib_umem_odp_map_dma_single_page(
struct ib_umem_odp *umem_odp,
@@ -363,7 +362,7 @@ static int ib_umem_odp_map_dma_single_page(
}
 
 out:
-   put_user_page(page);
+   put_page(page);
return ret;
 }
 
@@ -473,7 +472,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, 
u64 user_virt,
ret = -EFAULT;
break;
}
-   put_user_page(local_page_list[j]);
+   put_page(local_page_list[j]);
continue;
}
 
@@ -500,8 +499,8 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, 
u64 user_virt,
 * ib_umem_odp_map_dma_single_page().
 */
if (npages - (j + 1) > 0)
-   put_user_pages(&local_page_list[j+1],
-  npages - (j + 1));
+   release_pages(&local_page_list[j+1],
+ npages - (j + 1));
break;
}
}
diff --git a/drivers/infiniband/hw/hfi1/user_pages.c 
b/drivers/infiniband/hw/hfi1/user_pages.c
index 469acb961fbd..9a94761765c0 100644
--- a/drivers/infiniband/hw/hfi1/user_pages.c
+++ b/drivers/infiniband/hw/hfi1/user_pages.c
@@ -106,7 +106,7 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned 
long vaddr, size_t np
int ret;
unsigned int gup_flags = FOLL_LONGTERM | (writable ? FOLL_WRITE : 0);
 
-   ret = get_user_pages_fast(vaddr, npages, gup_flags, pages);
+   ret = pin_user_pages_fast(vaddr, npages, gup_flags, pages);
if (ret < 0)
return ret;
 
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c 
b/drivers/infiniband/hw/mthca/mthca_memfree.c
index edccfd6e178f..8269ab040c21 100644
--- a/drivers/infiniband/hw/mthca/mthca_memfree.c
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -472,7 +472,7 @@ int mthca_map_user_db(struct mthca_dev *dev, struct 
mthca_uar *uar,
goto out;
}
 
-   ret = get_user_pages_fast(uaddr & PAGE_MASK, 1,
+   ret = pin_user_pages_fast(uaddr & PAGE_MASK, 1,
  FOLL_WRITE | FOLL_LONGTERM, pages);
if (ret < 0)
goto out;
diff --git a/drivers/infinib

[PATCH v2 11/19] drm/via: set FOLL_PIN via pin_user_pages_fast()

2019-11-25 Thread John Hubbard
Convert drm/via to use the new pin_user_pages_fast() call, which sets
FOLL_PIN. Setting FOLL_PIN is now required for code that requires
tracking of pinned pages, and therefore for any code that calls
put_user_page().

In partial anticipation of this work, the drm/via driver was already
calling put_user_page() instead of put_page(). Therefore, in order to
convert from the get_user_pages()/put_page() model, to the
pin_user_pages()/put_user_page() model, the only change required
is to change get_user_pages() to pin_user_pages().

Acked-by: Daniel Vetter 
Reviewed-by: Jérôme Glisse 
Reviewed-by: Ira Weiny 
Signed-off-by: John Hubbard 
---
 drivers/gpu/drm/via/via_dmablit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/via/via_dmablit.c 
b/drivers/gpu/drm/via/via_dmablit.c
index 3db000aacd26..37c5e572993a 100644
--- a/drivers/gpu/drm/via/via_dmablit.c
+++ b/drivers/gpu/drm/via/via_dmablit.c
@@ -239,7 +239,7 @@ via_lock_all_dma_pages(drm_via_sg_info_t *vsg,  
drm_via_dmablit_t *xfer)
vsg->pages = vzalloc(array_size(sizeof(struct page *), vsg->num_pages));
if (NULL == vsg->pages)
return -ENOMEM;
-   ret = get_user_pages_fast((unsigned long)xfer->mem_addr,
+   ret = pin_user_pages_fast((unsigned long)xfer->mem_addr,
vsg->num_pages,
vsg->direction == DMA_FROM_DEVICE ? FOLL_WRITE : 0,
vsg->pages);
-- 
2.24.0



[PATCH v2 13/19] net/xdp: set FOLL_PIN via pin_user_pages()

2019-11-25 Thread John Hubbard
Convert net/xdp to use the new pin_user_pages() call, which sets
FOLL_PIN. Setting FOLL_PIN is now required for code that requires
tracking of pinned pages.

In partial anticipation of this work, the net/xdp code was already
calling put_user_page() instead of put_page(). Therefore, in order to
convert from the get_user_pages()/put_page() model, to the
pin_user_pages()/put_user_page() model, the only change required
here is to change get_user_pages() to pin_user_pages().

Acked-by: Björn Töpel 
Signed-off-by: John Hubbard 
---
 net/xdp/xdp_umem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 3049af269fbf..d071003b5e76 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -291,7 +291,7 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem)
return -ENOMEM;
 
down_read(¤t->mm->mmap_sem);
-   npgs = get_user_pages(umem->address, umem->npgs,
+   npgs = pin_user_pages(umem->address, umem->npgs,
  gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL);
up_read(¤t->mm->mmap_sem);
 
-- 
2.24.0



[PATCH v2 05/19] mm: fix get_user_pages_remote()'s handling of FOLL_LONGTERM

2019-11-25 Thread John Hubbard
As it says in the updated comment in gup.c: current FOLL_LONGTERM
behavior is incompatible with FAULT_FLAG_ALLOW_RETRY because of the
FS DAX check requirement on vmas.

However, the corresponding restriction in get_user_pages_remote() was
slightly stricter than is actually required: it forbade all
FOLL_LONGTERM callers, but we can actually allow FOLL_LONGTERM callers
that do not set the "locked" arg.

Update the code and comments to loosen the restriction, allowing
FOLL_LONGTERM in some cases.

Also, copy the DAX check ("if a VMA is DAX, don't allow long term
pinning") from the VFIO call site, all the way into the internals
of get_user_pages_remote() and __gup_longterm_locked(). That is:
get_user_pages_remote() calls __gup_longterm_locked(), which in turn
calls check_dax_vmas(). This check will then be removed from the VFIO
call site in a subsequent patch.
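
In other words, a remote FOLL_LONGTERM call is now accepted as long
as the caller leaves the "locked" arg NULL, e.g. (a sketch of the
VFIO call site as converted in the next patch):

  ret = get_user_pages_remote(NULL, mm, vaddr, 1,
                              flags | FOLL_LONGTERM,
                              page, NULL, NULL);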

Thanks to Jason Gunthorpe for pointing out a clean way to fix this,
and to Dan Williams for helping clarify the DAX refactoring.

Tested-by: Alex Williamson 
Acked-by: Alex Williamson 
Reviewed-by: Jason Gunthorpe 
Reviewed-by: Ira Weiny 
Suggested-by: Jason Gunthorpe 
Cc: Dan Williams 
Cc: Jerome Glisse 
Signed-off-by: John Hubbard 
---
 mm/gup.c | 27 ++-
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index 3ecce297a47f..c0c56888e7cc 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -29,6 +29,13 @@ struct follow_page_context {
unsigned int page_mask;
 };
 
+static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ unsigned int flags);
 /*
  * Return the compound head page with ref appropriately incremented,
  * or NULL if that failed.
@@ -1179,13 +1186,23 @@ long get_user_pages_remote(struct task_struct *tsk, 
struct mm_struct *mm,
struct vm_area_struct **vmas, int *locked)
 {
/*
-* FIXME: Current FOLL_LONGTERM behavior is incompatible with
+* Parts of FOLL_LONGTERM behavior are incompatible with
 * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
-* vmas.  As there are no users of this flag in this call we simply
-* disallow this option for now.
+* vmas. However, this only comes up if locked is set, and there are
+* callers that do request FOLL_LONGTERM, but do not set locked. So,
+* allow what we can.
 */
-   if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
-   return -EINVAL;
+   if (gup_flags & FOLL_LONGTERM) {
+   if (WARN_ON_ONCE(locked))
+   return -EINVAL;
+   /*
+* This will check the vmas (even if our vmas arg is NULL)
+* and return -ENOTSUPP if DAX isn't allowed in this case:
+*/
+   return __gup_longterm_locked(tsk, mm, start, nr_pages, pages,
+vmas, gup_flags | FOLL_TOUCH |
+FOLL_REMOTE);
+   }
 
return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
   locked,
-- 
2.24.0



[PATCH v2 10/19] mm/process_vm_access: set FOLL_PIN via pin_user_pages_remote()

2019-11-25 Thread John Hubbard
Convert process_vm_access to use the new pin_user_pages_remote()
call, which sets FOLL_PIN. Setting FOLL_PIN is now required for
code that requires tracking of pinned pages.

Also, release the pages via put_user_page*().

Also, rename "pages" to "pinned_pages", as this makes for
easier reading of process_vm_rw_single_vec().

Reviewed-by: Jan Kara 
Reviewed-by: Jérôme Glisse 
Reviewed-by: Ira Weiny 
Signed-off-by: John Hubbard 
---
 mm/process_vm_access.c | 28 +++-
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 357aa7bef6c0..fd20ab675b85 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -42,12 +42,11 @@ static int process_vm_rw_pages(struct page **pages,
if (copy > len)
copy = len;
 
-   if (vm_write) {
+   if (vm_write)
copied = copy_page_from_iter(page, offset, copy, iter);
-   set_page_dirty_lock(page);
-   } else {
+   else
copied = copy_page_to_iter(page, offset, copy, iter);
-   }
+
len -= copied;
if (copied < copy && iov_iter_count(iter))
return -EFAULT;
@@ -96,7 +95,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
flags |= FOLL_WRITE;
 
while (!rc && nr_pages && iov_iter_count(iter)) {
-   int pages = min(nr_pages, max_pages_per_loop);
+   int pinned_pages = min(nr_pages, max_pages_per_loop);
int locked = 1;
size_t bytes;
 
@@ -106,14 +105,15 @@ static int process_vm_rw_single_vec(unsigned long addr,
 * current/current->mm
 */
down_read(&mm->mmap_sem);
-   pages = get_user_pages_remote(task, mm, pa, pages, flags,
- process_pages, NULL, &locked);
+   pinned_pages = pin_user_pages_remote(task, mm, pa, pinned_pages,
+flags, process_pages,
+NULL, &locked);
if (locked)
up_read(&mm->mmap_sem);
-   if (pages <= 0)
+   if (pinned_pages <= 0)
return -EFAULT;
 
-   bytes = pages * PAGE_SIZE - start_offset;
+   bytes = pinned_pages * PAGE_SIZE - start_offset;
if (bytes > len)
bytes = len;
 
@@ -122,10 +122,12 @@ static int process_vm_rw_single_vec(unsigned long addr,
 vm_write);
len -= bytes;
start_offset = 0;
-   nr_pages -= pages;
-   pa += pages * PAGE_SIZE;
-   while (pages)
-   put_page(process_pages[--pages]);
+   nr_pages -= pinned_pages;
+   pa += pinned_pages * PAGE_SIZE;
+
+   /* If vm_write is set, the pages need to be made dirty: */
+   put_user_pages_dirty_lock(process_pages, pinned_pages,
+ vm_write);
}
 
return rc;
-- 
2.24.0



[PATCH v2 08/19] goldish_pipe: convert to pin_user_pages() and put_user_page()

2019-11-25 Thread John Hubbard
1. Call the new global pin_user_pages_fast(), from goldfish_pin_pages().

2. As required by pin_user_pages(), release these pages via
put_user_page(). In this case, do so via put_user_pages_dirty_lock().

That has the side effect of calling set_page_dirty_lock(), instead
of set_page_dirty(). This is probably more accurate.

As Christoph Hellwig put it, "set_page_dirty() is only safe if we are
dealing with a file backed page where we have reference on the inode it
hangs off." [1]

Another side effect is that the release code is simplified because
the page[] loop is now in gup.c instead of here, so just delete the
local release_user_pages() entirely, and call
put_user_pages_dirty_lock() directly, instead.

[1] https://lore.kernel.org/r/20190723153640.gb...@lst.de

Reviewed-by: Jan Kara 
Reviewed-by: Ira Weiny 
Signed-off-by: John Hubbard 
---
 drivers/platform/goldfish/goldfish_pipe.c | 17 +++--
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/drivers/platform/goldfish/goldfish_pipe.c 
b/drivers/platform/goldfish/goldfish_pipe.c
index ef50c264db71..2a5901efecde 100644
--- a/drivers/platform/goldfish/goldfish_pipe.c
+++ b/drivers/platform/goldfish/goldfish_pipe.c
@@ -274,7 +274,7 @@ static int goldfish_pin_pages(unsigned long first_page,
*iter_last_page_size = last_page_size;
}
 
-   ret = get_user_pages_fast(first_page, requested_pages,
+   ret = pin_user_pages_fast(first_page, requested_pages,
  !is_write ? FOLL_WRITE : 0,
  pages);
if (ret <= 0)
@@ -285,18 +285,6 @@ static int goldfish_pin_pages(unsigned long first_page,
return ret;
 }
 
-static void release_user_pages(struct page **pages, int pages_count,
-  int is_write, s32 consumed_size)
-{
-   int i;
-
-   for (i = 0; i < pages_count; i++) {
-   if (!is_write && consumed_size > 0)
-   set_page_dirty(pages[i]);
-   put_page(pages[i]);
-   }
-}
-
 /* Populate the call parameters, merging adjacent pages together */
 static void populate_rw_params(struct page **pages,
   int pages_count,
@@ -372,7 +360,8 @@ static int transfer_max_buffers(struct goldfish_pipe *pipe,
 
*consumed_size = pipe->command_buffer->rw_params.consumed_size;
 
-   release_user_pages(pipe->pages, pages_count, is_write, *consumed_size);
+   put_user_pages_dirty_lock(pipe->pages, pages_count,
+ !is_write && *consumed_size > 0);
 
mutex_unlock(&pipe->lock);
return 0;
-- 
2.24.0



[PATCH v2 07/19] mm/gup: introduce pin_user_pages*() and FOLL_PIN

2019-11-25 Thread John Hubbard
Introduce pin_user_pages*() variations of the corresponding
get_user_pages*() calls.

For now, these are placeholder calls, until the various call sites
are converted to use the correct get_user_pages*() or
pin_user_pages*() API.

These variants will eventually all set FOLL_PIN, which is also
introduced, and thoroughly documented.

pin_user_pages()
pin_user_pages_remote()
pin_user_pages_fast()

All pages that are pinned via the above calls, must be unpinned via
put_user_page().

The underlying rules are:

* FOLL_PIN is a gup-internal flag, so the call sites should not directly
set it. That behavior is enforced with assertions.

* Call sites that want to indicate that they are going to do DirectIO
  ("DIO") or something with similar characteristics, should call a
  get_user_pages()-like wrapper call that sets FOLL_PIN. These wrappers
  will:
* Start with "pin_user_pages" instead of "get_user_pages". That
  makes it easy to find and audit the call sites.
* Set FOLL_PIN

* For pages that are received via FOLL_PIN, those pages must be returned
  via put_user_page().
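
Putting those rules together, a typical call site looks roughly like
this (sketch only; the 16-page batch and error handling are
arbitrary):

  struct page *pages[16];
  int i, nr;

  nr = pin_user_pages_fast(addr, 16, FOLL_WRITE, pages);
  if (nr <= 0)
          return nr ? nr : -EFAULT;

  /* ... DMA to/from the pinned pages ... */

  for (i = 0; i < nr; i++)
          put_user_page(pages[i]);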

Thanks to Jan Kara and Vlastimil Babka for explaining the 4 cases
in this documentation. (I've reworded it and expanded upon it.)

Reviewed-by: Jan Kara 
Reviewed-by: Mike Rapoport   # Documentation
Reviewed-by: Jérôme Glisse 
Cc: Jonathan Corbet 
Cc: Ira Weiny 
Signed-off-by: John Hubbard 
---
 Documentation/core-api/index.rst  |   1 +
 Documentation/core-api/pin_user_pages.rst | 233 ++
 include/linux/mm.h|  63 --
 mm/gup.c  | 161 +--
 4 files changed, 424 insertions(+), 34 deletions(-)
 create mode 100644 Documentation/core-api/pin_user_pages.rst

diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index ab0eae1c153a..413f7d7c8642 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -31,6 +31,7 @@ Core utilities
generic-radix-tree
memory-allocation
mm-api
+   pin_user_pages
gfp_mask-from-fs-io
timekeeping
boot-time-mm
diff --git a/Documentation/core-api/pin_user_pages.rst 
b/Documentation/core-api/pin_user_pages.rst
new file mode 100644
index ..4f26637a5005
--- /dev/null
+++ b/Documentation/core-api/pin_user_pages.rst
@@ -0,0 +1,233 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==================================
+pin_user_pages() and related calls
+==================================
+
+.. contents:: :local:
+
+Overview
+========
+
+This document describes the following functions: ::
+
+ pin_user_pages
+ pin_user_pages_fast
+ pin_user_pages_remote
+
+Basic description of FOLL_PIN
+=============================
+
+FOLL_PIN and FOLL_LONGTERM are flags that can be passed to the 
get_user_pages*()
+("gup") family of functions. FOLL_PIN has significant interactions and
+interdependencies with FOLL_LONGTERM, so both are covered here.
+
+FOLL_PIN is internal to gup, meaning that it should not appear at the gup call
+sites. This allows the associated wrapper functions  (pin_user_pages*() and
+others) to set the correct combination of these flags, and to check for 
problems
+as well.
+
+FOLL_LONGTERM, on the other hand, *is* allowed to be set at the gup call sites.
+This is in order to avoid creating a large number of wrapper functions to cover
+all combinations of get*(), pin*(), FOLL_LONGTERM, and more. Also, the
+pin_user_pages*() APIs are clearly distinct from the get_user_pages*() APIs, so
+that's a natural dividing line, and a good point to make separate wrapper 
calls.
+In other words, use pin_user_pages*() for DMA-pinned pages, and
+get_user_pages*() for other cases. There are four cases described later on in
+this document, to further clarify that concept.
+
+FOLL_PIN and FOLL_GET are mutually exclusive for a given gup call. However,
+multiple threads and call sites are free to pin the same struct pages, via both
+FOLL_PIN and FOLL_GET. It's just the call site that needs to choose one or the
+other, not the struct page(s).
+
+The FOLL_PIN implementation is nearly the same as FOLL_GET, except that 
FOLL_PIN
+uses a different reference counting technique.
+
+FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying that is,
+FOLL_LONGTERM is a specific, more restrictive case of FOLL_PIN.
+
+Which flags are set by each wrapper
+===================================
+
+For these pin_user_pages*() functions, FOLL_PIN is OR'd in with whatever gup
+flags the caller provides. The caller is required to pass in a non-null struct
+pages* array, and the function then pins pages by incrementing each by a special
+value. For now, that value is +1, just like get_user_pages*().::
+
+ Function
+ --------
+ pin_user_pages  FOLL_PIN is always set internally by this function.
+ pin_user_pages_fast FOLL_PIN is always set internally by this function.
+ pin

[PATCH v2 06/19] vfio: fix FOLL_LONGTERM use, simplify get_user_pages_remote() call

2019-11-25 Thread John Hubbard
Update VFIO to take advantage of the recently loosened restriction on
FOLL_LONGTERM with get_user_pages_remote(). Also, now it is possible to
fix a bug: the VFIO caller is logically a FOLL_LONGTERM user, but it
wasn't setting FOLL_LONGTERM.

Also, remove an unnecessary pair of calls that were releasing and
reacquiring the mmap_sem. There is no need to avoid holding mmap_sem
just in order to call page_to_pfn().

Also, now that the DAX check ("if a VMA is DAX, don't allow long
term pinning") is in the internals of get_user_pages_remote() and
__gup_longterm_locked(), there's no need for it at the VFIO call site.
So remove it.

Tested-by: Alex Williamson 
Acked-by: Alex Williamson 
Reviewed-by: Jason Gunthorpe 
Reviewed-by: Ira Weiny 
Suggested-by: Jason Gunthorpe 
Cc: Dan Williams 
Cc: Jerome Glisse 
Signed-off-by: John Hubbard 
---
 drivers/vfio/vfio_iommu_type1.c | 30 +-
 1 file changed, 5 insertions(+), 25 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 2ada8e6cdb88..b800fc9a0251 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -322,7 +322,6 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned 
long vaddr,
 {
struct page *page[1];
struct vm_area_struct *vma;
-   struct vm_area_struct *vmas[1];
unsigned int flags = 0;
int ret;
 
@@ -330,33 +329,14 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned 
long vaddr,
flags |= FOLL_WRITE;
 
down_read(&mm->mmap_sem);
-   if (mm == current->mm) {
-   ret = get_user_pages(vaddr, 1, flags | FOLL_LONGTERM, page,
-vmas);
-   } else {
-   ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page,
-   vmas, NULL);
-   /*
-* The lifetime of a vaddr_get_pfn() page pin is
-* userspace-controlled. In the fs-dax case this could
-* lead to indefinite stalls in filesystem operations.
-* Disallow attempts to pin fs-dax pages via this
-* interface.
-*/
-   if (ret > 0 && vma_is_fsdax(vmas[0])) {
-   ret = -EOPNOTSUPP;
-   put_page(page[0]);
-   }
-   }
-   up_read(&mm->mmap_sem);
-
+   ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags | FOLL_LONGTERM,
+   page, NULL, NULL);
if (ret == 1) {
*pfn = page_to_pfn(page[0]);
-   return 0;
+   ret = 0;
+   goto done;
}
 
-   down_read(&mm->mmap_sem);
-
vaddr = untagged_addr(vaddr);
 
vma = find_vma_intersection(mm, vaddr, vaddr + 1);
@@ -366,7 +346,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned 
long vaddr,
if (is_invalid_reserved_pfn(*pfn))
ret = 0;
}
-
+done:
up_read(&mm->mmap_sem);
return ret;
 }
-- 
2.24.0



[PATCH v2 03/19] mm: Cleanup __put_devmap_managed_page() vs ->page_free()

2019-11-25 Thread John Hubbard
From: Dan Williams 

After the removal of the device-public infrastructure there are only 2
->page_free() call backs in the kernel. One of those is a device-private
callback in the nouveau driver, the other is a generic wakeup needed in
the DAX case. In the hopes that all ->page_free() callbacks can be
migrated to common core kernel functionality, move the device-private
specific actions in __put_devmap_managed_page() under the
is_device_private_page() conditional, including the ->page_free()
callback. For the other page types just open-code the generic wakeup.

Yes, the wakeup is only needed in the MEMORY_DEVICE_FSDAX case, but it
does no harm in the MEMORY_DEVICE_DEVDAX and MEMORY_DEVICE_PCI_P2PDMA
case.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Jérôme Glisse 
Cc: Jan Kara 
Cc: Ira Weiny 
Signed-off-by: Dan Williams 
Signed-off-by: John Hubbard 
---
 drivers/nvdimm/pmem.c |  6 
 mm/memremap.c | 80 ---
 2 files changed, 44 insertions(+), 42 deletions(-)

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index ad8e4df1282b..4eae441f86c9 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -337,13 +337,7 @@ static void pmem_release_disk(void *__pmem)
put_disk(pmem->disk);
 }
 
-static void pmem_pagemap_page_free(struct page *page)
-{
-   wake_up_var(&page->_refcount);
-}
-
 static const struct dev_pagemap_ops fsdax_pagemap_ops = {
-   .page_free  = pmem_pagemap_page_free,
.kill   = pmem_pagemap_kill,
.cleanup= pmem_pagemap_cleanup,
 };
diff --git a/mm/memremap.c b/mm/memremap.c
index 022e78e68ea0..e1678e575d9f 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -27,7 +27,8 @@ static void devmap_managed_enable_put(void)
 
 static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
 {
-   if (!pgmap->ops || !pgmap->ops->page_free) {
+   if (pgmap->type == MEMORY_DEVICE_PRIVATE &&
+   (!pgmap->ops || !pgmap->ops->page_free)) {
WARN(1, "Missing page_free method\n");
return -EINVAL;
}
@@ -444,44 +445,51 @@ void __put_devmap_managed_page(struct page *page)
 {
int count = page_ref_dec_return(page);
 
-   /*
-* If refcount is 1 then page is freed and refcount is stable as nobody
-* holds a reference on the page.
-*/
-   if (count == 1) {
-   /* Clear Active bit in case of parallel mark_page_accessed */
-   __ClearPageActive(page);
-   __ClearPageWaiters(page);
+   /* still busy */
+   if (count > 1)
+   return;
 
-   mem_cgroup_uncharge(page);
+   /* only triggered by the dev_pagemap shutdown path */
+   if (count == 0) {
+   __put_page(page);
+   return;
+   }
 
-   /*
-* When a device_private page is freed, the page->mapping field
-* may still contain a (stale) mapping value. For example, the
-* lower bits of page->mapping may still identify the page as
-* an anonymous page. Ultimately, this entire field is just
-* stale and wrong, and it will cause errors if not cleared.
-* One example is:
-*
-*  migrate_vma_pages()
-*migrate_vma_insert_page()
-*  page_add_new_anon_rmap()
-*__page_set_anon_rmap()
-*  ...checks page->mapping, via PageAnon(page) call,
-*and incorrectly concludes that the page is an
-*anonymous page. Therefore, it incorrectly,
-*silently fails to set up the new anon rmap.
-*
-* For other types of ZONE_DEVICE pages, migration is either
-* handled differently or not done at all, so there is no need
-* to clear page->mapping.
-*/
-   if (is_device_private_page(page))
-   page->mapping = NULL;
+   /* notify page idle for dax */
+   if (!is_device_private_page(page)) {
+   wake_up_var(&page->_refcount);
+   return;
+   }
 
-   page->pgmap->ops->page_free(page);
-   } else if (!count)
-   __put_page(page);
+   /* Clear Active bit in case of parallel mark_page_accessed */
+   __ClearPageActive(page);
+   __ClearPageWaiters(page);
+
+   mem_cgroup_uncharge(page);
+
+   /*
+* When a device_private page is freed, the page->mapping field
+* may still contain a (stale) mapping value. For example, the
+* lower bits of page->mapping may still identify the page as an
+* anonymous page. Ultimately, this entire field is just stale
+* and wrong, and it will cause errors if not cleared.  One
+* example is:
+*  migrate_vma_pages()
+*    migrate_vma_insert_page()
+*      page_add_new_anon_rmap()
+*        __page_set_anon_rmap()
+*          ...checks page->mapping, via PageAnon(page) call,
+*          and incorrectly concludes that the page is an
+*          anonymous page. Therefore, it incorrectly,
+*          silently fails to set up the new anon rmap.
+*
+* For other types of ZONE_DEVICE pages, migration is either
+* handled differently or not done at all, so there is no need
+* to clear page->mapping.
+*/
+   page->mapping = NULL;
+
+   page->pgmap->ops->page_free(page);
 }
-- 
2.24.0

[PATCH v2 02/19] mm/gup: move try_get_compound_head() to top, fix minor issues

2019-11-25 Thread John Hubbard
An upcoming patch uses try_get_compound_head() more widely,
so move it to the top of gup.c.

Also fix a tiny spelling error and a checkpatch.pl warning.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Jan Kara 
Reviewed-by: Ira Weiny 
Signed-off-by: John Hubbard 
---
 mm/gup.c | 29 +++--
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index f764432914c4..3ecce297a47f 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -29,6 +29,21 @@ struct follow_page_context {
unsigned int page_mask;
 };
 
+/*
+ * Return the compound head page with ref appropriately incremented,
+ * or NULL if that failed.
+ */
+static inline struct page *try_get_compound_head(struct page *page, int refs)
+{
+   struct page *head = compound_head(page);
+
+   if (WARN_ON_ONCE(page_ref_count(head) < 0))
+   return NULL;
+   if (unlikely(!page_cache_add_speculative(head, refs)))
+   return NULL;
+   return head;
+}
+
 /**
  * put_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
  * @pages:  array of pages to be maybe marked dirty, and definitely released.
@@ -1807,20 +1822,6 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
}
 }
 
-/*
- * Return the compund head page with ref appropriately incremented,
- * or NULL if that failed.
- */
-static inline struct page *try_get_compound_head(struct page *page, int refs)
-{
-   struct page *head = compound_head(page);
-   if (WARN_ON_ONCE(page_ref_count(head) < 0))
-   return NULL;
-   if (unlikely(!page_cache_add_speculative(head, refs)))
-   return NULL;
-   return head;
-}
-
 #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 unsigned int flags, struct page **pages, int *nr)
-- 
2.24.0



[PATCH v2 04/19] goldfish_pipe: rename local pin_user_pages() routine

2019-11-25 Thread John Hubbard
1. Avoid naming conflicts: rename local static function from
"pin_user_pages()" to "goldfish_pin_pages()".

An upcoming patch will introduce a global pin_user_pages()
function.

Reviewed-by: Jan Kara 
Reviewed-by: Jérôme Glisse 
Reviewed-by: Ira Weiny 
Signed-off-by: John Hubbard 
---
 drivers/platform/goldfish/goldfish_pipe.c | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c
index cef0133aa47a..ef50c264db71 100644
--- a/drivers/platform/goldfish/goldfish_pipe.c
+++ b/drivers/platform/goldfish/goldfish_pipe.c
@@ -257,12 +257,12 @@ static int goldfish_pipe_error_convert(int status)
}
 }
 
-static int pin_user_pages(unsigned long first_page,
- unsigned long last_page,
- unsigned int last_page_size,
- int is_write,
- struct page *pages[MAX_BUFFERS_PER_COMMAND],
- unsigned int *iter_last_page_size)
+static int goldfish_pin_pages(unsigned long first_page,
+ unsigned long last_page,
+ unsigned int last_page_size,
+ int is_write,
+ struct page *pages[MAX_BUFFERS_PER_COMMAND],
+ unsigned int *iter_last_page_size)
 {
int ret;
int requested_pages = ((last_page - first_page) >> PAGE_SHIFT) + 1;
@@ -354,9 +354,9 @@ static int transfer_max_buffers(struct goldfish_pipe *pipe,
if (mutex_lock_interruptible(&pipe->lock))
return -ERESTARTSYS;
 
-   pages_count = pin_user_pages(first_page, last_page,
-last_page_size, is_write,
-pipe->pages, &iter_last_page_size);
+   pages_count = goldfish_pin_pages(first_page, last_page,
+last_page_size, is_write,
+pipe->pages, &iter_last_page_size);
if (pages_count < 0) {
mutex_unlock(&pipe->lock);
return pages_count;
-- 
2.24.0



[PATCH v2 01/19] mm/gup: factor out duplicate code from four routines

2019-11-25 Thread John Hubbard
There are four locations in gup.c that have a fair amount of code
duplication. This means that changing one requires making the same
changes in four places, not to mention reading the same code four
times, and wondering if there are subtle differences.

Factor out the common code into static functions, thus reducing the
overall line count and the code's complexity.

Also, take the opportunity to slightly improve the efficiency of the
error cases, by doing a mass subtraction of the refcount, surrounded
by get_page()/put_page().

Also, further simplify (slightly), by waiting until the successful
end of each routine to increment *nr.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Jérôme Glisse 
Reviewed-by: Jan Kara 
Cc: Ira Weiny 
Cc: Christoph Hellwig 
Cc: Aneesh Kumar K.V 
Signed-off-by: John Hubbard 
---
 mm/gup.c | 91 ++--
 1 file changed, 36 insertions(+), 55 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index 7646bf993b25..f764432914c4 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1978,6 +1978,25 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
 }
 #endif
 
+static int record_subpages(struct page *page, unsigned long addr,
+  unsigned long end, struct page **pages)
+{
+   int nr;
+
+   for (nr = 0; addr != end; addr += PAGE_SIZE)
+   pages[nr++] = page++;
+
+   return nr;
+}
+
+static void put_compound_head(struct page *page, int refs)
+{
+   /* Do a get_page() first, in case refs == page->_refcount */
+   get_page(page);
+   page_ref_sub(page, refs);
+   put_page(page);
+}
+
 #ifdef CONFIG_ARCH_HAS_HUGEPD
 static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
  unsigned long sz)
@@ -2007,32 +2026,20 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
/* hugepages are never "special" */
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 
-   refs = 0;
head = pte_page(pte);
-
page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
-   do {
-   VM_BUG_ON(compound_head(page) != head);
-   pages[*nr] = page;
-   (*nr)++;
-   page++;
-   refs++;
-   } while (addr += PAGE_SIZE, addr != end);
+   refs = record_subpages(page, addr, end, pages + *nr);
 
head = try_get_compound_head(head, refs);
-   if (!head) {
-   *nr -= refs;
+   if (!head)
return 0;
-   }
 
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
-   /* Could be optimized better */
-   *nr -= refs;
-   while (refs--)
-   put_page(head);
+   put_compound_head(head, refs);
return 0;
}
 
+   *nr += refs;
SetPageReferenced(head);
return 1;
 }
@@ -2079,28 +2086,19 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
}
 
-   refs = 0;
page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
-   do {
-   pages[*nr] = page;
-   (*nr)++;
-   page++;
-   refs++;
-   } while (addr += PAGE_SIZE, addr != end);
+   refs = record_subpages(page, addr, end, pages + *nr);
 
head = try_get_compound_head(pmd_page(orig), refs);
-   if (!head) {
-   *nr -= refs;
+   if (!head)
return 0;
-   }
 
if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
-   *nr -= refs;
-   while (refs--)
-   put_page(head);
+   put_compound_head(head, refs);
return 0;
}
 
+   *nr += refs;
SetPageReferenced(head);
return 1;
 }
@@ -2120,28 +2118,19 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
}
 
-   refs = 0;
page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
-   do {
-   pages[*nr] = page;
-   (*nr)++;
-   page++;
-   refs++;
-   } while (addr += PAGE_SIZE, addr != end);
+   refs = record_subpages(page, addr, end, pages + *nr);
 
head = try_get_compound_head(pud_page(orig), refs);
-   if (!head) {
-   *nr -= refs;
+   if (!head)
return 0;
-   }
 
if (unlikely(pud_val(orig) != pud_val(*pudp))) {
-   *nr -= refs;
-   while (refs--)
-   put_page(head);
+   put_compound_head(head, refs);
return 0;
}
 
+   *nr += refs;
SetPageReferenced(head);
return 1;
 }
@@ -2157,28 +2146,20 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,

[PATCH v2 00/19] pin_user_pages(): reduced-risk series for Linux 5.5

2019-11-25 Thread John Hubbard
Hi,

Changes since v1:

* Fixed up ppc in response to Jan Kara's review comments (thanks for
  those!).

* Fixed a kbuild robot-detected build failure: added a stub function for
  the !CONFIG_MMU case.

* Cover letter: now refers to "unpin_user_page()", reflecting the name
  change in the last patch (instead of put_user_page()).

* Rebased onto today's linux-next: c165016bac27 ("Add linux-next
  specific files for 20191125")


Here is a set of well-reviewed (except for one patch), lower-risk items
that can go into Linux 5.5. (Update: the powerpc conversion patch has
had some initial review now, since v1 was posted.)

This is essentially a cut-down v8 of "mm/gup: track dma-pinned pages:
FOLL_PIN" [1], with one of the VFIO patches split into two patches.
The idea here is to get this long list of "noise" checked into 5.5, so
that the actual, higher-risk "track FOLL_PIN pages" (which is deferred:
not part of this series) will be a much shorter patchset to review.

For the v4l2-core changes, I've left those here (instead of sending
them separately to the -media tree), in order to get the name change
done now (put_user_page --> unpin_user_page). However, I've added a Cc
stable, as recommended during the last round of reviews.

Here are the relevant notes from the original cover letter, edited to
match the current situation:

This is a prerequisite to tracking dma-pinned pages. That in turn is a
prerequisite to solving the larger problem of proper interactions
between file-backed pages, and [R]DMA activities, as discussed in [1],
[2], [3], and in a remarkable number of email threads since about
2017. :)

A new internal gup flag, FOLL_PIN is introduced, and thoroughly
documented in the last patch's Documentation/vm/pin_user_pages.rst.

I believe that this will provide a good starting point for doing the
layout lease work that Ira Weiny has been working on. That's because
these new wrapper functions provide a clean, constrained, systematically
named set of functionality that, again, is required in order to even
know if a page is "dma-pinned".

In contrast to earlier approaches, the page tracking can be
incrementally applied to the kernel call sites that, until now, have
been simply calling get_user_pages() ("gup"). In other words, opt-in by
changing from this:

get_user_pages() (sets FOLL_GET)
put_page()

to this:
pin_user_pages() (sets FOLL_PIN)
unpin_user_page()

Because there are interdependencies with FOLL_LONGTERM, a similar
conversion as for FOLL_PIN, was applied. The change was from this:

get_user_pages(FOLL_LONGTERM) (also sets FOLL_GET)
put_page()

to this:
pin_longterm_pages() (sets FOLL_PIN | FOLL_LONGTERM)
unpin_user_page()
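
As a concrete illustration, here is a minimal sketch of what such a
call-site conversion can look like; the surrounding buffer handling is
hypothetical, not taken from any patch in this series:

	/* Before: FOLL_GET-based pinning. */
	ret = get_user_pages(user_addr, nr_pages, FOLL_WRITE, pages, NULL);
	if (ret > 0) {
		/* ... DMA to/from the pinned pages ... */
		for (i = 0; i < ret; i++)
			put_page(pages[i]);
	}

	/* After: FOLL_PIN-based pinning, same call signature. */
	ret = pin_user_pages(user_addr, nr_pages, FOLL_WRITE, pages, NULL);
	if (ret > 0) {
		/* ... DMA to/from the pinned pages ... */
		for (i = 0; i < ret; i++)
			unpin_user_page(pages[i]);
	}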

[1] https://lore.kernel.org/r/20191121071354.456618-1-jhubb...@nvidia.com

thanks,
John Hubbard
NVIDIA


Dan Williams (1):
  mm: Cleanup __put_devmap_managed_page() vs ->page_free()

John Hubbard (18):
  mm/gup: factor out duplicate code from four routines
  mm/gup: move try_get_compound_head() to top, fix minor issues
  goldfish_pipe: rename local pin_user_pages() routine
  mm: fix get_user_pages_remote()'s handling of FOLL_LONGTERM
  vfio: fix FOLL_LONGTERM use, simplify get_user_pages_remote() call
  mm/gup: introduce pin_user_pages*() and FOLL_PIN
  goldfish_pipe: convert to pin_user_pages() and put_user_page()
  IB/{core,hw,umem}: set FOLL_PIN via pin_user_pages*(), fix up ODP
  mm/process_vm_access: set FOLL_PIN via pin_user_pages_remote()
  drm/via: set FOLL_PIN via pin_user_pages_fast()
  fs/io_uring: set FOLL_PIN via pin_user_pages()
  net/xdp: set FOLL_PIN via pin_user_pages()
  media/v4l2-core: set pages dirty upon releasing DMA buffers
  media/v4l2-core: pin_user_pages (FOLL_PIN) and put_user_page()
conversion
  vfio, mm: pin_user_pages (FOLL_PIN) and put_user_page() conversion
  powerpc: book3s64: convert to pin_user_pages() and put_user_page()
  mm/gup_benchmark: use proper FOLL_WRITE flags instead of hard-coding
"1"
  mm, tree-wide: rename put_user_page*() to unpin_user_page*()

 Documentation/core-api/index.rst|   1 +
 Documentation/core-api/pin_user_pages.rst   | 233 ++
 arch/powerpc/mm/book3s64/iommu_api.c|  12 +-
 drivers/gpu/drm/via/via_dmablit.c   |   6 +-
 drivers/infiniband/core/umem.c  |   4 +-
 drivers/infiniband/core/umem_odp.c  |  13 +-
 drivers/infiniband/hw/hfi1/user_pages.c |   4 +-
 drivers/infiniband/hw/mthca/mthca_memfree.c |   8 +-
 drivers/infiniband/hw/qib/qib_user_pages.c  |   4 +-
 drivers/infiniband/hw/qib/qib_user_sdma.c   |   8 +-
 drivers/infiniband/hw/usnic/usnic_uiom.c|   4 +-
 drivers/infiniband/sw/siw/siw_mem.c |   4 +-
 drivers/media/v4l2-core/videobuf-dma-sg.c   |   8 +-
 drivers/nvdimm/pmem.c   |   6 -
 

Re: [PATCH 17/19] powerpc: book3s64: convert to pin_user_pages() and put_user_page()

2019-11-25 Thread John Hubbard
On 11/25/19 12:59 AM, Jan Kara wrote:
> On Sun 24-11-19 20:20:09, John Hubbard wrote:
>> 1. Convert from get_user_pages() to pin_user_pages().
>>
>> 2. As required by pin_user_pages(), release these pages via
>> put_user_page(). In this case, do so via put_user_pages_dirty_lock().
>>
>> That has the side effect of calling set_page_dirty_lock(), instead
>> of set_page_dirty(). This is probably more accurate.
>>
>> As Christoph Hellwig put it, "set_page_dirty() is only safe if we are
>> dealing with a file backed page where we have reference on the inode it
>> hangs off." [1]
>>
>> 3. Release each page in mem->hpages[] (instead of mem->hpas[]), because
>> that is the array that pin_longterm_pages() filled in. This is more
>> accurate and should be a little safer from a maintenance point of
>> view.
> 
> Except that this breaks the code. hpages is unioned with hpas...
> 

OK. 

>> @@ -212,10 +211,9 @@ static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
>>  if (!page)
>>  continue;
>>  
>> -if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY)
>> -SetPageDirty(page);
>> +put_user_pages_dirty_lock(&mem->hpages[i], 1,
>> +  MM_IOMMU_TABLE_GROUP_PAGE_DIRTY);
> 
> And the dirtying condition is wrong here as well. Currently it is always
> true.
> 
>   Honza
> 

Yes. Fixed up locally. The function now looks like this (for this patch, not for
the entire series, which renames "put" to "unpin"):


static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
{
long i;
struct page *page = NULL;

if (!mem->hpas)
return;

for (i = 0; i < mem->entries; ++i) {
if (!mem->hpas[i])
continue;

page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT);
if (!page)
continue;

put_user_pages_dirty_lock(&page, 1,
mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY);

mem->hpas[i] = 0;
}
}

thanks,
-- 
John Hubbard
NVIDIA



Re: [PATCH 07/19] mm/gup: introduce pin_user_pages*() and FOLL_PIN

2019-11-25 Thread John Hubbard
On 11/25/19 12:44 AM, kbuild test robot wrote:
> Hi John,
> 
> Thank you for the patch! Yet something to improve:
> 
> [auto build test ERROR on rdma/for-next]
> [cannot apply to v5.4 next-20191122]
> [if your patch is applied to the wrong git tree, please drop us a note to help
> improve the system. BTW, we also suggest to use '--base' option to specify the
> base tree in git format-patch, please see 
> https://stackoverflow.com/a/37406982]
> 
> url:
> https://github.com/0day-ci/linux/commits/John-Hubbard/pin_user_pages-reduced-risk-series-for-Linux-5-5/20191125-125637
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git for-next
> config: arm-randconfig-a001-20191125 (attached as .config)
> compiler: arm-linux-gnueabi-gcc (GCC) 7.4.0
> reproduce:
> wget 
> https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
> ~/bin/make.cross
> chmod +x ~/bin/make.cross
> # save the attached .config to linux build tree
> GCC_VERSION=7.4.0 make.cross ARCH=arm 
> 
> If you fix the issue, kindly add following tag
> Reported-by: kbuild test robot 
> 
> All errors (new ones prefixed by >>):
> 
>mm/gup.o: In function `pin_user_pages_remote':
>>> mm/gup.c:2528: undefined reference to `get_user_pages_remote'
> 
> vim +2528 mm/gup.c


This, and the other (sh) report, is due to !CONFIG_MMU lacking a
get_user_pages_remote(), which pin_user_pages_remote() needs for its
(temporary) implementation. I'll post the fix in v2.


thanks,
-- 
John Hubbard
NVIDIA


[PATCH] powerpc/mpc85xx: also write addr_h to spin table for 64bit boot entry

2019-11-25 Thread yingjie_bai
From: Bai Yingjie 

CPUs like the P4080 have 36-bit physical addresses; their DDR physical
start address can be configured above 4G by the LAW registers.

For systems whose physical memory start address is configured above 4G,
we also need to write addr_h into the spin table of the target secondary
CPU, so that addr_h and addr_l together represent a 64-bit physical
address.
Otherwise the secondary core cannot get the correct entry point to start
from.

This does no harm in the normal case, where addr_h is all 0.
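
For illustration, a minimal sketch of how a 64-bit physical entry
address splits into the two 32-bit spin table fields; the address value
is hypothetical:

	u64 pa = 0x120000000ULL;	/* hypothetical entry point above 4G */
	u32 addr_h = pa >> 32;		/* 0x1 */
	u32 addr_l = pa & 0xffffffff;	/* 0x20000000 */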

Signed-off-by: Bai Yingjie 
---
 arch/powerpc/platforms/85xx/smp.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c
index 8c7ea2486bc0..f12cdd1e80ff 100644
--- a/arch/powerpc/platforms/85xx/smp.c
+++ b/arch/powerpc/platforms/85xx/smp.c
@@ -252,6 +252,14 @@ static int smp_85xx_start_cpu(int cpu)
out_be64((u64 *)(&spin_table->addr_h),
__pa(ppc_function_entry(generic_secondary_smp_init)));
 #else
+   /*
+    * We also need to write addr_h to the spin table for systems
+    * whose physical memory start address was configured above 4G;
+    * otherwise the secondary core cannot get the correct entry
+    * point to start from.
+    * This does no harm in the normal case, where addr_h is all 0.
+    */
+   out_be32(&spin_table->addr_h, __pa(__early_start) >> 32);
out_be32(&spin_table->addr_l, __pa(__early_start));
 #endif
flush_spin_table(spin_table);
-- 
2.17.1



Re: [PATCH net 0/4] ibmvnic: Harden device commands and queries

2019-11-25 Thread Jakub Kicinski
On Mon, 25 Nov 2019 12:40:42 -0600, Thomas Falcon wrote:
> On 11/23/19 7:49 PM, Jakub Kicinski wrote:
> > On Fri, 22 Nov 2019 13:41:42 -0600, Thomas Falcon wrote:  
> >> This patch series fixes some shortcomings with the current
> >> VNIC device command implementation. The first patch fixes
> >> the initialization of driver completion structures used
> >> for device commands. Additionally, all waits for device
> >> commands are bounded with a timeout in the event that the
> >> device does not respond or becomes inoperable. Finally,
> >> serialize queries to retain the integrity of device return
> >> codes.  
> > I have minor comments on two patches, but also I think it's
> > a little late in the release cycle for putting this in net.
> >
> > Could you target net-next and repost ASAP so it still makes
> > it into 5.5?
> 
> Thank you, sorry for the late response.  I will make the requested 
> changes ASAP, but I've missed the net-next window.  What should I target 
> for v2?

You're right, sticking to "net" makes sense at this point.


Re: [PATCH net 0/4] ibmvnic: Harden device commands and queries

2019-11-25 Thread Thomas Falcon



On 11/23/19 7:49 PM, Jakub Kicinski wrote:

On Fri, 22 Nov 2019 13:41:42 -0600, Thomas Falcon wrote:

This patch series fixes some shortcomings with the current
VNIC device command implementation. The first patch fixes
the initialization of driver completion structures used
for device commands. Additionally, all waits for device
commands are bounded with a timeout in the event that the
device does not respond or becomes inoperable. Finally,
serialize queries to retain the integrity of device return
codes.

I have minor comments on two patches, but also I think it's
a little late in the release cycle for putting this in net.

Could you target net-next and repost ASAP so it still makes
it into 5.5?

Thanks.


Thank you, sorry for the late response.  I will make the requested 
changes ASAP, but I've missed the net-next window.  What should I target 
for v2?


Thanks again,

Tom



Re: Bug 205201 - Booting halts if Dawicontrol DC-2976 UW SCSI board installed, unless RAM size limited to 3500M

2019-11-25 Thread Christian Zigotzky

On 25 November 2019 at 10:32 am, Mike Rapoport wrote:

On Mon, Nov 25, 2019 at 08:39:23AM +0100, Christoph Hellwig wrote:

On Sat, Nov 23, 2019 at 12:42:27PM +0100, Christian Zigotzky wrote:

Hello Christoph,

Please find attached the dmesg of your Git kernel.

Thanks.  It looks like on your platform the swiotlb buffer isn't
actually addressable based on the bus dma mask limit, which is rather
interesting.  swiotlb_init uses memblock_alloc_low to allocate the
buffer, and I'll need some help from Mike and the powerpc maintainers
to figure out how that select where to allocate the buffer from, and
how we can move it to a lower address.  My gut feeling would be to try
to do what arm64 does and define a new ARCH_LOW_ADDRESS_LIMIT, preferably
without needing too much arch specific magic.

Presuming the problem is relevant for all CoreNet boards something like
this could work:
  
diff --git a/arch/powerpc/include/asm/dma.h b/arch/powerpc/include/asm/dma.h

index 1b4f0254868f..7c6cfeeaff52 100644
--- a/arch/powerpc/include/asm/dma.h
+++ b/arch/powerpc/include/asm/dma.h
@@ -347,5 +347,11 @@ extern int isa_dma_bridge_buggy;
  #define isa_dma_bridge_buggy  (0)
  #endif
  
+#ifdef CONFIG_CORENET_GENERIC

+extern phys_addr_t ppc_dma_phys_limit;
+#define ARCH_LOW_ADDRESS_LIMIT (ppc_dma_phys_limit - 1)
+#endif
+
+
  #endif /* __KERNEL__ */
  #endif/* _ASM_POWERPC_DMA_H */
diff --git a/arch/powerpc/platforms/85xx/common.c b/arch/powerpc/platforms/85xx/common.c
index fe0606439b5a..346b436b6d3f 100644
--- a/arch/powerpc/platforms/85xx/common.c
+++ b/arch/powerpc/platforms/85xx/common.c
@@ -126,3 +126,7 @@ void __init mpc85xx_qe_par_io_init(void)
}
  }
  #endif
+
+#ifdef CONFIG_CORENET_GENERIC
+phys_addr_t ppc_dma_phys_limit = 0xffffffffUL;
+#endif
diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c b/arch/powerpc/platforms/85xx/corenet_generic.c
index 7ee2c6628f64..673bcbdc7c75 100644
--- a/arch/powerpc/platforms/85xx/corenet_generic.c
+++ b/arch/powerpc/platforms/85xx/corenet_generic.c
@@ -64,7 +64,7 @@ void __init corenet_gen_setup_arch(void)
mpc85xx_smp_init();
  
  	swiotlb_detect_4g();

-
+   ppc_dma_phys_limit = 0x0fffffffUL;
pr_info("%s board\n", ppc_md.name);
  
  	mpc85xx_qe_init();



As a quick hack can you try this patch on top of the tree from Friday?

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index f491690d54c6..e3f95c362922 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -344,7 +344,7 @@ static inline int memblock_get_region_node(const struct memblock_region *r)
  #define MEMBLOCK_LOW_LIMIT 0
  
  #ifndef ARCH_LOW_ADDRESS_LIMIT

-#define ARCH_LOW_ADDRESS_LIMIT  0xffffffffUL
+#define ARCH_LOW_ADDRESS_LIMIT  0x0fffffffUL
  #endif
  
  phys_addr_t memblock_phys_alloc_range(phys_addr_t size, phys_addr_t align,

Hello Mike,

Many thanks for your patch! I will test it tomorrow.

Cheers,
Christian


Re: Bug 205201 - Booting halts if Dawicontrol DC-2976 UW SCSI board installed, unless RAM size limited to 3500M

2019-11-25 Thread Christian Zigotzky

On 25 November 2019 at 08:39 am, Christoph Hellwig wrote:

On Sat, Nov 23, 2019 at 12:42:27PM +0100, Christian Zigotzky wrote:

Hello Christoph,

Please find attached the dmesg of your Git kernel.

Thanks.  It looks like on your platform the swiotlb buffer isn't
actually addressable based on the bus dma mask limit, which is rather
interesting.  swiotlb_init uses memblock_alloc_low to allocate the
buffer, and I'll need some help from Mike and the powerpc maintainers
to figure out how that selects where to allocate the buffer from, and
how we can move it to a lower address.  My gut feeling would be to try
to do what arm64 does and define a new ARCH_LOW_ADDRESS_LIMIT, preferably
without needing too much arch specific magic.

As a quick hack can you try this patch on top of the tree from Friday?

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index f491690d54c6..e3f95c362922 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -344,7 +344,7 @@ static inline int memblock_get_region_node(const struct memblock_region *r)
  #define MEMBLOCK_LOW_LIMIT 0
  
  #ifndef ARCH_LOW_ADDRESS_LIMIT

-#define ARCH_LOW_ADDRESS_LIMIT  0xffffffffUL
+#define ARCH_LOW_ADDRESS_LIMIT  0x0fffffffUL
  #endif
  
  phys_addr_t memblock_phys_alloc_range(phys_addr_t size, phys_addr_t align,



Hello Christoph,

Thanks a lot for your help! I will test your patch tomorrow.

Cheers,
Christian


Re: [PATCH v2] dma-mapping: treat dev->bus_dma_mask as a DMA limit

2019-11-25 Thread Robin Murphy

On 25/11/2019 7:44 am, Christoph Hellwig wrote:

On Sat, Nov 23, 2019 at 09:51:08AM -0700, Nathan Chancellor wrote:

Just as an FYI, this introduces a warning on arm32 allyesconfig for me:


I think the dma_limit argument to iommu_dma_alloc_iova should be a u64
and/or we need to use min_t and open code the zero exception.

Robin, Nicolas - any opinions?


Yeah, given that it's always held a mask I'm not entirely sure why it 
was ever a dma_addr_t rather than a u64. Unless anyone else is desperate 
to do it I'll get a cleanup patch ready for rc1.
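
For concreteness, a hedged sketch of the clamping being suggested; the
variable names are assumed from context, and the field follows the
bus_dma_limit naming from the patch under discussion:

	u64 dma_limit = dma_get_mask(dev);

	/* Open-coded zero exception: only clamp when a bus limit is set. */
	if (dev->bus_dma_limit)
		dma_limit = min_t(u64, dma_limit, dev->bus_dma_limit);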



Also I wonder how this file gets compiled on arm32 given that arm32
has its own set of iommu dma ops..


As long as the dependencies for CONFIG_IOMMU_DMA are met it can be built 
even when it's not actually used. That said, I might have expected that 
arm allyesconfig ends up with CONFIG_ARCH_DMA_ADDR_T_64BIT=y anyway; I 
guess it must pick some of CONFIG_ARM_LPAE's negative dependencies.


(/me doesn't feel like jumping down the all*config rabbit hole today)

Robin.


Re: [PATCH v4 2/2] powerpc/irq: inline call_do_irq() and call_do_softirq()

2019-11-25 Thread Segher Boessenkool
On Mon, Nov 25, 2019 at 09:32:23PM +1100, Michael Ellerman wrote:
> Segher Boessenkool  writes:
> >> > +static inline void call_do_irq(struct pt_regs *regs, void *sp)
> >> > +{
> >> > +register unsigned long r3 asm("r3") = (unsigned long)regs;
> >> > +
> >> > +/* Temporarily switch r1 to sp, call __do_irq() then restore r1 
> >> > */
> >> > +asm volatile(
> >> > +"   "PPC_STLU"  1, %2(%1);\n"
> >> > +"   mr  1, %1;\n"
> >> > +"   bl  %3;\n"
> >> > +"   "PPC_LL"1, 0(1);\n" :
> >> > +"+r"(r3) :
> >> > +"b"(sp), "i"(THREAD_SIZE - STACK_FRAME_OVERHEAD), 
> >> > "i"(__do_irq) :
> >> > +"lr", "xer", "ctr", "memory", "cr0", "cr1", "cr5", 
> >> > "cr6", "cr7",
> >> > +"r0", "r2", "r4", "r5", "r6", "r7", "r8", "r9", "r10", 
> >> > "r11", "r12");
> >> > +}
> >> 
> >> If we add a nop after the bl, so the linker could insert a TOC restore,
> >> then I don't think there's any circumstance under which we expect this
> >> to actually clobber r2, is there?
> >
> > That is mostly correct.
> 
> That's the standard I aspire to :P
> 
> > If call_do_irq was a no-inline function, there would not be problems.
> >
> > What TOC does __do_irq require in r2 on entry, and what will be there
> > when it returns?
> 
> The kernel TOC, and also the kernel TOC, unless something's gone wrong
> or I'm missing something.

If that is the case, we can just do the bl, no nop at all?  And that works
for all of our ABIs.

If we can be certain that we have the kernel TOC in r2 on entry to
call_do_irq, that is!  (Or it establishes it itself).
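
For reference, a hedged sketch of the variant being discussed: keep the
bl without the "r2" clobber, with a nop slot left for a linker-inserted
TOC restore (an illustration of the idea, not a tested patch):

	asm volatile(
		"	"PPC_STLU"	1, %2(%1);\n"
		"	mr		1, %1;\n"
		"	bl		%3;\n"
		"	nop;\n"	/* slot for a linker-inserted TOC restore */
		"	"PPC_LL"	1, 0(1);\n" :
		"+r"(r3) :
		"b"(sp), "i"(THREAD_SIZE - STACK_FRAME_OVERHEAD), "i"(__do_irq) :
		/* note: "r2" intentionally absent from the clobber list */
		"lr", "xer", "ctr", "memory", "cr0", "cr1", "cr5", "cr6", "cr7",
		"r0", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12");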


Segher


Re: [PATCH v17 10/13] namei: LOOKUP_{IN_ROOT,BENEATH}: permit limited ".." resolution

2019-11-25 Thread Aleksa Sarai
On 2019-11-25, Al Viro  wrote:
> On Sun, Nov 17, 2019 at 12:17:10PM +1100, Aleksa Sarai wrote:
> > +   if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
> > +   /*
> > +* If there was a racing rename or mount along our
> > +* path, then we can't be sure that ".." hasn't jumped
> > +* above nd->root (and so userspace should retry or use
> > +* some fallback).
> > +*/
> > +   if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
> > +   return -EAGAIN;
> > +   if (unlikely(read_seqretry(&rename_lock, nd->r_seq)))
> > +   return -EAGAIN;
> > +   }
> 
> Looks like excessive barriers to me - it's
>   rmb
>   check mount_lock.sequence
>   rmb
>   check rename_lock.sequence

If you like, I can switch this to

smp_rmb();
if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)))
return -EAGAIN;
if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)))
return -EAGAIN;

Though I think it makes it more noisy (and this code-path will only be
hit for ".." and LOOKUP_IS_SCOPED).

> > @@ -2266,6 +2274,10 @@ static const char *path_init(struct nameidata *nd, 
> > unsigned flags)
> > nd->last_type = LAST_ROOT; /* if there are only slashes... */
> > nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
> > nd->depth = 0;
> > +
> > +   nd->m_seq = read_seqbegin(&mount_lock);
> > +   nd->r_seq = read_seqbegin(&rename_lock);
> 
> Same here, pretty much (fetch/rmb/fetch/rmb)

Unless I'm mistaken, wouldn't we have to do
seqcount_lockdep_reader_access() explicitly -- so it would end up
looking something like:

seqcount_lockdep_reader_access(&mount_lock.seqcount);
nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
seqcount_lockdep_reader_access(&mount_lock.seqcount);
nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
smp_rmb();

Given this code only runs once at the start of each lookup, I'm not sure
it makes much sense to expand it like that and make it look uglier.

If you really want to avoid the duplicate memory barriers in the common
case I could instead gate the rename_lock grab behind LOOKUP_IS_SCOPED
(since that's the only time it's used).
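
(For concreteness, a minimal sketch of that gating in path_init(),
reusing the lines quoted above:

	nd->m_seq = read_seqbegin(&mount_lock);
	if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
		nd->r_seq = read_seqbegin(&rename_lock);
)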

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH





Re: lockdep warning while booting POWER9 PowerNV

2019-11-25 Thread Daniel Axtens
powerpc: define arch_is_kernel_initmem_freed() for lockdep

Under certain circumstances, we hit a warning in lockdep_register_key:

if (WARN_ON_ONCE(static_obj(key)))
return;

This occurs when the key falls into initmem that has since been freed
and can now be reused. This has been observed on boot, and under memory
pressure.

Define arch_is_kernel_initmem_freed(), which allows lockdep to correctly
identify this memory as dynamic.

This fixes a bug picked up by the powerpc64 syzkaller instance where we
hit the WARN via alloc_netdev_mqs.

Link: https://github.com/linuxppc/issues/issues/284
Link: https://lore.kernel.org/linuxppc-dev/87ef0vpfbc@mpe.ellerman.id.au/
Reported-by: Qian Cai 
Reported-by: ppc syzbot c/o Andrew Donnellan 
Commit-message-by: Daniel Axtens 


---

The ppc64 syzkaller link is probably not stable enough to go into
the git history forever, but fwiw:
https://syzkaller-ppc64.appspot.com/bug?id=cfdf75cd985012d0124cd41e6fa095d33e7d0f6b

---
 arch/powerpc/include/asm/sections.h | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h
index 5a9b6eb651b6..d19871763ed4 100644
--- a/arch/powerpc/include/asm/sections.h
+++ b/arch/powerpc/include/asm/sections.h
@@ -5,8 +5,22 @@
 
 #include 
 #include 
+
+#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed
+
 #include 
 
+extern bool init_mem_is_free;
+
+static inline int arch_is_kernel_initmem_freed(unsigned long addr)
+{
+   if (!init_mem_is_free)
+   return 0;
+
+   return addr >= (unsigned long)__init_begin &&
+   addr < (unsigned long)__init_end;
+}
+
 extern char __head_end[];
 
 #ifdef __powerpc64__



[PATCH v2 06/14] powerpc: Replace cpu_up/down with device_online/offline

2019-11-25 Thread Qais Yousef
The core device API performs extra housekeeping that is missing when
cpu_up/down are called directly.

See commit a6717c01ddc2 ("powerpc/rtas: use device model APIs and
serialization during LPM") for an example description of what might go
wrong.

This also prepares to make cpu_up/down a private interface for anything
but the cpu subsystem.

Acked-by: Michael Ellerman 
Signed-off-by: Qais Yousef 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Michael Ellerman 
CC: Enrico Weigelt 
CC: Ram Pai 
CC: Nicholas Piggin 
CC: Thiago Jung Bauermann 
CC: Christophe Leroy 
CC: Thomas Gleixner 
CC: linuxppc-dev@lists.ozlabs.org
CC: linux-ker...@vger.kernel.org
---
 arch/powerpc/kernel/machine_kexec_64.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c
index 04a7cba58eff..ebf8cc7acc4d 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -208,13 +208,15 @@ static void wake_offline_cpus(void)
 {
int cpu = 0;
 
+   lock_device_hotplug();
for_each_present_cpu(cpu) {
if (!cpu_online(cpu)) {
printk(KERN_INFO "kexec: Waking offline cpu %d.\n",
   cpu);
-   WARN_ON(cpu_up(cpu));
+   WARN_ON(device_online(get_cpu_device(cpu)));
}
}
+   unlock_device_hotplug();
 }
 
 static void kexec_prepare_cpus(void)
-- 
2.17.1



[PATCH v2 00/14] Convert cpu_up/down to device_online/offline

2019-11-25 Thread Qais Yousef
Changes in v2:
* Add 2 new patches that create smp_shutdown_nonboot_cpus() to be used
  in machine_shutdown() in ia64, arm and arm64
* Use proper kernel-doc for the newly introduced functions
* Renamed a function
* Removed a stale comment in a function
* Rebased on top of 5.4-rc8

git clone git://linux-arm.org/linux-qy.git -b cpu-hp-cleanup-v2

Using cpu_up/down directly to bring cpus online/offline loses synchronization
with sysfs and could suffer from a race similar to what is described in
commit a6717c01ddc2 ("powerpc/rtas: use device model APIs and serialization
during LPM").

cpu_up/down seem to be more of an internal implementation detail for the cpu
subsystem to use to boot up cpus, perform suspend/resume and low level hotplug
operations. Users outside of the cpu subsystem would be better off using the
device core API to bring a cpu online/offline, which is the interface used to
hotplug memory and other system devices.
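
As a minimal sketch of the conversion pattern used throughout this
series (mirroring the powerpc patch), a direct call such as:

	ret = cpu_up(cpu);			/* bypasses the device core */

becomes:

	struct device *dev = get_cpu_device(cpu);

	lock_device_hotplug();
	ret = device_online(dev);		/* keeps sysfs state in sync */
	unlock_device_hotplug();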

Several users have already migrated to use the device core API, this series
converts the remaining users and hides cpu_up/down from internal users at the
end.

I noticed this problem while working on a hack to disable offlining a
particular CPU: setting the offline_disabled attribute in the device
struct isn't enough because users can easily bypass the device core.
My hack isn't a valid use case, but it did highlight the inconsistency
in the way cpus are being onlined/offlined, and this attempt hopefully
improves on this.

The first 8 patches fix arch users.

The remaining 6 patches fix generic code users. Particularly creating a new
special exported API for the device core to use instead of cpu_up/down.

The last patch removes cpu_up/down from cpu.h and unexport the functions.

In some cases where the use of cpu_up/down seemed legitimate, I encapsulated
the logic in a higher-level, special-purpose function and converted the code
to use that instead.

I re-ran the rcu torture, lock torture and psci checker tests and no
problems were noticed. I performed build tests on all affected arches except
for parisc.

Hopefully I got the CC list right for all the patches. Apologies in advance if
some people were omitted from some patches but they should have been CCed.

CC: Armijn Hemel 
CC: Benjamin Herrenschmidt 
CC: Bjorn Helgaas 
CC: Borislav Petkov 
CC: Boris Ostrovsky 
CC: Catalin Marinas 
CC: Christophe Leroy 
CC: Daniel Lezcano 
CC: Davidlohr Bueso 
CC: "David S. Miller" 
CC: Eiichi Tsukata 
CC: Enrico Weigelt 
CC: Fenghua Yu 
CC: Greg Kroah-Hartman 
CC: Helge Deller 
CC: "H. Peter Anvin" 
CC: Ingo Molnar 
CC: "James E.J. Bottomley" 
CC: James Morse 
CC: Jiri Kosina 
CC: Josh Poimboeuf 
CC: Josh Triplett 
CC: Juergen Gross 
CC: Lorenzo Pieralisi 
CC: Mark Rutland 
CC: Michael Ellerman 
CC: Nadav Amit 
CC: Nicholas Piggin 
CC: "Paul E. McKenney" 
CC: Paul Mackerras 
CC: Pavankumar Kondeti 
CC: "Peter Zijlstra (Intel)" 
CC: "Rafael J. Wysocki" 
CC: Ram Pai 
CC: Richard Fontana 
CC: Russell King 
CC: Sakari Ailus 
CC: Stefano Stabellini 
CC: Steve Capper 
CC: Thiago Jung Bauermann 
CC: Thomas Gleixner 
CC: Tony Luck 
CC: Will Deacon 
CC: Zhenzhong Duan 
CC: linux-arm-ker...@lists.infradead.org
CC: linux-i...@vger.kernel.org
CC: linux-ker...@vger.kernel.org
CC: linux-par...@vger.kernel.org
CC: linuxppc-dev@lists.ozlabs.org
CC: sparcli...@vger.kernel.org
CC: x...@kernel.org
CC: xen-de...@lists.xenproject.org


Qais Yousef (14):
  smp: create a new function to shutdown nonboot cpus
  ia64: Replace cpu_down with smp_shutdown_nonboot_cpus()
  arm: arm64: Don't use disable_nonboot_cpus()
  arm64: hibernate.c: create a new function to handle cpu_up(sleep_cpu)
  x86: Replace cpu_up/down with device_online/offline
  powerpc: Replace cpu_up/down with device_online/offline
  sparc: Replace cpu_up/down with device_online/offline
  parisc: Replace cpu_up/down with device_online/offline
  driver: base: cpu: export device_online/offline
  driver: xen: Replace cpu_up/down with device_online/offline
  firmware: psci: Replace cpu_up/down with device_online/offline
  torture: Replace cpu_up/down with device_online/offline
  smp: Create a new function to bringup nonboot cpus online
  cpu: Hide cpu_up/down

 arch/arm/kernel/reboot.c   |  4 +-
 arch/arm64/kernel/hibernate.c  | 13 ++--
 arch/arm64/kernel/process.c|  4 +-
 arch/ia64/kernel/process.c |  8 +--
 arch/parisc/kernel/processor.c |  4 +-
 arch/powerpc/kernel/machine_kexec_64.c |  4 +-
 arch/sparc/kernel/ds.c |  8 ++-
 arch/x86/kernel/topology.c |  4 +-
 arch/x86/mm/mmio-mod.c |  8 ++-
 arch/x86/xen/smp.c |  4 +-
 drivers/base/core.c|  4 ++
 drivers/base/cpu.c |  4 +-
 drivers/firmware/psci/psci_checker.c   |  6 +-
 drivers/xen/cpu_hotplug.c  |  2 +-
 include/linux/cpu.h|  8 ++-
 kernel/cpu.c

Re: [PATCH] selftests/powerpc: spectre_v2 test must be built 64-bit

2019-11-25 Thread Michael Ellerman
On Wed, 2019-11-20 at 02:39:24 UTC, Michael Ellerman wrote:
> The spectre_v2 test must be built 64-bit, it includes hand-written asm
> that is 64-bit only, and segfaults if built 32-bit.
> 
> Fixes: c790c3d2b0ec ("selftests/powerpc: Add a test of spectre_v2 
> mitigations")
> Signed-off-by: Michael Ellerman 

Applied to powerpc next.

https://git.kernel.org/powerpc/c/bf9c95e23324cbaf2e58fc3f0cbdc73137f2d1ca

cheers


Re: [PATCH v2] powerpc/powernv: Disable native PCIe port management

2019-11-25 Thread Michael Ellerman
On Mon, 2019-11-18 at 06:55:53 UTC, Oliver O'Halloran wrote:
> On PowerNV the PCIe topology is (currently) managed by the powernv platform
> code in Linux in cooperation with the platform firmware. Linux's native
> PCIe port service drivers operate independently of both and this can cause
> problems.
> 
> The main issue is that the portbus driver will conflict with the platform
> specific hotplug driver (pnv_php) over ownership of the MSI used to notify
> the host when a hotplug event occurs. The portbus driver claims this MSI on
> behalf of the individual port services because the same interrupt is used
> for hotplug events, PMEs (on root ports), and link bandwidth change
> notifications. The portbus driver will always claim the interrupt even if
> the individual port service drivers, such as pciehp, are compiled out.
> 
> The second, bigger, problem is that the hotplug port service driver
> fundamentally does not work on PowerNV. The platform assumes that all
> PCI devices have a corresponding arch-specific handle derived from the DT
> node for the device (pci_dn) and without one the platform will not allow
> a PCI device to be enabled. This problem is largely due to historical
> baggage, but it can't be resolved without significant re-factoring of the
> platform PCI support.
> 
> We can fix these problems in the interim by setting the
> "pcie_ports_disabled" flag during platform initialisation. The flag
> indicates the platform owns the PCIe ports which stops the portbus driver
> from being registered.
> 
> This does have the side effect of disabling all port services drivers
> that is: AER, PME, BW notifications, hotplug, and DPC. However, this is
> not a huge disadvantage on PowerNV since these services are either unused
> or handled through other means.
> 
> Cc: Sergey Miroshnichenko 
> Fixes: 66725152fb9f ("PCI/hotplug: PowerPC PowerNV PCI hotplug driver")
> Signed-off-by: Oliver O'Halloran 
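
For reference, the mechanism described above amounts to setting the flag
during platform PCI initialisation, along these lines (the exact hook
within the powernv setup code is an assumption here):

	/* Platform firmware owns the PCIe ports: keep the portbus driver
	 * from registering and claiming the hotplug MSI. */
	pcie_ports_disabled = true;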

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/9d72dcef891030545f39ad386a30cf91df517fb2

cheers


Re: [PATCH] powerpc/sysdev: drop simple gpio

2019-11-25 Thread Michael Ellerman
On Thu, 2019-10-31 at 13:47:30 UTC, Christophe Leroy wrote:
> There is a config item CONFIG_SIMPLE_GPIO which
> provides simple memory mapped GPIOs specific to powerpc.
> 
> However, the only platform which selects this option is
> mpc5200, and this platform doesn't use it.
> 
> There are three boards calling simple_gpiochip_init(), but
> as they don't select CONFIG_SIMPLE_GPIO, this is just a nop.
> 
> Simple_gpio is just redundant with the generic MMIO GPIO
> driver which can be found in drivers/gpio/ and selected via
> CONFIG_GPIO_GENERIC_PLATFORM, so drop simple_gpio driver.
> 
> Signed-off-by: Christophe Leroy 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/8795a739e5c72abeec51caf36b6df2b37e5720c5

cheers


Re: [PATCH v4 1/2] powerpc/32: Split kexec low level code out of misc_32.S

2019-11-25 Thread Michael Ellerman
On Tue, 2019-10-29 at 12:13:57 UTC, Christophe Leroy wrote:
> Almost half of misc_32.S is dedicated to kexec.
> That's the relocation function for kexec.
> 
> Drop it into a dedicated kexec_relocate_32.S
> 
> Signed-off-by: Christophe Leroy 

Series applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/9f7bd9201521b3ad11e96887550dd3e835ba01cb

cheers


Re: [PATCH v3 1/2] powerpc/32s: automatically allocate BAT in setbat()

2019-11-25 Thread Michael Ellerman
On Mon, 2019-09-16 at 20:25:39 UTC, Christophe Leroy wrote:
> If no BAT is given to setbat(), select an available BAT.
> 
> Signed-off-by: Christophe Leroy 

Series applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/cbcaff7d27ad5c5d2c2db113ec489be88adb815a

cheers


Re: [PATCH v1 4/4] powerpc/ioremap: warn on early use of ioremap()

2019-11-25 Thread Michael Ellerman
On Thu, 2019-09-12 at 13:49:44 UTC, Christophe Leroy wrote:
> Powerpc now has EARLY_IOREMAP.
> 
> Next step is to convert all early users of ioremap() to
> early_ioremap().
> 
> Add a warning to help locate those users.
> 
> Signed-off-by: Christophe Leroy 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/d538aadc2718a95bfd80095c66ea814824535b34

cheers


Re: [PATCH v1 3/4] powerpc: Add support for GENERIC_EARLY_IOREMAP

2019-11-25 Thread Michael Ellerman
On Thu, 2019-09-12 at 13:49:43 UTC, Christophe Leroy wrote:
> Add support for GENERIC_EARLY_IOREMAP.
> 
> Let's define 16 slots of 256Kbytes each for early ioremap.
> 
> Signed-off-by: Christophe Leroy 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/265c3491c4bc8d40587996d6ee2f447a7ccfb4f3

cheers


Re: [PATCH v1 2/4] powerpc/fixmap: Use __fix_to_virt() instead of fix_to_virt()

2019-11-25 Thread Michael Ellerman
On Thu, 2019-09-12 at 13:49:42 UTC, Christophe Leroy wrote:
> Modify back __set_fixmap() to using __fix_to_virt() instead
> of fix_to_virt() otherwise the following happens because it
> seems GCC doesn't see idx as a builtin const.
> 
>   CC  mm/early_ioremap.o
> In file included from ./include/linux/kernel.h:11:0,
>  from mm/early_ioremap.c:11:
> In function ‘fix_to_virt’,
> inlined from ‘__set_fixmap’ at 
> ./arch/powerpc/include/asm/fixmap.h:87:2,
> inlined from ‘__early_ioremap’ at mm/early_ioremap.c:156:4:
> ./include/linux/compiler.h:350:38: error: call to 
> ‘__compiletime_assert_32’ declared with attribute error: BUILD_BUG_ON 
> failed: idx >= __end_of_fixed_addresses
>   _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
>   ^
> ./include/linux/compiler.h:331:4: note: in definition of macro 
> ‘__compiletime_assert’
> prefix ## suffix();\
> ^
> ./include/linux/compiler.h:350:2: note: in expansion of macro 
> ‘_compiletime_assert’
>   _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
>   ^
> ./include/linux/build_bug.h:39:37: note: in expansion of macro 
> ‘compiletime_assert’
>  #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
>  ^
> ./include/linux/build_bug.h:50:2: note: in expansion of macro 
> ‘BUILD_BUG_ON_MSG’
>   BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition)
>   ^
> ./include/asm-generic/fixmap.h:32:2: note: in expansion of macro 
> ‘BUILD_BUG_ON’
>   BUILD_BUG_ON(idx >= __end_of_fixed_addresses);
>   ^
> 
> Signed-off-by: Christophe Leroy 
> Fixes: 4cfac2f9c7f1 ("powerpc/mm: Simplify __set_fixmap()")

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/77693a5fb57be4606a6024ec8e3076f9499b906b

cheers


Re: [PATCH] powerpc/8xx: use the fixmapped IMMR in cpm_reset()

2019-11-25 Thread Michael Ellerman
On Thu, 2019-09-12 at 13:29:07 UTC, Christophe Leroy wrote:
> Since commit f86ef74ed919 ("powerpc/8xx: Fix vaddr for IMMR early
> remap"), the IMMR area has been mapped at startup with fixmap.
> 
> Use that fixmap directly instead of calling ioremap(), this
> avoids calling ioremap() early before the slab is available.
> 
> Signed-off-by: Christophe Leroy 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/eafd687e689acd99d780e468d6a0622f4694d0bc

cheers


Re: [PATCH] powerpc/8xx: add __init to cpm1 init functions

2019-11-25 Thread Michael Ellerman
On Thu, 2019-09-12 at 13:22:55 UTC, Christophe Leroy wrote:
> Functions cpm1_clk_setup(), cpm1_set_pin(), cpm_pic_init() and
> mpc8xx_pic_init() are only called from __init functions, so mark
> them __init as well.
> 
> Signed-off-by: Christophe Leroy 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/132f92fdc42782fd297e076ef74bedeb8ce774e4

cheers


Re: [PATCH v3 1/2] powerpc: permanently include 8xx registers in reg.h

2019-11-25 Thread Michael Ellerman
On Thu, 2019-08-29 at 08:45:12 UTC, Christophe Leroy wrote:
> Most 8xx registers have specific names, so just include
> reg_8xx.h all the time in reg.h in order to have them defined
> even when CONFIG_PPC_8xx is not selected. This will avoid
> the need for #ifdefs in C code.
> 
> Guard SPRN_ICTRL in an #ifdef CONFIG_PPC_8xx as this register
> has same name but different meaning and different spr number as
> another register in the mpc7450.
> 
> Signed-off-by: Christophe Leroy 

Series applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/8640dd0df98891c5ea4695d89a4972cb4c1f

cheers


Re: [PATCH] powerpc/reg: use ASM_FTR_IFSET() instead of opencoding fixup.

2019-11-25 Thread Michael Ellerman
On Wed, 2019-08-28 at 13:42:01 UTC, Christophe Leroy wrote:
> mftb() includes a feature fixup for CELL ppc.
> 
> Use ASM_FTR_IFSET() macro instead of opencoding the setup
> of the fixup sections.
> 
> Signed-off-by: Christophe Leroy 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/b06174345f6e70200916136695514e0b6b95ac17

cheers


Re: [PATCH] powerpc/32: Don't populate page tables for block mapped pages except on the 8xx.

2019-11-25 Thread Michael Ellerman
On Fri, 2019-08-23 at 09:56:21 UTC, Christophe Leroy wrote:
> Commit d2f15e0979ee ("powerpc/32: always populate page tables for
> Abatron BDI.") wrongly sets page tables for any PPC32 for using BDI,
> and does't update them after init (remove RX on init section, set
> text and rodata read-only)
> 
> Only the 8xx requires page tables to be populated for using the BDI.
> They also need to be populated in order to see the mappings in
> /sys/kernel/debug/kernel_page_tables
> 
> On BOOK3S_32, pages that are not mapped by page tables are mapped
> by BATs. The BDI knows BATs and they can be viewed in
> /sys/kernel/debug/powerpc/block_address_translation
> 
> Only set pagetables for RAM and IMMR on the 8xx and properly update
> them at the end of init.
> 
> Signed-off-by: Christophe Leroy 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/a2227a27774328507a5c2335a6dd600c079d1ff5

cheers


Re: [PATCH] powerpc/mm: tell if a bad page fault on data is read or write.

2019-11-25 Thread Michael Ellerman
On Wed, 2019-08-21 at 15:21:55 UTC, Christophe Leroy wrote:
> DSISR has a bit to tell if the fault is due to a read or a write.
> 
> Display it.
> 
> Signed-off-by: Christophe Leroy 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/46ddcb3950a28c0df4815e8dbb8d4b91d5d9f22d

cheers


Re: [PATCH] powerpc/mm: drop #ifdef CONFIG_MMU in is_ioremap_addr()

2019-11-25 Thread Michael Ellerman
On Wed, 2019-08-21 at 10:13:32 UTC, Christophe Leroy wrote:
> powerpc always selects CONFIG_MMU and CONFIG_MMU is not checked
> anywhere else in powerpc code.
> 
> Drop the #ifdef and the alternative part of is_ioremap_addr()
> 
> Fixes: 9bd3bb6703d8 ("mm/nvdimm: add is_ioremap_addr and use that to check 
> ioremap address")
> Signed-off-by: Christophe Leroy 
> Cc: Aneesh Kumar K.V 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/c4028fa2daa059ac9231ab3a4f57cbae814b3625

cheers


Re: [PATCH 2/3] powerpc: refactoring BUG/WARN macros

2019-11-25 Thread Michael Ellerman
On Mon, 2019-08-19 at 13:06:30 UTC, Christophe Leroy wrote:
> BUG(), WARN() and friends are using a similar inline
> assembly to implement various traps with various flags.
> 
> Lets refactor via a new BUG_ENTRY() macro.
> 
> Signed-off-by: Christophe Leroy 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/43f003bb74b9b27da6e719cfc2f7630f5652665a

cheers


Re: [PATCH v4 2/2] powerpc/irq: inline call_do_irq() and call_do_softirq()

2019-11-25 Thread Michael Ellerman
Segher Boessenkool  writes:
> On Thu, Nov 21, 2019 at 05:14:45PM +1100, Michael Ellerman wrote:
>> Christophe Leroy  writes:
>> That breaks 64-bit with GCC9:
>> 
>>   arch/powerpc/kernel/irq.c: In function 'do_IRQ':
>>   arch/powerpc/kernel/irq.c:650:2: error: PIC register clobbered by 'r2' in 
>> 'asm'
>> 650 |  asm volatile(
>> |  ^~~
>>   arch/powerpc/kernel/irq.c: In function 'do_softirq_own_stack':
>>   arch/powerpc/kernel/irq.c:711:2: error: PIC register clobbered by 'r2' in 
>> 'asm'
>> 711 |  asm volatile(
>> |  ^~~
>> 
>> 
>> > diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
>> > index 04204be49577..d62fe18405a0 100644
>> > --- a/arch/powerpc/kernel/irq.c
>> > +++ b/arch/powerpc/kernel/irq.c
>> > @@ -642,6 +642,22 @@ void __do_irq(struct pt_regs *regs)
>> >irq_exit();
>> >  }
>> >  
>> > +static inline void call_do_irq(struct pt_regs *regs, void *sp)
>> > +{
>> > +  register unsigned long r3 asm("r3") = (unsigned long)regs;
>> > +
>> > +  /* Temporarily switch r1 to sp, call __do_irq() then restore r1 */
>> > +  asm volatile(
>> > +  "   "PPC_STLU"  1, %2(%1);\n"
>> > +  "   mr  1, %1;\n"
>> > +  "   bl  %3;\n"
>> > +  "   "PPC_LL"1, 0(1);\n" :
>> > +  "+r"(r3) :
>> > +  "b"(sp), "i"(THREAD_SIZE - STACK_FRAME_OVERHEAD), "i"(__do_irq) 
>> > :
>> > +  "lr", "xer", "ctr", "memory", "cr0", "cr1", "cr5", "cr6", "cr7",
>> > +  "r0", "r2", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", 
>> > "r12");
>> > +}
>> 
>> If we add a nop after the bl, so the linker could insert a TOC restore,
>> then I don't think there's any circumstance under which we expect this
>> to actually clobber r2, is there?
>
> That is mostly correct.

That's the standard I aspire to :P

> If call_do_irq was a no-inline function, there would not be problems.
>
> What TOC does __do_irq require in r2 on entry, and what will be there
> when it returns?

The kernel TOC, and also the kernel TOC, unless something's gone wrong
or I'm missing something.

cheers


Re: Bug 205201 - Booting halts if Dawicontrol DC-2976 UW SCSI board installed, unless RAM size limited to 3500M

2019-11-25 Thread Mike Rapoport
On Mon, Nov 25, 2019 at 08:39:23AM +0100, Christoph Hellwig wrote:
> On Sat, Nov 23, 2019 at 12:42:27PM +0100, Christian Zigotzky wrote:
> > Hello Christoph,
> >
> > Please find attached the dmesg of your Git kernel.
> 
> Thanks.  It looks like on your platform the swiotlb buffer isn't
> actually addressable based on the bus dma mask limit, which is rather
> interesting.  swiotlb_init uses memblock_alloc_low to allocate the
> buffer, and I'll need some help from Mike and the powerpc maintainers
> to figure out how that selects where to allocate the buffer from, and
> how we can move it to a lower address.  My gut feeling would be to try
> to do what arm64 does and define a new ARCH_LOW_ADDRESS_LIMIT, preferably
> without needing too much arch specific magic.

Presuming the problem is relevant for all CoreNet boards something like
this could work:
 
diff --git a/arch/powerpc/include/asm/dma.h b/arch/powerpc/include/asm/dma.h
index 1b4f0254868f..7c6cfeeaff52 100644
--- a/arch/powerpc/include/asm/dma.h
+++ b/arch/powerpc/include/asm/dma.h
@@ -347,5 +347,11 @@ extern int isa_dma_bridge_buggy;
 #define isa_dma_bridge_buggy   (0)
 #endif
 
+#ifdef CONFIG_CORENET_GENERIC
+extern phys_addr_t ppc_dma_phys_limit;
+#define ARCH_LOW_ADDRESS_LIMIT (ppc_dma_phys_limit - 1)
+#endif
+
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_DMA_H */
diff --git a/arch/powerpc/platforms/85xx/common.c b/arch/powerpc/platforms/85xx/common.c
index fe0606439b5a..346b436b6d3f 100644
--- a/arch/powerpc/platforms/85xx/common.c
+++ b/arch/powerpc/platforms/85xx/common.c
@@ -126,3 +126,7 @@ void __init mpc85xx_qe_par_io_init(void)
}
 }
 #endif
+
+#ifdef CONFIG_CORENET_GENERIC
+phys_addr_t ppc_dma_phys_limit = 0xffffffffUL;
+#endif
diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c b/arch/powerpc/platforms/85xx/corenet_generic.c
index 7ee2c6628f64..673bcbdc7c75 100644
--- a/arch/powerpc/platforms/85xx/corenet_generic.c
+++ b/arch/powerpc/platforms/85xx/corenet_generic.c
@@ -64,7 +64,7 @@ void __init corenet_gen_setup_arch(void)
mpc85xx_smp_init();
 
swiotlb_detect_4g();
-
+   ppc_dma_phys_limit = 0x0fffffffUL;
pr_info("%s board\n", ppc_md.name);
 
mpc85xx_qe_init();

> As a quick hack can you try this patch on top of the tree from Friday?
> 
> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> index f491690d54c6..e3f95c362922 100644
> --- a/include/linux/memblock.h
> +++ b/include/linux/memblock.h
> @@ -344,7 +344,7 @@ static inline int memblock_get_region_node(const struct memblock_region *r)
>  #define MEMBLOCK_LOW_LIMIT 0
>  
>  #ifndef ARCH_LOW_ADDRESS_LIMIT
> -#define ARCH_LOW_ADDRESS_LIMIT  0xffffffffUL
> +#define ARCH_LOW_ADDRESS_LIMIT  0x0fffffffUL
>  #endif
>  
>  phys_addr_t memblock_phys_alloc_range(phys_addr_t size, phys_addr_t align,

-- 
Sincerely yours,
Mike.



[PATCH] powerpc/32: drop unused ISA_DMA_THRESHOLD

2019-11-25 Thread Mike Rapoport
From: Mike Rapoport 

The ISA_DMA_THRESHOLD variable is set by several platforms but never
referenced.
Remove it.

Signed-off-by: Mike Rapoport 
---
 arch/powerpc/include/asm/dma.h  | 3 +--
 arch/powerpc/kernel/setup_32.c  | 1 -
 arch/powerpc/platforms/44x/warp.c   | 3 ---
 arch/powerpc/platforms/52xx/efika.c | 1 -
 arch/powerpc/platforms/amigaone/setup.c | 1 -
 arch/powerpc/platforms/chrp/setup.c | 1 -
 arch/powerpc/platforms/powermac/setup.c | 1 -
 7 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/dma.h b/arch/powerpc/include/asm/dma.h
index 1b4f0254868f..6161a9596196 100644
--- a/arch/powerpc/include/asm/dma.h
+++ b/arch/powerpc/include/asm/dma.h
@@ -151,10 +151,9 @@
 #define DMA2_EXT_REG   0x4D6
 
 #ifndef __powerpc64__
-/* in arch/ppc/kernel/setup.c -- Cort */
+/* in arch/powerpc/kernel/setup_32.c -- Cort */
 extern unsigned int DMA_MODE_WRITE;
 extern unsigned int DMA_MODE_READ;
-extern unsigned long ISA_DMA_THRESHOLD;
 #else
 #define DMA_MODE_READ  0x44    /* I/O to memory, no autoinit, increment, single mode */
 #define DMA_MODE_WRITE 0x48    /* memory to I/O, no autoinit, increment, single mode */
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index a7541edf0cdb..e019f450cf9a 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -57,7 +57,6 @@ EXPORT_SYMBOL_GPL(boot_cpuid_phys);
 int smp_hw_index[NR_CPUS];
 EXPORT_SYMBOL(smp_hw_index);
 
-unsigned long ISA_DMA_THRESHOLD;
 unsigned int DMA_MODE_READ;
 unsigned int DMA_MODE_WRITE;
 
diff --git a/arch/powerpc/platforms/44x/warp.c b/arch/powerpc/platforms/44x/warp.c
index 6620b64e4963..665f18e37efb 100644
--- a/arch/powerpc/platforms/44x/warp.c
+++ b/arch/powerpc/platforms/44x/warp.c
@@ -43,9 +43,6 @@ static int __init warp_probe(void)
if (!of_machine_is_compatible("pika,warp"))
return 0;
 
-   /* For arch_dma_alloc */
-   ISA_DMA_THRESHOLD = ~0L;
-
return 1;
 }
 
diff --git a/arch/powerpc/platforms/52xx/efika.c b/arch/powerpc/platforms/52xx/efika.c
index 61538869e88a..4514a6f7458a 100644
--- a/arch/powerpc/platforms/52xx/efika.c
+++ b/arch/powerpc/platforms/52xx/efika.c
@@ -205,7 +205,6 @@ static int __init efika_probe(void)
if (strcmp(model, "EFIKA5K2"))
return 0;
 
-   ISA_DMA_THRESHOLD = ~0L;
DMA_MODE_READ = 0x44;
DMA_MODE_WRITE = 0x48;
 
diff --git a/arch/powerpc/platforms/amigaone/setup.c b/arch/powerpc/platforms/amigaone/setup.c
index ea5e45e32683..f5d0bf999759 100644
--- a/arch/powerpc/platforms/amigaone/setup.c
+++ b/arch/powerpc/platforms/amigaone/setup.c
@@ -146,7 +146,6 @@ static int __init amigaone_probe(void)
 */
cur_cpu_spec->cpu_features &= ~CPU_FTR_NEED_COHERENT;
 
-   ISA_DMA_THRESHOLD = 0x00ff;
DMA_MODE_READ = 0x44;
DMA_MODE_WRITE = 0x48;
 
diff --git a/arch/powerpc/platforms/chrp/setup.c b/arch/powerpc/platforms/chrp/setup.c
index fcf6f2342ef4..590d292f47e6 100644
--- a/arch/powerpc/platforms/chrp/setup.c
+++ b/arch/powerpc/platforms/chrp/setup.c
@@ -573,7 +573,6 @@ static int __init chrp_probe(void)
if (strcmp(dtype, "chrp"))
return 0;
 
-   ISA_DMA_THRESHOLD = ~0L;
DMA_MODE_READ = 0x44;
DMA_MODE_WRITE = 0x48;
 
diff --git a/arch/powerpc/platforms/powermac/setup.c b/arch/powerpc/platforms/powermac/setup.c
index c6d5333729ed..95fb4feb6ccc 100644
--- a/arch/powerpc/platforms/powermac/setup.c
+++ b/arch/powerpc/platforms/powermac/setup.c
@@ -586,7 +586,6 @@ static int __init pmac_probe(void)
 
 #ifdef CONFIG_PPC32
/* isa_io_base gets set in pmac_pci_init */
-   ISA_DMA_THRESHOLD = ~0L;
DMA_MODE_READ = 1;
DMA_MODE_WRITE = 2;
 #endif /* CONFIG_PPC32 */
-- 
2.24.0



Re: [PATCH 17/19] powerpc: book3s64: convert to pin_user_pages() and put_user_page()

2019-11-25 Thread Jan Kara
On Sun 24-11-19 20:20:09, John Hubbard wrote:
> 1. Convert from get_user_pages() to pin_user_pages().
> 
> 2. As required by pin_user_pages(), release these pages via
> put_user_page(). In this case, do so via put_user_pages_dirty_lock().
> 
> That has the side effect of calling set_page_dirty_lock(), instead
> of set_page_dirty(). This is probably more accurate.
> 
> As Christoph Hellwig put it, "set_page_dirty() is only safe if we are
> dealing with a file backed page where we have reference on the inode it
> hangs off." [1]
> 
> 3. Release each page in mem->hpages[] (instead of mem->hpas[]), because
> that is the array that pin_user_pages() filled in. This is more
> accurate and should be a little safer from a maintenance point of
> view.

Except that this breaks the code. hpages is unioned with hpas...
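
For anyone reading along, the union in question looks roughly like this in
iommu_api.c (quoting from memory, so treat it as a sketch):

	struct mm_iommu_table_group_mem_t {
		...
		union {
			struct page **hpages;	/* page pointers, only valid
						 * while pinning */
			phys_addr_t *hpas;	/* same storage: rewritten in
						 * place to physical addresses
						 * plus flag bits */
		};
		...
	};

By the time mm_iommu_unpin() runs, every entry has been converted, so
hpages[i] no longer holds a struct page pointer.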

> [1] https://lore.kernel.org/r/20190723153640.gb...@lst.de
> 
> Signed-off-by: John Hubbard 
> ---
>  arch/powerpc/mm/book3s64/iommu_api.c | 12 +---
>  1 file changed, 5 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c
> index 56cc84520577..196383e8e5a9 100644
> --- a/arch/powerpc/mm/book3s64/iommu_api.c
> +++ b/arch/powerpc/mm/book3s64/iommu_api.c
> @@ -103,7 +103,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
>   for (entry = 0; entry < entries; entry += chunk) {
>   unsigned long n = min(entries - entry, chunk);
>  
> - ret = get_user_pages(ua + (entry << PAGE_SHIFT), n,
> + ret = pin_user_pages(ua + (entry << PAGE_SHIFT), n,
>   FOLL_WRITE | FOLL_LONGTERM,
>   mem->hpages + entry, NULL);
>   if (ret == n) {
> @@ -167,9 +167,8 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
>   return 0;
>  
>  free_exit:
> - /* free the reference taken */
> - for (i = 0; i < pinned; i++)
> - put_page(mem->hpages[i]);
> + /* free the references taken */
> + put_user_pages(mem->hpages, pinned);
>  
>   vfree(mem->hpas);
>   kfree(mem);
> @@ -212,10 +211,9 @@ static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
>   if (!page)
>   continue;
>  
> - if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY)
> - SetPageDirty(page);
> + put_user_pages_dirty_lock(&mem->hpages[i], 1,
> +   MM_IOMMU_TABLE_GROUP_PAGE_DIRTY);

And the dirtying condition is wrong here as well. Currently it is always
true.
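
Reusing the page pointer already computed from hpas[] a few lines up, I'd
expect something like this instead (untested):

	put_user_pages_dirty_lock(&page, 1,
			mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY);

i.e. the flag needs to be tested, not passed as a constant.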

Honza
-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH 07/19] mm/gup: introduce pin_user_pages*() and FOLL_PIN

2019-11-25 Thread kbuild test robot
Hi John,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on rdma/for-next]
[cannot apply to v5.4 next-20191122]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/John-Hubbard/pin_user_pages-reduced-risk-series-for-Linux-5-5/20191125-125637
base:   https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git for-next
config: arm-randconfig-a001-20191125 (attached as .config)
compiler: arm-linux-gnueabi-gcc (GCC) 7.4.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=7.4.0 make.cross ARCH=arm

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot 

All errors (new ones prefixed by >>):

   mm/gup.o: In function `pin_user_pages_remote':
>> mm/gup.c:2528: undefined reference to `get_user_pages_remote'

vim +2528 mm/gup.c

   2507	
   2508	/**
   2509	 * pin_user_pages_remote() - pin pages of a remote process (task != current)
   2510	 *
   2511	 * For now, this is a placeholder function, until various call sites are
   2512	 * converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
   2513	 * this is identical to get_user_pages_remote().
   2514	 *
   2515	 * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
   2516	 * is NOT intended for Case 2 (RDMA: long-term pins).
   2517	 */
   2518	long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
   2519				   unsigned long start, unsigned long nr_pages,
   2520				   unsigned int gup_flags, struct page **pages,
   2521				   struct vm_area_struct **vmas, int *locked)
   2522	{
   2523		/*
   2524		 * This is a placeholder, until the pin functionality is activated.
   2525		 * Until then, just behave like the corresponding get_user_pages*()
   2526		 * routine.
   2527		 */
 > 2528		return get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags, pages,
   2529					     vmas, locked);
   2530	}
   2531	EXPORT_SYMBOL(pin_user_pages_remote);
   2532	

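At a guess the randconfig here has CONFIG_MMU=n: mm/gup.c is built for
nommu configs as well these days, but get_user_pages_remote() is only
defined in the CONFIG_MMU part of the file. So the new wrapper probably
needs to live inside the same guard, i.e. something like (untested):

	#ifdef CONFIG_MMU
	long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
				   unsigned long start, unsigned long nr_pages,
				   unsigned int gup_flags, struct page **pages,
				   struct vm_area_struct **vmas, int *locked)
	{
		/* placeholder: behave like get_user_pages_remote() for now */
		return get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags,
					     pages, vmas, locked);
	}
	EXPORT_SYMBOL(pin_user_pages_remote);
	#endif /* CONFIG_MMU */
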
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org Intel Corporation

