[PATCH 1/2] powerpc/tracing: Trace TLBIE(L)

2017-04-10 Thread Balbir Singh
Just a quick patch to trace tlbie(l)'s. The idea being that it can be
enabled when we suspect corruption or when we need to see if we are doing
the right thing during flush. I think the format can be enhanced to
make it nicer (expand the RB/RS/IS/L cases in more detail if we ever
need that level of details).

A typical trace might look like this

<...>-5141  [062]  1354.486693: tlbie:
tlbie with lpid 0, local 0, rb=7b5d0ff874f11f1, rs=0, ric=0 prs=0 r=0
systemd-udevd-2584  [018]  1354.486772: tlbie:
tlbie with lpid 0, local 0, rb=17be1f421adc10c1, rs=0, ric=0 prs=0 r=0
...

qemu-system-ppc-5371  [016]  1412.369519: tlbie:
tlbie with lpid 0, local 1, rb=67bd8900174c11c1, rs=0, ric=0 prs=0 r=0
qemu-system-ppc-5377  [056]  1421.687262: tlbie:
tlbie with lpid 1, local 0, rb=5f04edffa00c11c1, rs=1, ric=0 prs=0 r=0

Signed-off-by: Balbir Singh 
---

 NOTES: Did not implement the count suggestion to see individual flushes
 The calls are always outside of ptesync, except for when in a loop in do_tlbies

 arch/powerpc/include/asm/trace.h| 33 +
 arch/powerpc/kvm/book3s_hv_rm_mmu.c | 11 +--
 arch/powerpc/mm/hash_native_64.c|  3 +++
 arch/powerpc/mm/tlb-radix.c |  9 +
 4 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h
index c05cef6..18f168a 100644
--- a/arch/powerpc/include/asm/trace.h
+++ b/arch/powerpc/include/asm/trace.h
@@ -168,6 +168,39 @@ TRACE_EVENT(hash_fault,
  __entry->addr, __entry->access, __entry->trap)
 );
 
+
+TRACE_EVENT(tlbie,
+
+   TP_PROTO(unsigned long lpid, unsigned long local, unsigned long rb,
+   unsigned long rs, unsigned long ric, unsigned long prs,
+   unsigned long r),
+   TP_ARGS(lpid, local, rb, rs, ric, prs, r),
+   TP_STRUCT__entry(
+   __field(unsigned long, lpid)
+   __field(unsigned long, local)
+   __field(unsigned long, rb)
+   __field(unsigned long, rs)
+   __field(unsigned long, ric)
+   __field(unsigned long, prs)
+   __field(unsigned long, r)
+   ),
+
+   TP_fast_assign(
+   __entry->lpid = lpid;
+   __entry->local = local;
+   __entry->rb = rb;
+   __entry->rs = rs;
+   __entry->ric = ric;
+   __entry->prs = prs;
+   __entry->r = r;
+   ),
+
+   TP_printk("lpid=%ld, local=%ld, rb=0x%lx, rs=0x%lx, ric=0x%lx, "
+   "prs=0x%lx, r=0x%lx", __entry->lpid, __entry->local,
+   __entry->rb, __entry->rs, __entry->ric, __entry->prs,
+   __entry->r)
+);
+
 #endif /* _TRACE_POWERPC_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index ce6f212..584c74c 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -15,6 +15,7 @@
 #include 
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -443,17 +444,23 @@ static void do_tlbies(struct kvm *kvm, unsigned long 
*rbvalues,
cpu_relax();
if (need_sync)
asm volatile("ptesync" : : : "memory");
-   for (i = 0; i < npages; ++i)
+   for (i = 0; i < npages; ++i) {
asm volatile(PPC_TLBIE_5(%0,%1,0,0,0) : :
 "r" (rbvalues[i]), "r" (kvm->arch.lpid));
+   trace_tlbie(kvm->arch.lpid, 0, rbvalues[i],
+   kvm->arch.lpid, 0, 0, 0);
+   }
asm volatile("eieio; tlbsync; ptesync" : : : "memory");
kvm->arch.tlbie_lock = 0;
} else {
if (need_sync)
asm volatile("ptesync" : : : "memory");
-   for (i = 0; i < npages; ++i)
+   for (i = 0; i < npages; ++i) {
asm volatile(PPC_TLBIEL(%0,%1,0,0,0) : :
 "r" (rbvalues[i]), "r" (0));
+   trace_tlbie(kvm->arch.lpid, 1, rbvalues[i],
+   0, 0, 0, 0);
+   }
asm volatile("ptesync" : : : "memory");
}
 }
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 65bb8f3..bdaac28 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -98,6 +99,7 @@ static inline void __tlbie(unsigned long vpn, int psize, int 
apsize, int ssize)
 : "memory");
break;
}
+   trace_tlbie(0, 0, va, 0, 0, 0, 0);
 }
 
 static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int 
ssize)
@@ 

[PATCH v2] ppc64/kprobe: Fix oops when kprobed on 'stdu' instruction

2017-04-10 Thread Ravi Bangoria
If we set a kprobe on a 'stdu' instruction on powerpc64, we see a kernel 
OOPS:

  [ 1275.165932] Bad kernel stack pointer cd93c840 at c0009868
  [ 1275.166378] Oops: Bad kernel stack pointer, sig: 6 [#1]
  ...
  GPR00: c01fcd93cb30 cd93c840 c15c5e00 cd93c840
  ...
  [ 1275.178305] NIP [c0009868] resume_kernel+0x2c/0x58
  [ 1275.178594] LR [c0006208] program_check_common+0x108/0x180

Basically, on 64 bit system, when user probes on 'stdu' instruction,
kernel does not emulate actual store in emulate_step itself because it
may corrupt exception frame. So kernel does actual store operation in
exception return code i.e. resume_kernel().

resume_kernel() loads the saved stack pointer from memory using lwz,
effectively loading a corrupt (32bit) address, causing the kernel crash.

Fix this by loading the 64bit value instead.

Fixes: be96f63375a1 ("powerpc: Split out instruction analysis part of 
emulate_step()") 
Signed-off-by: Ravi Bangoria 
Reviewed-by: Naveen N. Rao  
---
History:
  Commit 8e9f69371536 ("powerpc/kprobe: Don't emulate store when kprobe
  stwu r1") fixed exception frame corruption for 32 bit system which uses
  'stwu' instruction for stack frame allocation. This commit also added
  code for 64 bit system but did not enable it for 'stdu' instruction.
  So 'stdu' instruction on 64 bit machine was emulating actual store in
  emulate_step() itself until...

  Commit be96f63375a1 ("powerpc: Split out instruction analysis part of
  emulate_step()"), enabled it for 'stdu' instruction on 64 bit machine.

  So kprobe on 'stdu' has always been broken on powerpc64.  We haven't
  noticed since most stdu operations were probably landing in the red
  zone so the exception frame never got corrupted. In that sense, this
  fix is needed for BE ever since load/store emulation was added.

  For LE, this is only getting exposed now due to my recent patch to
  enable load/store emulation on LE, which got merged as commit
  e148bd17f48b ("powerpc: Emulation support for load/store instructions
  on LE").

  Please mark this for stable as well.

Changes in v2:
  - Replace 'stwu' with 'stdu' in the comment.

 arch/powerpc/kernel/entry_64.S | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 6432d4b..767ef6d 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -689,7 +689,7 @@ resume_kernel:
 
addir8,r1,INT_FRAME_SIZE/* Get the kprobed function entry */
 
-   lwz r3,GPR1(r1)
+   ld  r3,GPR1(r1)
subir3,r3,INT_FRAME_SIZE/* dst: Allocate a trampoline exception 
frame */
mr  r4,r1   /* src:  current exception frame */
mr  r1,r3   /* Reroute the trampoline frame to r1 */
@@ -703,8 +703,8 @@ resume_kernel:
addir6,r6,8
bdnz2b
 
-   /* Do real store operation to complete stwu */
-   lwz r5,GPR1(r1)
+   /* Do real store operation to complete stdu */
+   ld  r5,GPR1(r1)
std r8,0(r5)
 
/* Clear _TIF_EMULATE_STACK_STORE flag */
-- 
1.9.3



Re: EEH error in doing DMA with PEX 8619

2017-04-10 Thread Gavin Shan
On Tue, Apr 11, 2017 at 12:15:10PM +1000, Benjamin Herrenschmidt wrote:
>On Mon, 2017-04-10 at 19:04 -0700, IanJiang wrote:
>> Thanks for your replay.
>> 
>> I fixed my test according to your suggestion. The CPU physical addresses (0x
>> 1f9e40 and 0x 1f82c0) converted with virt_to_phys() are used ,
>> instead of DMA addresses, or BUS physical addresses (0x 60a0 and 0x
>> 60c0). However, EEH still reports error.
>
>That's incorrect. The system has an IOMMU; only addresses properly
>mapped/translated by the IOMMU can be used for DMA. That is addresses
>returned by things like dma_map_single/sg, dma_alloc_coherent, etc...
>
>You also need to ensure you configure a proper dma mask.
>
>Using virt_to_phys() is NEVER correct on *any* architecture.
>

Ben is correct that PCI device needs to use DMA address instead of
memory address. I was suggesting to ensure that in the driver's code
and was misunderstood. It seems the driver has used the DMA address,
which is correct. But it seems the TCE entry for the DMA address isn't
built correctly. Could you please share the kernel log with the original
driver code? I am not sure if I can find something there, but it is
worth checking.

Thanks,
Gavin



[RFC][PATCH] powerpc/syscalls/trace: Fix mmap in syscalls_trace

2017-04-10 Thread Balbir Singh
This patch uses SYSCALL_DEFINE6 for sys_mmap and sys_mmap2
so that the meta-data associated with these syscalls is
visible to the syscall tracer. In the absence of this,
generic syscalls (defined outside arch) like munmap, etc.
are visible in available_events, but syscall_enter_mmap and
syscall_exit_mmap are not.

A side-effect of this change is that the return type has
changed from unsigned long to long.

Signed-off-by: Balbir Singh 
---
 arch/powerpc/include/asm/syscalls.h |  4 ++--
 arch/powerpc/kernel/syscalls.c  | 16 
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/syscalls.h 
b/arch/powerpc/include/asm/syscalls.h
index 23be8f1..16fab68 100644
--- a/arch/powerpc/include/asm/syscalls.h
+++ b/arch/powerpc/include/asm/syscalls.h
@@ -8,10 +8,10 @@
 
 struct rtas_args;
 
-asmlinkage unsigned long sys_mmap(unsigned long addr, size_t len,
+asmlinkage long sys_mmap(unsigned long addr, size_t len,
unsigned long prot, unsigned long flags,
unsigned long fd, off_t offset);
-asmlinkage unsigned long sys_mmap2(unsigned long addr, size_t len,
+asmlinkage long sys_mmap2(unsigned long addr, size_t len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long pgoff);
 asmlinkage long ppc64_personality(unsigned long personality);
diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c
index de04c9f..10b7fe9d 100644
--- a/arch/powerpc/kernel/syscalls.c
+++ b/arch/powerpc/kernel/syscalls.c
@@ -42,11 +42,11 @@
 #include 
 #include 
 
-static inline unsigned long do_mmap2(unsigned long addr, size_t len,
+static inline long do_mmap2(unsigned long addr, size_t len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long off, int shift)
 {
-   unsigned long ret = -EINVAL;
+   long ret = -EINVAL;
 
if (!arch_validate_prot(prot))
goto out;
@@ -62,16 +62,16 @@ static inline unsigned long do_mmap2(unsigned long addr, 
size_t len,
return ret;
 }
 
-unsigned long sys_mmap2(unsigned long addr, size_t len,
-   unsigned long prot, unsigned long flags,
-   unsigned long fd, unsigned long pgoff)
+SYSCALL_DEFINE6(mmap2, unsigned long, addr, unsigned long, len,
+   unsigned long, prot, unsigned long, flags,
+   unsigned long, fd, unsigned long, pgoff)
 {
return do_mmap2(addr, len, prot, flags, fd, pgoff, PAGE_SHIFT-12);
 }
 
-unsigned long sys_mmap(unsigned long addr, size_t len,
-  unsigned long prot, unsigned long flags,
-  unsigned long fd, off_t offset)
+SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
+   unsigned long, prot, unsigned long, flags,
+   unsigned long, fd, off_t, offset)
 {
return do_mmap2(addr, len, prot, flags, fd, offset, PAGE_SHIFT);
 }
-- 
2.9.3



Re: [PATCH] ibmveth: Support to enable LSO/CSO for Trunk VEA.

2017-04-10 Thread Sivakumar Krishnasamy

Re-sending as my earlier response had some HTML subparts.

Let me give some background before I answer your queries.

In IBM PowerVM environment, ibmveth driver supports largesend and 
checksum offload today, but only for virtual ethernet adapters (VEA) 
which are not configured in "Trunk mode".  In trunk mode, one cannot 
enable checksum and largesend offload capabilities. Without these 
offloads enabled, the performance numbers are not good. This patch is to 
enable these offloads for "Trunk" VEAs.


The following shows a typical configuration for network packet flow, 
when VMs in the PowerVM server have their network virtualized and 
communicate to external world.


VM (ibmveth) <=> PowerVM Hypervisor <=>  PowerVM I/O Server VM 
( ibmveth in "Trunk mode" <=> OVS <=> Physical NIC ) <=>  External Network


As you can see the packets originating in VM will travel through local 
ibmveth driver and then to PowerVM Hypervisor, then it gets delivered to 
ibmveth driver configured in "Trunk" mode in I/O Server, which is then 
bridged by OVS to external network via Physical NIC.  To have largesend 
and checksum offload enabled end to end, from VM up to Physical NIC, 
ibmveth needs to support these offload capabilities when configured in 
"Trunk" mode too.


Before this patch, when a VM communicates with external network (in a 
configuration similar to above), throughput numbers were not so good 
(~1.5 Gbps) and with the patch, I see ~9.4 Gbps throughput for a 10G NIC 
(iperf used for measurements).


On 4/9/2017 12:15 AM, David Miller wrote:

From: Sivakumar Krishnasamy 
Date: Fri,  7 Apr 2017 05:57:59 -0400


Enable largesend and checksum offload for ibmveth configured in trunk mode.
Added support to SKB frag_list in TX path by skb_linearize'ing such SKBs.

Signed-off-by: Sivakumar Krishnasamy 


Why is linearization necessary?

It would seem that the gains you get from GRO are nullified by
linearizing the SKB and thus copying all the data around and
allocating buffers.

When Physical NIC has GRO enabled and when OVS bridges these packets, 
OVS vport send code will end up calling dev_queue_xmit, which in turn 
calls validate_xmit_skb.


validate_xmit_skb has the below code snippet,

if (netif_needs_gso(skb, features)) {
struct sk_buff *segs;

segs = skb_gso_segment(skb, features); <=== Segments the 
GSO packet into MTU sized segments.


When the OVS outbound vport is ibmveth, netif_needs_gso returns 
positively if the SKB has a frag_list and if the driver doesn't support 
the same (NETIF_F_FRAGLIST feature).  So all the packets received by 
ibmveth are of MSS size (or lesser) due to the above code.


On a 10G physical NIC, the maximum throughput achieved was 2.2 Gbps due 
to the above segmentation in validate_xmit_skb. With the patch to 
linearize the SKB, the throughput increased to 9 Gbps (and ibmveth 
received packets without being segmented). This is ~4X improvement even 
though we end up allocating buffers and copying data.



Finally, all of that new checksumming stuff looks extremely
suspicious.  You have to explain why that is happening and why it
isn't because this driver is doing something incorrectly.

Thanks.

We are now enabling support for OVS and improving bridging performance 
in IBM's PowerVM environment, which brings in these new offload 
requirements for ibmveth driver configured in Trunk mode.


Please let me know if you need more details.

Regards,
Siva K



Re: clear_page, copy_page address align question?

2017-04-10 Thread Minchan Kim
On Tue, Apr 11, 2017 at 01:12:24PM +1000, Benjamin Herrenschmidt wrote:
> On Tue, 2017-04-11 at 12:08 +0900, Minchan Kim wrote:
> > Hello,
> > 
> > When I tested zram in ppc64, I got random corruption.
> > With investigation, it seems clear_page corrupted the memory.
> > I passed 64K kmalloced(kmalloc(PAGE_SIZE)) address to clear_page
> > and turned on slub debug so address is not aligned with PAGE_SIZE.
> > Is it a valid usecase that non-PAGE_SIZE aligned address is
> > used for clear_page in ppc64?
> > 
> > As well, copy_page have same rule, too?
> > 
> > Anyway, when I changed clear_page to memset, it seems the problem
> > is gone.
> 
> Yes, both clear_page and copy_page assume a PAGE_SHIFT alignment and
> are highly optimized accordingly.
> 
> I wouldn't be surprised if other architectures' implementations are the
> same.
> 
> I don't think it's ever legit to call these functions for something
> that isn't a naturally aligned page.


If it's common to every architecture, it would be better to
have a description about that somewhere, or a WARN_ON. :(

Thanks for the confirmation!


Re: clear_page, copy_page address align question?

2017-04-10 Thread Benjamin Herrenschmidt
On Tue, 2017-04-11 at 12:08 +0900, Minchan Kim wrote:
> Hello,
> 
> When I tested zram in ppc64, I got random corruption.
> With investigation, it seems clear_page corrupted the memory.
> I passed 64K kmalloced(kmalloc(PAGE_SIZE)) address to clear_page
> and turned on slub debug so address is not aligned with PAGE_SIZE.
> Is it a valid usecase that non-PAGE_SIZE aligned address is
> used for clear_page in ppc64?
> 
> As well, copy_page have same rule, too?
> 
> Anyway, when I changed clear_page to memset, it seems the problem
> is gone.

Yes, both clear_page and copy_page assume a PAGE_SHIFT alignment and
are highly optimized accordingly.

I wouldn't be surprised if other architectures' implementations are the
same.

I don't think it's ever legit to call these functions for something
that isn't a naturally aligned page.

Cheers,
Ben.



clear_page, copy_page address align question?

2017-04-10 Thread Minchan Kim
Hello,

When I tested zram in ppc64, I got random corruption.
With investigation, it seems clear_page corrupted the memory.
I passed 64K kmalloced(kmalloc(PAGE_SIZE)) address to clear_page
and turned on slub debug so address is not aligned with PAGE_SIZE.
Is it a valid usecase that non-PAGE_SIZE aligned address is
used for clear_page in ppc64?

As well, copy_page have same rule, too?

Anyway, when I changed clear_page to memset, it seems the problem
is gone.

Thanks.


[PATCH v3 5/5] perf report: Show branch type in callchain entry

2017-04-10 Thread Jin Yao
Show branch type in callchain entry. The branch type is printed
with other LBR information (such as cycles/abort/...).

One example:
perf report --branch-history --stdio --no-children

--23.54%--main div.c:42 (CROSS_2M RET cycles:2)
  compute_flag div.c:28 (RET cycles:2)
  compute_flag div.c:27 (CROSS_2M RET cycles:1)
  rand rand.c:28 (CROSS_4K RET cycles:1)
  rand rand.c:28 (CROSS_2M RET cycles:1)
  __random random.c:298 (CROSS_4K RET cycles:1)
  __random random.c:297 (JCC backward CROSS_2M cycles:1)
  __random random.c:295 (JCC forward CROSS_4K cycles:1)
  __random random.c:295 (JCC backward CROSS_2M cycles:1)
  __random random.c:295 (JCC forward CROSS_4K cycles:1)
  __random random.c:295 (CROSS_2M RET cycles:9)

Signed-off-by: Jin Yao 
---
 tools/perf/util/callchain.c | 195 ++--
 tools/perf/util/callchain.h |   4 +-
 tools/perf/util/machine.c   |  26 --
 3 files changed, 152 insertions(+), 73 deletions(-)

diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 2e5eff5..3c875b1 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -467,6 +467,11 @@ fill_node(struct callchain_node *node, struct 
callchain_cursor *cursor)
call->cycles_count = cursor_node->branch_flags.cycles;
call->iter_count = cursor_node->nr_loop_iter;
call->samples_count = cursor_node->samples;
+
+   branch_type_count(>brtype_stat,
+ _node->branch_flags,
+ cursor_node->branch_from,
+ cursor_node->ip);
}
 
list_add_tail(>list, >val);
@@ -579,6 +584,11 @@ static enum match_result match_chain(struct 
callchain_cursor_node *node,
cnode->cycles_count += node->branch_flags.cycles;
cnode->iter_count += node->nr_loop_iter;
cnode->samples_count += node->samples;
+
+   branch_type_count(>brtype_stat,
+ >branch_flags,
+ node->branch_from,
+ node->ip);
}
 
return MATCH_EQ;
@@ -813,7 +823,7 @@ merge_chain_branch(struct callchain_cursor *cursor,
list_for_each_entry_safe(list, next_list, >val, list) {
callchain_cursor_append(cursor, list->ip,
list->ms.map, list->ms.sym,
-   false, NULL, 0, 0);
+   false, NULL, 0, 0, 0);
list_del(>list);
map__zput(list->ms.map);
free(list);
@@ -853,7 +863,7 @@ int callchain_merge(struct callchain_cursor *cursor,
 int callchain_cursor_append(struct callchain_cursor *cursor,
u64 ip, struct map *map, struct symbol *sym,
bool branch, struct branch_flags *flags,
-   int nr_loop_iter, int samples)
+   int nr_loop_iter, int samples, u64 branch_from)
 {
struct callchain_cursor_node *node = *cursor->last;
 
@@ -877,6 +887,7 @@ int callchain_cursor_append(struct callchain_cursor *cursor,
memcpy(>branch_flags, flags,
sizeof(struct branch_flags));
 
+   node->branch_from = branch_from;
cursor->nr++;
 
cursor->last = >next;
@@ -1105,95 +1116,151 @@ int callchain_branch_counts(struct callchain_root 
*root,
  cycles_count);
 }
 
+static int branch_type_str(struct branch_type_stat *stat,
+  char *bf, int bfsize)
+{
+   int i, j = 0, printed = 0;
+   u64 total = 0;
+
+   for (i = 0; i < PERF_BR_MAX; i++)
+   total += stat->counts[i];
+
+   if (total == 0)
+   return 0;
+
+   printed += scnprintf(bf + printed, bfsize - printed, " (");
+
+   if (stat->jcc_fwd > 0) {
+   j++;
+   printed += scnprintf(bf + printed, bfsize - printed,
+"JCC forward");
+   }
+
+   if (stat->jcc_bwd > 0) {
+   if (j++)
+   printed += scnprintf(bf + printed, bfsize - printed,
+" JCC backward");
+   else
+   printed += scnprintf(bf + printed, bfsize - printed,
+"JCC backward");
+   }
+
+   if (stat->cross_4k > 0) {
+   if (j++)
+   printed += scnprintf(bf + printed, bfsize - printed,
+" CROSS_4K");
+   else
+   

[PATCH v3 4/5] perf report: Show branch type statistics for stdio mode

2017-04-10 Thread Jin Yao
Show the branch type statistics at the end of perf report --stdio.

For example:
perf report --stdio

 JCC forward:  27.8%
JCC backward:   9.7%
CROSS_4K:   0.0%
CROSS_2M:  14.3%
 JCC:  37.6%
 JMP:   0.0%
 IND_JMP:   6.5%
CALL:  26.6%
 RET:  29.3%
IRET:   0.0%

The branch types are:
-
 JCC forward: Conditional forward jump
JCC backward: Conditional backward jump
 JMP: Jump imm
 IND_JMP: Jump reg/mem
CALL: Call imm
IND_CALL: Call reg/mem
 RET: Ret
 SYSCALL: Syscall
  SYSRET: Syscall return
 IRQ: HW interrupt/trap/fault
 INT: SW interrupt
IRET: Return from interrupt
  FAR_BRANCH: Others not generic branch type

CROSS_4K and CROSS_2M:
--
They are the metrics checking for branches cross 4K or 2MB pages.
It's an approximate computing. We don't know if the area is 4K or
2MB, so always compute both.

To make the output simple, if a branch crosses 2M area, CROSS_4K
will not be incremented.

Signed-off-by: Jin Yao 
---
 tools/perf/builtin-report.c | 70 +
 tools/perf/util/event.h |  3 +-
 tools/perf/util/hist.c  |  5 +---
 tools/perf/util/util.c  | 59 ++
 tools/perf/util/util.h  | 17 +++
 5 files changed, 149 insertions(+), 5 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index c18158b..c2889eb 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -66,6 +66,7 @@ struct report {
u64 queue_size;
int socket_filter;
DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
+   struct branch_type_stat brtype_stat;
 };
 
 static int report__config(const char *var, const char *value, void *cb)
@@ -144,6 +145,24 @@ static int hist_iter__report_callback(struct 
hist_entry_iter *iter,
return err;
 }
 
+static int hist_iter__branch_callback(struct hist_entry_iter *iter,
+ struct addr_location *al __maybe_unused,
+ bool single __maybe_unused,
+ void *arg)
+{
+   struct hist_entry *he = iter->he;
+   struct report *rep = arg;
+   struct branch_info *bi;
+
+   if (sort__mode == SORT_MODE__BRANCH) {
+   bi = he->branch_info;
+   branch_type_count(>brtype_stat, >flags,
+ bi->from.addr, bi->to.addr);
+   }
+
+   return 0;
+}
+
 static int process_sample_event(struct perf_tool *tool,
union perf_event *event,
struct perf_sample *sample,
@@ -182,6 +201,8 @@ static int process_sample_event(struct perf_tool *tool,
 */
if (!sample->branch_stack)
goto out_put;
+
+   iter.add_entry_cb = hist_iter__branch_callback;
iter.ops = _iter_branch;
} else if (rep->mem_mode) {
iter.ops = _iter_mem;
@@ -369,6 +390,50 @@ static size_t hists__fprintf_nr_sample_events(struct hists 
*hists, struct report
return ret + fprintf(fp, "\n#\n");
 }
 
+static void branch_type_stat_display(FILE *fp, struct branch_type_stat *stat)
+{
+   u64 total = 0;
+   int i;
+
+   for (i = 0; i < PERF_BR_MAX; i++)
+   total += stat->counts[i];
+
+   if (total == 0)
+   return;
+
+   fprintf(fp, "\n#");
+   fprintf(fp, "\n# Branch Statistics:");
+   fprintf(fp, "\n#");
+
+   if (stat->jcc_fwd > 0)
+   fprintf(fp, "\n%12s: %5.1f%%",
+   "JCC forward",
+   100.0 * (double)stat->jcc_fwd / (double)total);
+
+   if (stat->jcc_bwd > 0)
+   fprintf(fp, "\n%12s: %5.1f%%",
+   "JCC backward",
+   100.0 * (double)stat->jcc_bwd / (double)total);
+
+   if (stat->cross_4k > 0)
+   fprintf(fp, "\n%12s: %5.1f%%",
+   "CROSS_4K",
+   100.0 * (double)stat->cross_4k / (double)total);
+
+   if (stat->cross_2m > 0)
+   fprintf(fp, "\n%12s: %5.1f%%",
+   "CROSS_2M",
+   100.0 * (double)stat->cross_2m / (double)total);
+
+   for (i = 0; i < PERF_BR_MAX; i++) {
+   if (stat->counts[i] > 0)
+   fprintf(fp, "\n%12s: %5.1f%%",
+   branch_type_name(i),
+   100.0 *
+   (double)stat->counts[i] / (double)total);
+   }
+}
+
 static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist,
 struct report *rep,
 const char *help)
@@ -404,6 +469,9 @@ static int 

[PATCH v3 3/5] perf record: Create a new option save_type in --branch-filter

2017-04-10 Thread Jin Yao
The option indicates the kernel to save branch type during sampling.

One example:
perf record -g --branch-filter any,save_type 

Signed-off-by: Jin Yao 
---
 tools/perf/Documentation/perf-record.txt | 1 +
 tools/perf/util/parse-branch-options.c   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tools/perf/Documentation/perf-record.txt 
b/tools/perf/Documentation/perf-record.txt
index ea3789d..e2f5a4f 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -332,6 +332,7 @@ following filters are defined:
- no_tx: only when the target is not in a hardware transaction
- abort_tx: only when the target is a hardware transaction abort
- cond: conditional branches
+   - save_type: save branch type during sampling in case binary is not 
available later
 
 +
 The option requires at least one branch type among any, any_call, any_ret, 
ind_call, cond.
diff --git a/tools/perf/util/parse-branch-options.c 
b/tools/perf/util/parse-branch-options.c
index 38fd115..e71fb5f 100644
--- a/tools/perf/util/parse-branch-options.c
+++ b/tools/perf/util/parse-branch-options.c
@@ -28,6 +28,7 @@ static const struct branch_mode branch_modes[] = {
BRANCH_OPT("cond", PERF_SAMPLE_BRANCH_COND),
BRANCH_OPT("ind_jmp", PERF_SAMPLE_BRANCH_IND_JUMP),
BRANCH_OPT("call", PERF_SAMPLE_BRANCH_CALL),
+   BRANCH_OPT("save_type", PERF_SAMPLE_BRANCH_TYPE_SAVE),
BRANCH_END
 };
 
-- 
2.7.4



[PATCH v3 2/5] perf/x86/intel: Record branch type

2017-04-10 Thread Jin Yao
Perf already has support for disassembling the branch instruction
and using the branch type for filtering. The patch just records
the branch type in perf_branch_entry.

Before recording, the patch converts the x86 branch classification
to common branch classification.

Signed-off-by: Jin Yao 
---
 arch/x86/events/intel/lbr.c | 53 -
 1 file changed, 52 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 81b321a..6968c63 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -109,6 +109,9 @@ enum {
X86_BR_ZERO_CALL= 1 << 15,/* zero length call */
X86_BR_CALL_STACK   = 1 << 16,/* call stack */
X86_BR_IND_JMP  = 1 << 17,/* indirect jump */
+
+   X86_BR_TYPE_SAVE= 1 << 18,/* indicate to save branch type */
+
 };
 
 #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -670,6 +673,10 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event 
*event)
 
if (br_type & PERF_SAMPLE_BRANCH_CALL)
mask |= X86_BR_CALL | X86_BR_ZERO_CALL;
+
+   if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE)
+   mask |= X86_BR_TYPE_SAVE;
+
/*
 * stash actual user request into reg, it may
 * be used by fixup code for some CPU
@@ -923,6 +930,44 @@ static int branch_type(unsigned long from, unsigned long 
to, int abort)
return ret;
 }
 
+#define X86_BR_TYPE_MAP_MAX16
+
+static int
+common_branch_type(int type)
+{
+   int i, mask;
+   const int branch_map[X86_BR_TYPE_MAP_MAX] = {
+   PERF_BR_CALL,   /* X86_BR_CALL */
+   PERF_BR_RET,/* X86_BR_RET */
+   PERF_BR_SYSCALL,/* X86_BR_SYSCALL */
+   PERF_BR_SYSRET, /* X86_BR_SYSRET */
+   PERF_BR_INT,/* X86_BR_INT */
+   PERF_BR_IRET,   /* X86_BR_IRET */
+   PERF_BR_JCC,/* X86_BR_JCC */
+   PERF_BR_JMP,/* X86_BR_JMP */
+   PERF_BR_IRQ,/* X86_BR_IRQ */
+   PERF_BR_IND_CALL,   /* X86_BR_IND_CALL */
+   PERF_BR_NONE,   /* X86_BR_ABORT */
+   PERF_BR_NONE,   /* X86_BR_IN_TX */
+   PERF_BR_NONE,   /* X86_BR_NO_TX */
+   PERF_BR_CALL,   /* X86_BR_ZERO_CALL */
+   PERF_BR_NONE,   /* X86_BR_CALL_STACK */
+   PERF_BR_IND_JMP,/* X86_BR_IND_JMP */
+   };
+
+   type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */
+   mask = ~(~0 << 1);
+
+   for (i = 0; i < X86_BR_TYPE_MAP_MAX; i++) {
+   if (type & mask)
+   return branch_map[i];
+
+   type >>= 1;
+   }
+
+   return PERF_BR_NONE;
+}
+
 /*
  * implement actual branch filter based on user demand.
  * Hardware may not exactly satisfy that request, thus
@@ -939,7 +984,8 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
bool compress = false;
 
/* if sampling all branches, then nothing to filter */
-   if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
+   if (((br_sel & X86_BR_ALL) == X86_BR_ALL) &&
+   ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE))
return;
 
for (i = 0; i < cpuc->lbr_stack.nr; i++) {
@@ -960,6 +1006,11 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
cpuc->lbr_entries[i].from = 0;
compress = true;
}
+
+   if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE)
+   cpuc->lbr_entries[i].type = common_branch_type(type);
+   else
+   cpuc->lbr_entries[i].type = PERF_BR_NONE;
}
 
if (!compress)
-- 
2.7.4



[PATCH v3 1/5] perf/core: Define the common branch type classification

2017-04-10 Thread Jin Yao
It is often useful to know the branch types while analyzing branch
data. For example, a call is very different from a conditional branch.

Currently we have to look it up in binary while the binary may later
not be available and even the binary is available but user has to take
some time. It is very useful for user to check it directly in perf
report.

Perf already has support for disassembling the branch instruction
to get the x86 branch type.

To keep consistent on kernel and userspace and make the classification
more common, the patch adds the common branch type classification
in perf_event.h.

PERF_BR_NONE  : unknown
PERF_BR_JCC   : conditional jump
PERF_BR_JMP   : jump
PERF_BR_IND_JMP   : indirect jump
PERF_BR_CALL  : call
PERF_BR_IND_CALL  : indirect call
PERF_BR_RET   : return
PERF_BR_SYSCALL   : syscall
PERF_BR_SYSRET: syscall return
PERF_BR_IRQ   : hw interrupt/trap/fault
PERF_BR_INT   : sw interrupt
PERF_BR_IRET  : return from interrupt
PERF_BR_FAR_BRANCH: not generic far branch type

Since the disassembling of branch instruction needs some overhead,
a new PERF_SAMPLE_BRANCH_TYPE_SAVE is introduced to indicate if it
needs to disassemble the branch instruction and record the branch
type.

Signed-off-by: Jin Yao 
---
 include/uapi/linux/perf_event.h   | 29 -
 tools/include/uapi/linux/perf_event.h | 29 -
 2 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index d09a9cd..69af012 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -174,6 +174,8 @@ enum perf_branch_sample_type_shift {
PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT   = 14, /* no flags */
PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT  = 15, /* no cycles */
 
+   PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT  = 16, /* save branch type */
+
PERF_SAMPLE_BRANCH_MAX_SHIFT/* non-ABI */
 };
 
@@ -198,9 +200,32 @@ enum perf_branch_sample_type {
PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << 
PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT,
PERF_SAMPLE_BRANCH_NO_CYCLES= 1U << 
PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT,
 
+   PERF_SAMPLE_BRANCH_TYPE_SAVE=
+   1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT,
+
PERF_SAMPLE_BRANCH_MAX  = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT,
 };
 
+/*
+ * Common flow change classification
+ */
+enum {
+   PERF_BR_NONE= 0,/* unknown */
+   PERF_BR_JCC = 1,/* conditional jump */
+   PERF_BR_JMP = 2,/* jump */
+   PERF_BR_IND_JMP = 3,/* indirect jump */
+   PERF_BR_CALL= 4,/* call */
+   PERF_BR_IND_CALL= 5,/* indirect call */
+   PERF_BR_RET = 6,/* return */
+   PERF_BR_SYSCALL = 7,/* syscall */
+   PERF_BR_SYSRET  = 8,/* syscall return */
+   PERF_BR_IRQ = 9,/* hw interrupt/trap/fault */
+   PERF_BR_INT = 10,   /* sw interrupt */
+   PERF_BR_IRET= 11,   /* return from interrupt */
+   PERF_BR_FAR_BRANCH  = 12,   /* not generic far branch type */
+   PERF_BR_MAX,
+};
+
 #define PERF_SAMPLE_BRANCH_PLM_ALL \
(PERF_SAMPLE_BRANCH_USER|\
 PERF_SAMPLE_BRANCH_KERNEL|\
@@ -999,6 +1024,7 @@ union perf_mem_data_src {
  * in_tx: running in a hardware transaction
  * abort: aborting a hardware transaction
  *cycles: cycles from last branch (or 0 if not supported)
+ *  type: branch type
  */
 struct perf_branch_entry {
__u64   from;
@@ -1008,7 +1034,8 @@ struct perf_branch_entry {
in_tx:1,/* in transaction */
abort:1,/* transaction abort */
cycles:16,  /* cycle count to last branch */
-   reserved:44;
+   type:4, /* branch type */
+   reserved:40;
 };
 
 #endif /* _UAPI_LINUX_PERF_EVENT_H */
diff --git a/tools/include/uapi/linux/perf_event.h 
b/tools/include/uapi/linux/perf_event.h
index d09a9cd..69af012 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -174,6 +174,8 @@ enum perf_branch_sample_type_shift {
PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT   = 14, /* no flags */
PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT  = 15, /* no cycles */
 
+   PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT  = 16, /* save branch type */
+
PERF_SAMPLE_BRANCH_MAX_SHIFT/* non-ABI */
 };
 
@@ -198,9 +200,32 @@ enum perf_branch_sample_type {
PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << 
PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT,
PERF_SAMPLE_BRANCH_NO_CYCLES= 1U << 
PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT,
 
+   PERF_SAMPLE_BRANCH_TYPE_SAVE=
+   1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT,
+
PERF_SAMPLE_BRANCH_MAX  = 1U << 

[PATCH v3 0/5] perf report: Show branch type

2017-04-10 Thread Jin Yao
v3:
---
1. Move the JCC forward/backward and cross page computing from
   kernel to userspace.

2. Use lookup table to replace original switch/case processing.

Changed:
  perf/core: Define the common branch type classification
  perf/x86/intel: Record branch type
  perf report: Show branch type statistics for stdio mode
  perf report: Show branch type in callchain entry

Not changed:
  perf record: Create a new option save_type in --branch-filter

v2:
---
1. Use 4 bits in perf_branch_entry to record branch type.

2. Pull out some common branch types from FAR_BRANCH. Now the branch
   types defined in perf_event.h:

PERF_BR_NONE  : unknown
PERF_BR_JCC_FWD   : conditional forward jump
PERF_BR_JCC_BWD   : conditional backward jump
PERF_BR_JMP   : jump
PERF_BR_IND_JMP   : indirect jump
PERF_BR_CALL  : call
PERF_BR_IND_CALL  : indirect call
PERF_BR_RET   : return
PERF_BR_SYSCALL   : syscall
PERF_BR_SYSRET: syscall return
PERF_BR_IRQ   : hw interrupt/trap/fault
PERF_BR_INT   : sw interrupt
PERF_BR_IRET  : return from interrupt
PERF_BR_FAR_BRANCH: others not generic far branch type

3. Use 2 bits in perf_branch_entry for a "cross" metrics checking
   for branch cross 4K or 2M area. It's an approximate computing
   for checking if the branch cross 4K page or 2MB page.

For example:

perf record -g --branch-filter any,save_type 

perf report --stdio

 JCC forward:  27.7%
JCC backward:   9.8%
 JMP:   0.0%
 IND_JMP:   6.5%
CALL:  26.6%
IND_CALL:   0.0%
 RET:  29.3%
IRET:   0.0%
CROSS_4K:   0.0%
CROSS_2M:  14.3%

perf report --branch-history --stdio --no-children

-23.60%--main div.c:42 (RET cycles:2)
 compute_flag div.c:28 (RET cycles:2)
 compute_flag div.c:27 (RET CROSS_2M cycles:1)
 rand rand.c:28 (RET CROSS_2M cycles:1)
 rand rand.c:28 (RET cycles:1)
 __random random.c:298 (RET cycles:1)
 __random random.c:297 (JCC forward cycles:1)
 __random random.c:295 (JCC forward cycles:1)
 __random random.c:295 (JCC forward cycles:1)
 __random random.c:295 (JCC forward cycles:1)
 __random random.c:295 (RET cycles:9)

Changed:
  perf/core: Define the common branch type classification
  perf/x86/intel: Record branch type
  perf report: Show branch type statistics for stdio mode
  perf report: Show branch type in callchain entry

Not changed:
  perf record: Create a new option save_type in --branch-filter

v1:
---
It is often useful to know the branch types while analyzing branch
data. For example, a call is very different from a conditional branch.

Currently we have to look it up in binary while the binary may later
not be available and even the binary is available but user has to take
some time. It is very useful for user to check it directly in perf
report.

Perf already has support for disassembling the branch instruction
to get the branch type.

The patch series records the branch type and show the branch type with
other LBR information in callchain entry via perf report. The patch
series also adds the branch type summary at the end of
perf report --stdio.

To keep consistent on kernel and userspace and make the classification
more common, the patch adds the common branch type classification
in perf_event.h.

The common branch types are:

 JCC forward: Conditional forward jump
JCC backward: Conditional backward jump
 JMP: Jump imm
 IND_JMP: Jump reg/mem
CALL: Call imm
IND_CALL: Call reg/mem
 RET: Ret
  FAR_BRANCH: SYSCALL/SYSRET, IRQ, IRET, TSX Abort

An example:

1. Record branch type (new option "save_type")

perf record -g --branch-filter any,save_type 

2. Show the branch type statistics at the end of perf report --stdio

perf report --stdio

 JCC forward:  34.0%
JCC backward:   3.6%
 JMP:   0.0%
 IND_JMP:   6.5%
CALL:  26.6%
IND_CALL:   0.0%
 RET:  29.3%
  FAR_BRANCH:   0.0%

3. Show branch type in callchain entry

perf report --branch-history --stdio --no-children

--23.91%--main div.c:42 (RET cycles:2)
  compute_flag div.c:28 (RET cycles:2)
  compute_flag div.c:27 (RET cycles:1)
  rand rand.c:28 (RET cycles:1)
  rand rand.c:28 (RET cycles:1)
  __random random.c:298 (RET cycles:1)
  __random random.c:297 (JCC forward cycles:1)
  __random random.c:295 (JCC forward cycles:1)
  __random random.c:295 (JCC forward cycles:1)
  __random random.c:295 (JCC forward cycles:1)
  __random random.c:295 (RET cycles:9)

Jin Yao (5):
  perf/core: Define the common branch type classification
  perf/x86/intel: Record branch type
  perf record: Create a new option save_type in --branch-filter
  perf report: Show branch type statistics for stdio mode
  perf report: Show branch 

Re: EEH error in doing DMA with PEX 8619

2017-04-10 Thread Benjamin Herrenschmidt
On Mon, 2017-04-10 at 19:04 -0700, IanJiang wrote:
> Thanks for your reply.
> 
> I fixed my test according your suggestion. The CPU physical addresses (0x
> 1f9e40 and 0x 1f82c0) converted with virt_to_phys() are used ,
> instead of DMA addresses, or BUS physical addresses (0x 60a0 and 0x
> 60c0). However, EEH still reports error.

That's incorrect. The system has an IOMMU; only addresses properly
mapped/translated by the IOMMU can be used for DMA. That is, addresses
returned by things like dma_map_single/sg, dma_alloc_coherent, etc...

You also need to ensure you configure a proper dma mask.

Using virt_to_phys() is NEVER correct on *any* architecture.

Cheers,
Ben.

> Memory info.
> ==
> 
> [130508.050783] Plx8000_NT: Received PLX message ===> 
> [130508.050784] Plx8000_NT: PLX_IOCTL_PHYSICAL_MEM_ALLOCATE
> [130508.050785] Plx8000_NT: Attempt to allocate physical memory (1953KB)
> [130508.051165] Plx8000_NT: Allocated physical memory...
> [130508.051167] Plx8000_NT: CPU Phys Addr: 1f9e40
> [130508.051168] Plx8000_NT: Bus Phys Addr: 60a0
> [130508.051170] Plx8000_NT: Kernel VA: c01f9e40
> [130508.051171] Plx8000_NT: Size : 1E8480h (1MB)
> [130508.051173] Plx8000_NT: ...Completed message
> [130508.051184] Plx8000_NT: 
> [130508.051185] Plx8000_NT: Received message ===> MMAP
> [130508.051187] Plx8000_NT: Mapped Phys (1f9e40) ==> User VA
> (3fff83ad)
> [130508.051189] Plx8000_NT: ...Completed message
> [130508.051196] Plx8000_NT: 
> [130508.051198] Plx8000_NT: Received PLX message ===> 
> [130508.051199] Plx8000_NT: PLX_IOCTL_PHYSICAL_MEM_ALLOCATE
> [130508.051200] Plx8000_NT: Attempt to allocate physical memory (1953KB)
> [130508.051562] Plx8000_NT: Allocated physical memory...
> [130508.051564] Plx8000_NT: CPU Phys Addr: 1f82c0
> [130508.051565] Plx8000_NT: Bus Phys Addr: 60c0
> [130508.051566] Plx8000_NT: Kernel VA: c01f82c0
> [130508.051568] Plx8000_NT: Size : 1E8480h (1MB)
> [130508.051569] Plx8000_NT: ...Completed message
> [130508.051580] Plx8000_NT: 
> [130508.051581] Plx8000_NT: Received message ===> MMAP
> [130508.051583] Plx8000_NT: Mapped Phys (1f82c0) ==> User VA
> (3fff838e)
> [130508.051585] Plx8000_NT: ...Completed message
> [130508.051600] Plx8000_NT: 
> 
> EEH info.
> 
> 
> [130515.365924] Plx8000_DMA: Received PLX message ===> 
> [130515.365972] Plx8000_DMA: PLX_IOCTL_DMA_TRANSFER_BLOCK
> [130515.366033] PLX DMA[PlxDmaTransferBlock-2479]
> [130515.366084] PLX DMA[PlxDmaTransferBlock-2488]
> [130515.366131] PLX DMA[PlxDmaTransferBlock-2495]
> [130515.366181] Plx8000_DMA: Ch 0 - DMA 001F_9E40 -->
> 001F_82C0 (65536 bytes)
> [130515.366250] PLX DMA[PlxDmaTransferBlock-2503]
> [130515.366296] PLX DMA[PlxDmaTransferBlock-2511]
> [130515.366343] PLX DMA[PlxDmaTransferBlock-2516]
> [130515.366392] PLX DMA[PlxDmaTransferBlock-2521]
> [130515.366440] PLX DMA[PlxDmaTransferBlock-2532]
> [130515.366487] PLX DMA[PlxDmaTransferBlock-2535]
> [130515.366537] PLX DMA[PlxDmaTransferBlock-2539]
> [130515.366584] PLX DMA[PlxDmaTransferBlock-2550]
> [130515.366632] PLX DMA[PlxDmaTransferBlock-2557]
> [130515.366681] PLX DMA[PlxDmaTransferBlock-2562]
> [130515.366728] Plx8000_DMA: Start DMA transfer...
> [130515.366775] PLX DMA[PlxDmaTransferBlock-2565]
> [130515.366826] PLX DMA[PlxDmaTransferBlock-2569]
> [130515.366868] EEH: Frozen PE#1 on PHB#1 detected
> [130515.366872] EEH: PE location: Slot4, PHB location: N/A
> [130515.367997] EEH: This PCI device has failed 1 times in the last hour
> [130515.367997] EEH: Notify device drivers to shutdown
> [130515.368006] EEH: Collect temporary log
> [130515.368072] EEH: of node=0001:01:00:0
> [130515.368075] EEH: PCI device/vendor: 861910b5
> [130515.368077] EEH: PCI cmd/status register: 00100547
> [130515.368079] EEH: Bridge secondary status: 
> [130515.368081] EEH: Bridge control: 0002
> [130515.368081] EEH: PCI-E capabilities and status follow:
> [130515.368091] EEH: PCI-E 00: 0052a410 8004 0046 cc82 
> [130515.368098] EEH: PCI-E 10: 0082    
> [130515.368099] EEH: PCI-E 20:  
> [130515.368100] EEH: PCI-E AER capability register set follows:
> [130515.368109] EEH: PCI-E AER 00: 13810001   00062030 
> [130515.368116] EEH: PCI-E AER 10:  2000 00ff  
> [130515.368122] EEH: PCI-E AER 20:     
> [130515.368125] EEH: PCI-E AER 30:  0e0e0e0e 
> [130515.368127] EEH: of node=0001:01:00:1
> [130515.368294] Plx8000_DMA: ...Completed message
> [130515.368295] PLX DMA[Dispatch_IoControl-1053]
> [130515.368295] PLX DMA[Dispatch_IoControl-1061]
> [130515.368297] Plx8000_DMA: 
> [130515.368298] Plx8000_DMA: Received PLX message ===> 
> [130515.368298] Plx8000_DMA: PLX_IOCTL_NOTIFICATION_WAIT
> [130515.368299] Plx8000_DMA: Waiting for Interrupt wait object
> (c03c0705f880) to wake-up
> 

Re: EEH error in doing DMA with PEX 8619

2017-04-10 Thread IanJiang
Thanks for your reply.

I fixed my test according your suggestion. The CPU physical addresses (0x
1f9e40 and 0x 1f82c0) converted with virt_to_phys() are used ,
instead of DMA addresses, or BUS physical addresses (0x 60a0 and 0x
60c0). However, EEH still reports error.

Memory info.
==

[130508.050783] Plx8000_NT: Received PLX message ===> 
[130508.050784] Plx8000_NT: PLX_IOCTL_PHYSICAL_MEM_ALLOCATE
[130508.050785] Plx8000_NT: Attempt to allocate physical memory (1953KB)
[130508.051165] Plx8000_NT: Allocated physical memory...
[130508.051167] Plx8000_NT: CPU Phys Addr: 1f9e40
[130508.051168] Plx8000_NT: Bus Phys Addr: 60a0
[130508.051170] Plx8000_NT: Kernel VA: c01f9e40
[130508.051171] Plx8000_NT: Size : 1E8480h (1MB)
[130508.051173] Plx8000_NT: ...Completed message
[130508.051184] Plx8000_NT: 
[130508.051185] Plx8000_NT: Received message ===> MMAP
[130508.051187] Plx8000_NT: Mapped Phys (1f9e40) ==> User VA
(3fff83ad)
[130508.051189] Plx8000_NT: ...Completed message
[130508.051196] Plx8000_NT: 
[130508.051198] Plx8000_NT: Received PLX message ===> 
[130508.051199] Plx8000_NT: PLX_IOCTL_PHYSICAL_MEM_ALLOCATE
[130508.051200] Plx8000_NT: Attempt to allocate physical memory (1953KB)
[130508.051562] Plx8000_NT: Allocated physical memory...
[130508.051564] Plx8000_NT: CPU Phys Addr: 1f82c0
[130508.051565] Plx8000_NT: Bus Phys Addr: 60c0
[130508.051566] Plx8000_NT: Kernel VA: c01f82c0
[130508.051568] Plx8000_NT: Size : 1E8480h (1MB)
[130508.051569] Plx8000_NT: ...Completed message
[130508.051580] Plx8000_NT: 
[130508.051581] Plx8000_NT: Received message ===> MMAP
[130508.051583] Plx8000_NT: Mapped Phys (1f82c0) ==> User VA
(3fff838e)
[130508.051585] Plx8000_NT: ...Completed message
[130508.051600] Plx8000_NT: 

EEH info.


[130515.365924] Plx8000_DMA: Received PLX message ===> 
[130515.365972] Plx8000_DMA: PLX_IOCTL_DMA_TRANSFER_BLOCK
[130515.366033] PLX DMA[PlxDmaTransferBlock-2479]
[130515.366084] PLX DMA[PlxDmaTransferBlock-2488]
[130515.366131] PLX DMA[PlxDmaTransferBlock-2495]
[130515.366181] Plx8000_DMA: Ch 0 - DMA 001F_9E40 -->
001F_82C0 (65536 bytes)
[130515.366250] PLX DMA[PlxDmaTransferBlock-2503]
[130515.366296] PLX DMA[PlxDmaTransferBlock-2511]
[130515.366343] PLX DMA[PlxDmaTransferBlock-2516]
[130515.366392] PLX DMA[PlxDmaTransferBlock-2521]
[130515.366440] PLX DMA[PlxDmaTransferBlock-2532]
[130515.366487] PLX DMA[PlxDmaTransferBlock-2535]
[130515.366537] PLX DMA[PlxDmaTransferBlock-2539]
[130515.366584] PLX DMA[PlxDmaTransferBlock-2550]
[130515.366632] PLX DMA[PlxDmaTransferBlock-2557]
[130515.366681] PLX DMA[PlxDmaTransferBlock-2562]
[130515.366728] Plx8000_DMA: Start DMA transfer...
[130515.366775] PLX DMA[PlxDmaTransferBlock-2565]
[130515.366826] PLX DMA[PlxDmaTransferBlock-2569]
[130515.366868] EEH: Frozen PE#1 on PHB#1 detected
[130515.366872] EEH: PE location: Slot4, PHB location: N/A
[130515.367997] EEH: This PCI device has failed 1 times in the last hour
[130515.367997] EEH: Notify device drivers to shutdown
[130515.368006] EEH: Collect temporary log
[130515.368072] EEH: of node=0001:01:00:0
[130515.368075] EEH: PCI device/vendor: 861910b5
[130515.368077] EEH: PCI cmd/status register: 00100547
[130515.368079] EEH: Bridge secondary status: 
[130515.368081] EEH: Bridge control: 0002
[130515.368081] EEH: PCI-E capabilities and status follow:
[130515.368091] EEH: PCI-E 00: 0052a410 8004 0046 cc82 
[130515.368098] EEH: PCI-E 10: 0082    
[130515.368099] EEH: PCI-E 20:  
[130515.368100] EEH: PCI-E AER capability register set follows:
[130515.368109] EEH: PCI-E AER 00: 13810001   00062030 
[130515.368116] EEH: PCI-E AER 10:  2000 00ff  
[130515.368122] EEH: PCI-E AER 20:     
[130515.368125] EEH: PCI-E AER 30:  0e0e0e0e 
[130515.368127] EEH: of node=0001:01:00:1
[130515.368294] Plx8000_DMA: ...Completed message
[130515.368295] PLX DMA[Dispatch_IoControl-1053]
[130515.368295] PLX DMA[Dispatch_IoControl-1061]
[130515.368297] Plx8000_DMA: 
[130515.368298] Plx8000_DMA: Received PLX message ===> 
[130515.368298] Plx8000_DMA: PLX_IOCTL_NOTIFICATION_WAIT
[130515.368299] Plx8000_DMA: Waiting for Interrupt wait object
(c03c0705f880) to wake-up
[130515.369283] EEH: PCI device/vendor: 861910b5
[130515.369336] EEH: PCI cmd/status register: 10100546
[130515.369384] EEH: PCI-E capabilities and status follow:
[130515.369440] EEH: PCI-E 00: 0002a410 8fe4 0020204e cc82 
[130515.369506] EEH: PCI-E 10: 0082    
[130515.369564] EEH: PCI-E 20:  
[130515.393162] EEH: PCI-E AER capability register set follows:
[130515.420590] EEH: PCI-E AER 00: 1f410001   00062030 
[130515.441475] EEH: PCI-E AER 10:  2000 01ff  
[130515.454700] EEH: PCI-E AER 20: 

[PATCH v3 6/6] powerpc/perf: Add Power8 mem_access event to sysfs

2017-04-10 Thread Madhavan Srinivasan
Patch add "mem_access" event to sysfs. This as-is not a raw event
supported by Power8 pmu. Instead, it is formed based on
raw event encoding specified in isa207-common.h.

Primary PMU event used here is PM_MRK_INST_CMPL.
This event tracks only the completed marked instructions.

Random sampling mode (MMCRA[SM]) with Random Instruction
Sampling (RIS) is enabled to mark type of instructions.

With Random sampling in RLS mode with PM_MRK_INST_CMPL event,
the LDST /DATA_SRC fields in SIER identifies the memory
hierarchy level (eg: L1, L2 etc) that satisfied a data-cache
miss for a marked instruction.

Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/perf/power8-events-list.h | 6 ++
 arch/powerpc/perf/power8-pmu.c | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/arch/powerpc/perf/power8-events-list.h 
b/arch/powerpc/perf/power8-events-list.h
index 3a2e6e8ebb92..0f1d184627cc 100644
--- a/arch/powerpc/perf/power8-events-list.h
+++ b/arch/powerpc/perf/power8-events-list.h
@@ -89,3 +89,9 @@ EVENT(PM_MRK_FILT_MATCH,  0x2013c)
 EVENT(PM_MRK_FILT_MATCH_ALT,   0x3012e)
 /* Alternate event code for PM_LD_MISS_L1 */
 EVENT(PM_LD_MISS_L1_ALT,   0x400f0)
+/*
+ * Memory Access Event -- mem_access
+ * Primary PMU event used here is PM_MRK_INST_CMPL, along with
+ * Random Load/Store Facility Sampling (RIS) in Random sampling mode 
(MMCRA[SM]).
+ */
+EVENT(MEM_ACCESS,  0x10401e0)
diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c
index 932d7536f0eb..5463516e369b 100644
--- a/arch/powerpc/perf/power8-pmu.c
+++ b/arch/powerpc/perf/power8-pmu.c
@@ -90,6 +90,7 @@ GENERIC_EVENT_ATTR(branch-instructions,   
PM_BRU_FIN);
 GENERIC_EVENT_ATTR(branch-misses,  PM_BR_MPRED_CMPL);
 GENERIC_EVENT_ATTR(cache-references,   PM_LD_REF_L1);
 GENERIC_EVENT_ATTR(cache-misses,   PM_LD_MISS_L1);
+GENERIC_EVENT_ATTR(mem_access, MEM_ACCESS);
 
 CACHE_EVENT_ATTR(L1-dcache-load-misses,PM_LD_MISS_L1);
 CACHE_EVENT_ATTR(L1-dcache-loads,  PM_LD_REF_L1);
@@ -120,6 +121,7 @@ static struct attribute *power8_events_attr[] = {
GENERIC_EVENT_PTR(PM_BR_MPRED_CMPL),
GENERIC_EVENT_PTR(PM_LD_REF_L1),
GENERIC_EVENT_PTR(PM_LD_MISS_L1),
+   GENERIC_EVENT_PTR(MEM_ACCESS),
 
CACHE_EVENT_PTR(PM_LD_MISS_L1),
CACHE_EVENT_PTR(PM_LD_REF_L1),
-- 
2.7.4



[PATCH v3 5/6] powerpc/perf: Support to export SIERs bit in Power9

2017-04-10 Thread Madhavan Srinivasan
Patch to export SIER bits to userspace via
perf_mem_data_src and perf_sample_data struct.

Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/perf/power9-pmu.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c
index 7f6582708e06..018f8e90ac35 100644
--- a/arch/powerpc/perf/power9-pmu.c
+++ b/arch/powerpc/perf/power9-pmu.c
@@ -427,6 +427,8 @@ static struct power_pmu power9_pmu = {
.bhrb_filter_map= power9_bhrb_filter_map,
.get_constraint = isa207_get_constraint,
.get_alternatives   = power9_get_alternatives,
+   .get_mem_data_src   = isa207_get_mem_data_src,
+   .get_mem_weight = isa207_get_mem_weight,
.disable_pmc= isa207_disable_pmc,
.flags  = PPMU_HAS_SIER | PPMU_ARCH_207S,
.n_generic  = ARRAY_SIZE(power9_generic_events),
-- 
2.7.4



[PATCH v3 4/6] powerpc/perf: Support to export SIERs bit in Power8

2017-04-10 Thread Madhavan Srinivasan
Patch to export SIER bits to userspace via
perf_mem_data_src and perf_sample_data struct.

Signed-off-by: Sukadev Bhattiprolu 
Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/perf/power8-pmu.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c
index ce15b19a7962..932d7536f0eb 100644
--- a/arch/powerpc/perf/power8-pmu.c
+++ b/arch/powerpc/perf/power8-pmu.c
@@ -325,6 +325,8 @@ static struct power_pmu power8_pmu = {
.bhrb_filter_map= power8_bhrb_filter_map,
.get_constraint = isa207_get_constraint,
.get_alternatives   = power8_get_alternatives,
+   .get_mem_data_src   = isa207_get_mem_data_src,
+   .get_mem_weight = isa207_get_mem_weight,
.disable_pmc= isa207_disable_pmc,
.flags  = PPMU_HAS_SIER | PPMU_ARCH_207S,
.n_generic  = ARRAY_SIZE(power8_generic_events),
-- 
2.7.4



[PATCH v3 1/6] powerpc/perf: Define big-endian version of perf_mem_data_src

2017-04-10 Thread Madhavan Srinivasan
From: Sukadev Bhattiprolu 

perf_mem_data_src is an union that is initialized via the ->val field
and accessed via the bitmap fields. For this to work on big endian
platforms (which is broken now), we also need a big-endian representation
of perf_mem_data_src. i.e, in a big endian system, if user request
PERF_SAMPLE_DATA_SRC (perf report -d), will get the default value from
perf_sample_data_init(), which is PERF_MEM_NA. Value for PERF_MEM_NA
is constructed using shifts:

  /* TLB access */
  #define PERF_MEM_TLB_NA   0x01 /* not available */
  ...
  #define PERF_MEM_TLB_SHIFT26

  #define PERF_MEM_S(a, s) \
(((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)

  #define PERF_MEM_NA (PERF_MEM_S(OP, NA)   |\
PERF_MEM_S(LVL, NA)   |\
PERF_MEM_S(SNOOP, NA) |\
PERF_MEM_S(LOCK, NA)  |\
PERF_MEM_S(TLB, NA))

Which works out as:

  ((0x01 << 0) | (0x01 << 5) | (0x01 << 19) | (0x01 << 24) | (0x01 << 26))

Which means the PERF_MEM_NA value comes out of the kernel as 0x5080021
in CPU endian.

But then in the perf tool, the code uses the bitfields to inspect the
value, and currently the bitfields are defined using little endian
ordering.

So eg. in perf_mem__tlb_scnprintf() we see:
  data_src->val = 0x5080021
 op = 0x0
lvl = 0x0
  snoop = 0x0
   lock = 0x0
   dtlb = 0x0
   rsvd = 0x5080021

Patch does a minimal fix of adding big endian definition of the bitfields
to match the values that are already exported by the kernel on big endian.
And it makes no change on little endian.

Signed-off-by: Sukadev Bhattiprolu 
Signed-off-by: Madhavan Srinivasan 
---
 include/uapi/linux/perf_event.h   | 16 
 tools/include/uapi/linux/perf_event.h | 16 
 2 files changed, 32 insertions(+)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index c66a485a24ac..c4af1159a200 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -891,6 +891,7 @@ enum perf_callchain_context {
 #define PERF_FLAG_PID_CGROUP   (1UL << 2) /* pid=cgroup id, per-cpu 
mode only */
 #define PERF_FLAG_FD_CLOEXEC   (1UL << 3) /* O_CLOEXEC */
 
+#if defined(__LITTLE_ENDIAN_BITFIELD)
 union perf_mem_data_src {
__u64 val;
struct {
@@ -902,6 +903,21 @@ union perf_mem_data_src {
mem_rsvd:31;
};
 };
+#elif defined(__BIG_ENDIAN_BITFIELD)
+union perf_mem_data_src {
+   __u64 val;
+   struct {
+   __u64   mem_rsvd:31,
+   mem_dtlb:7, /* tlb access */
+   mem_lock:2, /* lock instr */
+   mem_snoop:5,/* snoop mode */
+   mem_lvl:14, /* memory hierarchy level */
+   mem_op:5;   /* type of opcode */
+   };
+};
+#else
+#error "Unknown endianness"
+#endif
 
 /* type of opcode (load/store/prefetch,code) */
 #define PERF_MEM_OP_NA 0x01 /* not available */
diff --git a/tools/include/uapi/linux/perf_event.h 
b/tools/include/uapi/linux/perf_event.h
index c66a485a24ac..c4af1159a200 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -891,6 +891,7 @@ enum perf_callchain_context {
 #define PERF_FLAG_PID_CGROUP   (1UL << 2) /* pid=cgroup id, per-cpu 
mode only */
 #define PERF_FLAG_FD_CLOEXEC   (1UL << 3) /* O_CLOEXEC */
 
+#if defined(__LITTLE_ENDIAN_BITFIELD)
 union perf_mem_data_src {
__u64 val;
struct {
@@ -902,6 +903,21 @@ union perf_mem_data_src {
mem_rsvd:31;
};
 };
+#elif defined(__BIG_ENDIAN_BITFIELD)
+union perf_mem_data_src {
+   __u64 val;
+   struct {
+   __u64   mem_rsvd:31,
+   mem_dtlb:7, /* tlb access */
+   mem_lock:2, /* lock instr */
+   mem_snoop:5,/* snoop mode */
+   mem_lvl:14, /* memory hierarchy level */
+   mem_op:5;   /* type of opcode */
+   };
+};
+#else
+#error "Unknown endianness"
+#endif
 
 /* type of opcode (load/store/prefetch,code) */
 #define PERF_MEM_OP_NA 0x01 /* not available */
-- 
2.7.4



[PATCH v3 3/6] powerpc/perf: Support to export MMCRA[TEC*] field to userspace

2017-04-10 Thread Madhavan Srinivasan
Threshold feature when used with MMCRA [Threshold Event Counter Event],
MMCRA[Threshold Start event] and MMCRA[Threshold End event] will update
MMCRA[Threshold Event Counter Exponent] and MMCRA[Threshold Event
Counter Multiplier] with the corresponding threshold event count values.
Patch to export MMCRA[TECX/TECM] to userspace in 'weight' field of
struct perf_sample_data.

Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/include/asm/perf_event_server.h |  1 +
 arch/powerpc/perf/core-book3s.c  |  4 
 arch/powerpc/perf/isa207-common.c|  8 
 arch/powerpc/perf/isa207-common.h| 10 ++
 4 files changed, 23 insertions(+)

diff --git a/arch/powerpc/include/asm/perf_event_server.h 
b/arch/powerpc/include/asm/perf_event_server.h
index 446cdcd9b7f5..723bf48e7494 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -40,6 +40,7 @@ struct power_pmu {
u64 alt[]);
void(*get_mem_data_src)(union perf_mem_data_src *dsrc,
u32 flags, struct pt_regs *regs);
+   void(*get_mem_weight)(u64 *weight);
u64 (*bhrb_filter_map)(u64 branch_sample_type);
void(*config_bhrb)(u64 pmu_bhrb_filter);
void(*disable_pmc)(unsigned int pmc, unsigned long mmcr[]);
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index e241ebebab6f..6c2d4168daec 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2053,6 +2053,10 @@ static void record_and_restart(struct perf_event *event, 
unsigned long val,
ppmu->get_mem_data_src)
ppmu->get_mem_data_src(&data.data_src, ppmu->flags, 
regs);
 
+   if (event->attr.sample_type & PERF_SAMPLE_WEIGHT &&
+   ppmu->get_mem_weight)
+   ppmu->get_mem_weight(&data.weight);
+
if (perf_event_overflow(event, &data, regs))
power_pmu_stop(event, 0);
}
diff --git a/arch/powerpc/perf/isa207-common.c 
b/arch/powerpc/perf/isa207-common.c
index a8b100ef8e6c..8125160be7bc 100644
--- a/arch/powerpc/perf/isa207-common.c
+++ b/arch/powerpc/perf/isa207-common.c
@@ -221,6 +221,14 @@ void isa207_get_mem_data_src(union perf_mem_data_src 
*dsrc, u32 flags,
}
 }
 
+void isa207_get_mem_weight(u64 *weight)
+{
+   u64 mmcra = mfspr(SPRN_MMCRA);
+   u64 exp = MMCRA_THR_CTR_EXP(mmcra);
+   u64 mantissa = MMCRA_THR_CTR_MANT(mmcra);
+
+   *weight = mantissa << (2 * exp);
+}
 
 int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
 {
diff --git a/arch/powerpc/perf/isa207-common.h 
b/arch/powerpc/perf/isa207-common.h
index f711f337e358..8acbe6e802c7 100644
--- a/arch/powerpc/perf/isa207-common.h
+++ b/arch/powerpc/perf/isa207-common.h
@@ -248,6 +248,15 @@
 #define MMCRA_SDAR_MODE_TLB(1ull << MMCRA_SDAR_MODE_SHIFT)
 #define MMCRA_SDAR_MODE_NO_UPDATES ~(0x3ull << MMCRA_SDAR_MODE_SHIFT)
 #define MMCRA_IFM_SHIFT30
+#define MMCRA_THR_CTR_MANT_SHIFT   19
+#define MMCRA_THR_CTR_MANT_MASK0x7Ful
+#define MMCRA_THR_CTR_MANT(v)  (((v) >> MMCRA_THR_CTR_MANT_SHIFT) &\
+   MMCRA_THR_CTR_MANT_MASK)
+
+#define MMCRA_THR_CTR_EXP_SHIFT27
+#define MMCRA_THR_CTR_EXP_MASK 0x7ul
+#define MMCRA_THR_CTR_EXP(v)   (((v) >> MMCRA_THR_CTR_EXP_SHIFT) &\
+   MMCRA_THR_CTR_EXP_MASK)
 
 /* MMCR1 Threshold Compare bit constant for power9 */
 #define p9_MMCRA_THR_CMP_SHIFT 45
@@ -282,5 +291,6 @@ int isa207_get_alternatives(u64 event, u64 alt[],
const unsigned int ev_alt[][MAX_ALT], int size);
 void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags,
struct pt_regs *regs);
+void isa207_get_mem_weight(u64 *weight);
 
 #endif
-- 
2.7.4



[PATCH v3 2/6] powerpc/perf: Export memory hierarchy info to user space

2017-04-10 Thread Madhavan Srinivasan
The LDST field and DATA_SRC in SIER identifies the memory hierarchy level
(eg: L1, L2 etc), from which a data-cache miss for a marked instruction
was satisfied. Use the 'perf_mem_data_src' object to export this
hierarchy level to user space.

Signed-off-by: Sukadev Bhattiprolu 
Signed-off-by: Madhavan Srinivasan 
---
 arch/powerpc/include/asm/perf_event_server.h |  2 +
 arch/powerpc/perf/core-book3s.c  |  4 ++
 arch/powerpc/perf/isa207-common.c| 74 
 arch/powerpc/perf/isa207-common.h| 16 +-
 4 files changed, 95 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/perf_event_server.h 
b/arch/powerpc/include/asm/perf_event_server.h
index ae0a23091a9b..446cdcd9b7f5 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -38,6 +38,8 @@ struct power_pmu {
unsigned long *valp);
int (*get_alternatives)(u64 event_id, unsigned int flags,
u64 alt[]);
+   void(*get_mem_data_src)(union perf_mem_data_src *dsrc,
+   u32 flags, struct pt_regs *regs);
u64 (*bhrb_filter_map)(u64 branch_sample_type);
void(*config_bhrb)(u64 pmu_bhrb_filter);
void(*disable_pmc)(unsigned int pmc, unsigned long mmcr[]);
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 2ff13249f87a..e241ebebab6f 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2049,6 +2049,10 @@ static void record_and_restart(struct perf_event *event, 
unsigned long val,
data.br_stack = &cpuhw->bhrb_stack;
}
 
+   if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC &&
+   ppmu->get_mem_data_src)
+   ppmu->get_mem_data_src(&data.data_src, ppmu->flags, 
regs);
+
if (perf_event_overflow(event, &data, regs))
power_pmu_stop(event, 0);
}
diff --git a/arch/powerpc/perf/isa207-common.c 
b/arch/powerpc/perf/isa207-common.c
index cd951fd231c4..a8b100ef8e6c 100644
--- a/arch/powerpc/perf/isa207-common.c
+++ b/arch/powerpc/perf/isa207-common.c
@@ -148,6 +148,80 @@ static bool is_thresh_cmp_valid(u64 event)
return true;
 }
 
+static inline u64 isa207_find_source(u64 idx, u32 sub_idx)
+{
+   u64 ret = PERF_MEM_NA;
+
+   switch(idx) {
+   case 0:
+   /* Nothing to do */
+   break;
+   case 1:
+   ret = PH(LVL, L1);
+   break;
+   case 2:
+   ret = PH(LVL, L2);
+   break;
+   case 3:
+   ret = PH(LVL, L3);
+   break;
+   case 4:
+   if (sub_idx <= 1)
+   ret = PH(LVL, LOC_RAM);
+   else if (sub_idx > 1 && sub_idx <= 2)
+   ret = PH(LVL, REM_RAM1);
+   else
+   ret = PH(LVL, REM_RAM2);
+   ret |= P(SNOOP, HIT);
+   break;
+   case 5:
+   ret = PH(LVL, REM_CCE1);
+   if ((sub_idx == 0) || (sub_idx == 2) || (sub_idx == 4))
+   ret |= P(SNOOP, HIT);
+   else if ((sub_idx == 1) || (sub_idx == 3) || (sub_idx == 5))
+   ret |= P(SNOOP, HITM);
+   break;
+   case 6:
+   ret = PH(LVL, REM_CCE2);
+   if ((sub_idx == 0) || (sub_idx == 2))
+   ret |= P(SNOOP, HIT);
+   else if ((sub_idx == 1) || (sub_idx == 3))
+   ret |= P(SNOOP, HITM);
+   break;
+   case 7:
+   ret = PM(LVL, L1);
+   break;
+   }
+
+   return ret;
+}
+
+void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags,
+   struct pt_regs *regs)
+{
+   u64 idx;
+   u32 sub_idx;
+   u64 sier;
+   u64 val;
+
+   /* Skip if no SIER support */
+   if (!(flags & PPMU_HAS_SIER)) {
+   dsrc->val = 0;
+   return;
+   }
+
+   sier = mfspr(SPRN_SIER);
+   val = (sier & ISA207_SIER_TYPE_MASK) >> ISA207_SIER_TYPE_SHIFT;
+   if (val == 1 || val == 2) {
+   idx = (sier & ISA207_SIER_LDST_MASK) >> ISA207_SIER_LDST_SHIFT;
+   sub_idx = (sier & ISA207_SIER_DATA_SRC_MASK) >> 
ISA207_SIER_DATA_SRC_SHIFT;
+
+   dsrc->val = isa207_find_source(idx, sub_idx);
+   dsrc->val |= (val == 1) ? P(OP, LOAD) : P(OP, STORE);
+   }
+}
+
+
 int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
 {
unsigned int unit, pmc, cache, ebb;
diff --git a/arch/powerpc/perf/isa207-common.h 
b/arch/powerpc/perf/isa207-common.h
index 

[PATCH v3 0/6] powerpc/perf: Export memory hierarchy level

2017-04-10 Thread Madhavan Srinivasan
Power8/Power9 Performance Monitoring Unit (PMU) supports
different sampling modes (SM) such as Random Instruction
Sampling (RIS), Random Load/Store Facility Sampling (RLS)
and Random Branch Sampling (RBS). Sample mode RLS updates
Sampled Instruction Event Register [SIER] bits with memory
hierarchy information for a cache reload. Patchset exports
the hierarchy information to the user via the perf_mem_data_src
object from SIER.

Patchset is a rebase of the work posted previously with minor
updates to it.

https://lkml.org/lkml/2015/6/11/92

Changelog v3:
-Removed is_load_store() and merged the same to get_memdata_src callback
-Added a check to update OP_LOAD or OP_STORE in data_src->val

Changelog v2:
-Updated the commit messages
-Fixed isa207_find_source() to consider all the possible sier[ldst] values.

Changelog v1:
- Fixed author-ship for the first patch and added suka's "Signed-off-by:".

Madhavan Srinivasan (5):
  powerpc/perf: Export memory hierarchy info to user space
  powerpc/perf: Support to export MMCRA[TEC*] field to userspace
  powerpc/perf: Support to export SIERs bit in Power8
  powerpc/perf: Support to export SIERs bit in Power9
  powerpc/perf: Add Power8 mem_access event to sysfs

Sukadev Bhattiprolu (1):
  powerpc/perf: Define big-endian version of perf_mem_data_src

 arch/powerpc/include/asm/perf_event_server.h |  3 +
 arch/powerpc/perf/core-book3s.c  |  8 +++
 arch/powerpc/perf/isa207-common.c| 82 
 arch/powerpc/perf/isa207-common.h| 26 -
 arch/powerpc/perf/power8-events-list.h   |  6 ++
 arch/powerpc/perf/power8-pmu.c   |  4 ++
 arch/powerpc/perf/power9-pmu.c   |  2 +
 include/uapi/linux/perf_event.h  | 16 ++
 tools/include/uapi/linux/perf_event.h| 16 ++
 9 files changed, 162 insertions(+), 1 deletion(-)

-- 
2.7.4



Re: powerpc/powernv: Fix powernv Kconfig dependencies

2017-04-10 Thread Michael Ellerman
On Mon, 2017-04-10 at 05:24:35 UTC, Alistair Popple wrote:
> The patch to introduce address translation services for Nvlink2 uses
> MMU notifiers. However usage of MMU notifiers requires a Kconfig
> option which is not selected by default on powerpc so add it to the
> powernv Kconfig.
> 
> Signed-off-by: Alistair Popple 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/abfe8026b505d66dad7a3e5ebe7235

cheers


Re: powerpc/mm: Remove reduntant initmem information from log

2017-04-10 Thread Michael Ellerman
On Fri, 2017-04-07 at 06:53:11 UTC, Anshuman Khandual wrote:
> Generic core VM already prints these information in the log
> buffer, hence there is no need for a second print. This just
> removes the second print from arch powerpc NUMA init path.
> 
> Before the patch:
> 
> $dmesg | grep "Initmem"
> 
> numa: Initmem setup node 0 [mem 0x-0x]
> numa: Initmem setup node 1 [mem 0x1-0x1]
> numa: Initmem setup node 2 [mem 0x2-0x2]
> numa: Initmem setup node 3 [mem 0x3-0x3]
> numa: Initmem setup node 4 [mem 0x4-0x4]
> numa: Initmem setup node 5 [mem 0x5-0x5]
> numa: Initmem setup node 6 [mem 0x6-0x6]
> numa: Initmem setup node 7 [mem 0x7-0x7]
> Initmem setup node 0 [mem 0x-0x]
> Initmem setup node 1 [mem 0x0001-0x0001]
> Initmem setup node 2 [mem 0x0002-0x0002]
> Initmem setup node 3 [mem 0x0003-0x0003]
> Initmem setup node 4 [mem 0x0004-0x0004]
> Initmem setup node 5 [mem 0x0005-0x0005]
> Initmem setup node 6 [mem 0x0006-0x0006]
> Initmem setup node 7 [mem 0x0007-0x0007]
> 
> After the patch:
> 
> $dmesg | grep "Initmem"
> 
> Initmem setup node 0 [mem 0x-0x]
> Initmem setup node 1 [mem 0x0001-0x0001]
> Initmem setup node 2 [mem 0x0002-0x0002]
> Initmem setup node 3 [mem 0x0003-0x0003]
> Initmem setup node 4 [mem 0x0004-0x0004]
> Initmem setup node 5 [mem 0x0005-0x0005]
> Initmem setup node 6 [mem 0x0006-0x0006]
> Initmem setup node 7 [mem 0x0007-0x0007]
> 
> Signed-off-by: Anshuman Khandual 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/ea6145557400b38faa6b3cee946ebc

cheers


Re: [V4] powerpc/hugetlb: Add ABI defines for supported HugeTLB page sizes

2017-04-10 Thread Michael Ellerman
On Fri, 2017-04-07 at 03:55:39 UTC, Anshuman Khandual wrote:
> This just adds user space exported ABI definitions for 2MB, 16MB, 1GB,
> 16GB non default huge page sizes to be used with mmap() system call.
> 
> Signed-off-by: Anshuman Khandual 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/2c9faa7675fec57f6ac0372688fae2

cheers


Re: [v2] powerpc: Make sparsemem the default on 64-bit Book3S

2017-04-10 Thread Michael Ellerman
On Wed, 2017-04-05 at 06:10:48 UTC, Michael Ellerman wrote:
> Make sparsemem the default on all 64-bit Book3S platforms. It already is
> for pseries and ps3, and we need to enable it for powernv because on
> POWER9 memory between chips is discontiguous.
> 
> For the other platforms sparsemem should work fine, though it might add
> a small amount of overhead. We can always force FLATMEM in the
> defconfigs if necessary.
> 
> Signed-off-by: Benjamin Herrenschmidt 
> Signed-off-by: Michael Ellerman 

Applied to powerpc next.

https://git.kernel.org/powerpc/c/7b3912f4223541c5108565d4bad289

cheers


Re: powerpc/nohash: Fix use of mmu_has_feature() in setup_initial_memory_limit()

2017-04-10 Thread Michael Ellerman
On Mon, 2017-04-03 at 02:05:55 UTC, Michael Ellerman wrote:
> setup_initial_memory_limit() is called from early_init_devtree(), which
> runs prior to feature patching. If the kernel is built with 
> CONFIG_JUMP_LABEL=y
> and CONFIG_JUMP_LABEL_FEATURE_CHECKS=y then we will potentially get the
> wrong value.
> 
> If we also have CONFIG_JUMP_LABEL_FEATURE_CHECK_DEBUG=y we get a warning
> and backtrace:
> 
>   Warning! mmu_has_feature() used prior to jump label init!
>   CPU: 0 PID: 0 Comm: swapper Not tainted 
> 4.11.0-rc4-gccN-next-20170331-g6af2434 #1
>   Call Trace:
>   [c0fc3d50] [c0a26c30] .dump_stack+0xa8/0xe8 (unreliable)
>   [c0fc3de0] [c002e6b8] .setup_initial_memory_limit+0xa4/0x104
>   [c0fc3e60] [c0d5c23c] .early_init_devtree+0xd0/0x2f8
>   [c0fc3f00] [c0d5d3b0] .early_setup+0x90/0x11c
>   [c0fc3f90] [c520] start_here_multiplatform+0x68/0x80
> 
> Fix it by using early_mmu_has_feature().
> 
> Fixes: c12e6f24d413 ("powerpc: Add option to use jump label for 
> mmu_has_feature()")
> Signed-off-by: Michael Ellerman 

Applied to powerpc next.

https://git.kernel.org/powerpc/c/4868e3508d1934d28961f940ed6b9f

cheers


Re: [2/2] powerpc/mm/radix: Remove unnecessary ptesync

2017-04-10 Thread Michael Ellerman
On Sat, 2017-04-01 at 14:41:48 UTC, "Aneesh Kumar K.V" wrote:
> For a tlbiel with pid, we need to issue tlbiel with set number encoded. We
> don't need to do ptesync for each of those. Instead we need one for the entire
> tlbiel pid operation.
> 
> Signed-off-by: Benjamin Herrenschmidt 
> Signed-off-by: Aneesh Kumar K.V 
> Acked-by: Anton Blanchard 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/f7327e0ba3805470cced2acfa053e7

cheers


Re: [1/2] powerpc/mm/radix: Don't do page walk cache flush when doing full mm flush

2017-04-10 Thread Michael Ellerman
On Sat, 2017-04-01 at 14:41:47 UTC, "Aneesh Kumar K.V" wrote:
> For fullmm tlb flush, we do a flush with RIC_FLUSH_ALL which will invalidate 
> all
> related caches (radix__tlb_flush()). Hence the pwc flush is not needed.
> 
> Signed-off-by: Aneesh Kumar K.V 
> Acked-by: Anton Blanchard 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/f6b0df55cad252fedd60aa2ba75a02

cheers


[PATCH 2/2] powerpc: Remove unnecessary includes of asm/debug.h

2017-04-10 Thread Michael Ellerman
These files don't seem to have any need for asm/debug.h, now that all it
includes are the debugger hooks and breakpoint definitions.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/kernel/fadump.c | 1 -
 arch/powerpc/kernel/irq.c| 1 -
 arch/powerpc/kernel/prom.c   | 1 -
 arch/powerpc/kvm/book3s_hv_rm_xics.c | 1 -
 4 files changed, 4 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index c7acc6651ce7..243dbef7e926 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -40,7 +40,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 
 static struct fw_dump fw_dump;
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index a018f5cae899..097f2f9ff85d 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -65,7 +65,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index f5d399e46193..d2f0afeae5a0 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -55,7 +55,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c 
b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index e78542d99cd6..d9e312f253fa 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -16,7 +16,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
-- 
2.7.4



[PATCH 1/2] powerpc: Create asm/debugfs.h and move powerpc_debugfs_root there

2017-04-10 Thread Michael Ellerman
powerpc_debugfs_root is the dentry representing the root of the
"powerpc" directory tree in debugfs.

Currently it sits in asm/debug.h, a long with some other things that
have "debug" in the name, but are otherwise unrelated.

Pull it out into a separate header, which also includes linux/debugfs.h,
and convert all the users to include debugfs.h instead of debug.h.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/include/asm/debug.h  |  2 --
 arch/powerpc/include/asm/debugfs.h| 17 +
 arch/powerpc/kernel/eeh.c |  3 +--
 arch/powerpc/kernel/fadump.c  |  2 +-
 arch/powerpc/kernel/setup-common.c|  2 +-
 arch/powerpc/kernel/traps.c   |  2 +-
 arch/powerpc/kvm/book3s_xics.c|  3 +--
 arch/powerpc/mm/hash_utils_64.c   |  3 +--
 arch/powerpc/platforms/cell/axon_msi.c|  2 +-
 arch/powerpc/platforms/powernv/opal-lpc.c |  3 +--
 arch/powerpc/platforms/powernv/pci-ioda.c |  3 +--
 arch/powerpc/platforms/pseries/dtl.c  |  3 +--
 arch/powerpc/sysdev/scom.c|  3 +--
 arch/powerpc/xmon/xmon.c  |  5 +
 14 files changed, 29 insertions(+), 24 deletions(-)
 create mode 100644 arch/powerpc/include/asm/debugfs.h

diff --git a/arch/powerpc/include/asm/debug.h b/arch/powerpc/include/asm/debug.h
index 86308f177f2d..5d5af3fddfd8 100644
--- a/arch/powerpc/include/asm/debug.h
+++ b/arch/powerpc/include/asm/debug.h
@@ -8,8 +8,6 @@
 
 struct pt_regs;
 
-extern struct dentry *powerpc_debugfs_root;
-
 #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC_CORE)
 
 extern int (*__debugger)(struct pt_regs *regs);
diff --git a/arch/powerpc/include/asm/debugfs.h 
b/arch/powerpc/include/asm/debugfs.h
new file mode 100644
index ..4f3b39f3e3d2
--- /dev/null
+++ b/arch/powerpc/include/asm/debugfs.h
@@ -0,0 +1,17 @@
+#ifndef _ASM_POWERPC_DEBUGFS_H
+#define _ASM_POWERPC_DEBUGFS_H
+
+/*
+ * Copyright 2017, Michael Ellerman, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include 
+
+extern struct dentry *powerpc_debugfs_root;
+
+#endif /* _ASM_POWERPC_DEBUGFS_H */
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 9de7f79e702b..63992b2d8e15 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -22,7 +22,6 @@
  */
 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -37,7 +36,7 @@
 #include 
 
 #include 
-#include 
+#include 
 #include 
 #include 
 #include 
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 33b2da302730..c7acc6651ce7 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -30,12 +30,12 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
 #include 
 
+#include 
 #include 
 #include 
 #include 
diff --git a/arch/powerpc/kernel/setup-common.c 
b/arch/powerpc/kernel/setup-common.c
index fcdca741f660..5c10b5925ac2 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -31,11 +31,11 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index ff365f9de27a..354946236c61 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -35,13 +35,13 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index e48803e2918d..dc4352395887 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -19,10 +19,9 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 #include 
 
-#include 
 #include 
 
 #include "book3s_xics.h"
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 69a05b3e4d3f..f2095ce9d4b0 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -35,9 +35,8 @@
 #include 
 #include 
 #include 
-#include 
 
-#include 
+#include 
 #include 
 #include 
 #include 
diff --git a/arch/powerpc/platforms/cell/axon_msi.c 
b/arch/powerpc/platforms/cell/axon_msi.c
index 8b55c5f19d4c..8d3ae2cc52bf 100644
--- a/arch/powerpc/platforms/cell/axon_msi.c
+++ b/arch/powerpc/platforms/cell/axon_msi.c
@@ -15,9 +15,9 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 
+#include 
 #include 
 #include 
 #include 
diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c 
b/arch/powerpc/platforms/powernv/opal-lpc.c
index a91d7876fae2..6c7ad1d8b32e 100644
--- a/arch/powerpc/platforms/powernv/opal-lpc.c
+++ b/arch/powerpc/platforms/powernv/opal-lpc.c
@@ -12,7 +12,6 @@
 

Re: [PATCH] ppc64/kprobe: Fix oops when kprobed on 'stdu' instruction

2017-04-10 Thread Anton Blanchard
Hi Ravi,

> If we set a kprobe on a 'stdu' instruction on powerpc64, we see a
> kernel OOPS:

Ouch! We should mark this for stable.

Anton


Re: EEH error in doing DMA with PEX 8619

2017-04-10 Thread Gavin Shan
On Mon, Apr 10, 2017 at 05:22:33AM -0700, IanJiang wrote:
>Hi all!
>
>I am porting PLX driver for PEX 8619 to a power8 machine with CentOS-7.3.
>The PEX 8619 is used as an NTB (Non-Transparent Bridge).
>
>First, two DMA buffer are allocated with dma_alloc_coherent() and the
>physical address are:
>src: 0x _6060
>dst: 0x _6080
>Then, a DMA transfer is started and an EEH is reported in dmesg.
>
>This DMA test is OK at an x86_64 platform.
>
>Here are the details. Any suggestion is appreciated! 

.../...

>[72634.742182] brdgCtl: 0002
>[72634.742183] RootSts: 0002004f 0040 f0820008 00100147 2800
>[72634.742184] RootErrSts:   8000 
>[72634.742185] PhbSts:  001c 001c
>[72634.742186] Lem: 0400 42498e327f502eae
>
>[72634.742189] InAErr:  4000 4000
>80006061 8  401
>[72634.742190] PE[  1] A/B: 82003025 80006060

Please check if memory physical address (instead of DMA address) is used
in the driver. The EEH error is caused by DMA address 0x6060 and it's
not having a corresponding TCE entry.

Thanks,
Gavin



Re: [PATCH] ibmveth: Support to enable LSO/CSO for Trunk VEA.

2017-04-10 Thread Sivakumar Krishnasamy

Let me give some background before I answer your queries.

In IBM PowerVM environment, ibmveth driver supports largesend and 
checksum offload today, but only for virtual ethernet adapters (VEA) 
which are *not *configured in "Trunk mode".  In trunk mode, one cannot 
enable checksum and largesend offload capabilities. Without these 
offloads enabled, the performance numbers are not good. This patch is to 
enable these offloads for "Trunk" VEAs.


The following shows a typical configuration for network packet flow, 
when VMs in the PowerVM server have their network virtualized and 
communicate to external world.


VM (ibmveth) <-> PowerVM Hypervisor <-> PowerVM I/O Server VM ( 
ibmveth in "Trunk mode" <-> OVS <-> Physical NIC ) <->  External Network


As you can see the packets originating in VM will travel through local 
ibmveth driver and then to PowerVM Hypervisor, then it gets delivered to 
ibmveth driver configured in "Trunk" mode in I/O Server, which is then 
bridged by OVS to external network via Physical NIC.  To have largesend 
and checksum offload enabled end to end, from VM up to Physical NIC, 
ibmveth needs to support these offload capabilities when configured in 
"Trunk" mode too.


Before this patch, when a VM communicates with external network (in a 
configuration similar to above), throughput numbers were not so good 
(~1.5 Gbps) and with the patch, I see ~9.4 Gbps throughput for a 10G NIC 
(iperf used for measurements).


On 4/9/2017 12:15 AM, David Miller wrote:

From: Sivakumar Krishnasamy 
Date: Fri,  7 Apr 2017 05:57:59 -0400


Enable largesend and checksum offload for ibmveth configured in trunk mode.
Added support to SKB frag_list in TX path by skb_linearize'ing such SKBs.

Signed-off-by: Sivakumar Krishnasamy 

Why is linearization necessary?

It would seem that the gains you get from GRO are nullified by
linearizing the SKB and thus copying all the data around and
allocating buffers.
When Physical NIC has GRO enabled and when OVS bridges these packets, 
OVS vport send code will end up calling /dev_queue_xmit/, which in turn 
calls /validate_xmit_skb/.


/validate_xmit_skb/ has the below code snippet,

   /if (netif_needs_gso(skb, features)) {//
   //struct sk_buff *segs;//
   //
   //segs = skb_gso_segment(skb, features); /<=== Segments the GSO
   packet into MTU sized segments.

When the OVS outbound vport is ibmveth, /netif_needs_gso/ returns 
positively if the SKB has a /frag_list/ and if the driver doesn't 
support the same (NETIF_F_FRAGLIST feature).  So all the packets 
received by ibmveth are of MSS size (or lesser) due to the above code.


On a 10G physical NIC, the maximum throughput achieved was 2.2 Gbps due 
to the above segmentation in /validate_xmit_skb/. With the patch to 
linearize the SKB, the throughput increased to 9 Gbps (and ibmveth 
received packets without being segmented). This is ~4X improvement even 
though we end up allocating buffers and copying data.


Finally, all of that new checksumming stuff looks extremely
suspicious.  You have to explain why that is happening and why it
isn't because this driver is doing something incorrectly.

Thanks.

We are now enabling support for OVS and improving bridging performance 
in IBM's PowerVM environment, which brings in these new offload 
requirements for ibmveth driver configured in Trunk mode.


Please let me know if you need more details.


Freescale mpc8315 IRQ0 setup

2017-04-10 Thread Juergen Schindele
Dear mailing list,
i found out on our platform with freescale mpc8315 SOC that in
linux kernel code the setup of IRQ0 which we use is not correct.
One should be able to use falling EDGE interrupt capabilities like on
IRQ1-IRQ7. These setups are fixed in "arch/powerpc/sysdev/ipic.c"
The internal interrupt number of IRQ0 is not like IRQ1-IRQ7 in one block
but on number 48. To verify details please consult MPC8315ERM.pdf
developpers manual.

To correct these "EDGE" capabilities of IRQ0 i suggest the following 
patch:
please consider integrating it to your patches.

--- arch/powerpc/sysdev/ipic.c  (Revision correct)
+++ arch/powerpc/sysdev/ipic.c  (Arbeitskopie)
@@ -316,6 +316,7 @@
.prio_mask = 7,
},
[48] = {
+   .ack= IPIC_SEPNR,
.mask   = IPIC_SEMSR,
.prio   = IPIC_SMPRR_A,
.force  = IPIC_SEFCR,

Thank you for your attention
-- 
i. A.
Jürgen Schindele
Softwareentwicklung

PSI Nentec GmbH
Greschbachstraße 12
76229 Karlsruhe
Deutschland
Telefon: +49 721 94249-51
Telefax: +49 721 94249-10
schind...@nentec.de
www.nentec.de

Geschäftsführung: Klaus Becker, Wolfgang Fischer
Sitz der Gesellschaft: Karlsruhe
Handelsregister: Amtsgericht Mannheim HRB 107658

Diese E-Mail enthält vertrauliche oder rechtlich geschützte 
Informationen. Wenn Sie nicht der vorgesehene Empfänger sind, 
informieren Sie bitte sofort den Absender und löschen Sie diese E-Mail. 
Das unbefugte Kopieren dieser E-Mail oder die unbefugte Weitergabe 
der enthaltenen Informationen ist nicht gestattet.

The information contained in this message is confidential or protected 
by law. If you are not the intended recipient, please contact the sender 
and delete this message. Any unauthorised copying of this message or 
unauthorised distribution of the information contained herein is 
prohibited. 


Re: WARN @lib/refcount.c:128 during hot unplug of I/O adapter.

2017-04-10 Thread Tyrel Datwyler
On 04/06/2017 09:04 PM, Michael Ellerman wrote:
> Tyrel Datwyler  writes:
> 
>> On 04/06/2017 03:27 AM, Sachin Sant wrote:
>>> On a POWER8 LPAR running 4.11.0-rc5, a hot unplug operation on
>>> any I/O adapter results in the following warning
>>>
>>> This problem has been in the code for some time now. I had first seen this 
>>> in
>>> -next tree.
>>>



>>> Have attached the dmesg log from the system. Let me know if any additional
>>> information is required to help debug this problem.
>>
>> I remember you mentioning this when the issue was brought up for CPUs. I
>> assume the case is the same here where the issue is only seen with
>> adapters that were hot-added after boot (ie. hot-remove of adapter
>> present at boot doesn't trip the warning)?
> 
> So who's fixing this?

I started looking at it when Bharata submitted a patch trying to fix the
issue for CPUs, but got side tracked by other things. I suspect that
this underflow has actually been an issue for quite some time, and we
are just now becoming aware of it thanks to the refcount_t patchset being
merged. I'll look into it again this week.

-Tyrel

> 
> cheers
> 



Re: [PATCH v3] powerpc: mm: support ARCH_MMAP_RND_BITS

2017-04-10 Thread Bhupesh Sharma
Hi Michael,

On Wed, Mar 29, 2017 at 1:15 AM, Bhupesh Sharma  wrote:
> powerpc arch_mmap_rnd() currently uses hard-coded values - (23-PAGE_SHIFT) for
> 32-bit and (30-PAGE_SHIFT) for 64-bit, to generate the random offset
> for the mmap base address for a ASLR ELF.
>
> This patch makes sure that powerpc mmap arch_mmap_rnd() implementation
> is similar to other ARCHs (like x86, arm64) and uses mmap_rnd_bits
> and helpers to generate the mmap address randomization.
>
> The maximum and minimum randomization range values represent
> a compromise between increased ASLR effectiveness and avoiding
> address-space fragmentation.
>
> Using the Kconfig option and suitable /proc tunable, platform
> developers may choose where to place this compromise.
>
> Also this patch keeps the default values as new minimums.
>
> Signed-off-by: Bhupesh Sharma 
> Reviewed-by: Kees Cook 
> ---
> * Changes since v2:
> v2 can be seen here (https://patchwork.kernel.org/patch/9551509/)
> - Changed a few minimum and maximum randomization ranges as per Michael's 
> suggestion.
> - Corrected Kees's email address in the Reviewed-by line.
> - Added further comments in kconfig to explain how the address ranges 
> were worked out.
>
> * Changes since v1:
> v1 can be seen here 
> (https://lists.ozlabs.org/pipermail/linuxppc-dev/2017-February/153594.html)
> - No functional change in this patch.
> - Dropped PATCH 2/2 from v1 as recommended by Kees Cook.
>
>  arch/powerpc/Kconfig   | 44 
>  arch/powerpc/mm/mmap.c |  7 ---
>  2 files changed, 48 insertions(+), 3 deletions(-)
>
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 97a8bc8..84aae67 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -22,6 +22,48 @@ config MMU
> bool
> default y
>
> +# min bits determined by the following formula:
> +# VA_BITS - PAGE_SHIFT - CONSTANT
> +# where,
> +#  VA_BITS = 46 bits for 64BIT and 4GB - 1 Page = 31 bits for 32BIT
> +#  CONSTANT = 16 for 64BIT and 8 for 32BIT
> +config ARCH_MMAP_RND_BITS_MIN
> +   default 5 if PPC_256K_PAGES && 32BIT  # 31 - 18 - 8 = 5
> +   default 7 if PPC_64K_PAGES && 32BIT   # 31 - 16 - 8 = 7
> +   default 9 if PPC_16K_PAGES && 32BIT   # 31 - 14 - 8 = 9
> +   default 11 if PPC_4K_PAGES && 32BIT   # 31 - 12 - 8 = 11
> +   default 12 if PPC_256K_PAGES && 64BIT # 46 - 18 - 16 = 12
> +   default 14 if PPC_64K_PAGES && 64BIT  # 46 - 16 - 16 = 14
> +   default 16 if PPC_16K_PAGES && 64BIT  # 46 - 14 - 16 = 16
> +   default 18 if PPC_4K_PAGES && 64BIT   # 46 - 12 - 16 = 18
> +
> +# max bits determined by the following formula:
> +# VA_BITS - PAGE_SHIFT - CONSTANT
> +# where,
> +#  VA_BITS = 46 bits for 64BIT, and 4GB - 1 Page = 31 bits for 32BIT
> +#  CONSTANT = 2, both for 64BIT and 32BIT
> +config ARCH_MMAP_RND_BITS_MAX
> +   default 11 if PPC_256K_PAGES && 32BIT # 31 - 18 - 2 = 11
> +   default 13 if PPC_64K_PAGES && 32BIT  # 31 - 16 - 2 = 13
> +   default 15 if PPC_16K_PAGES && 32BIT  # 31 - 14 - 2 = 15
> +   default 17 if PPC_4K_PAGES && 32BIT   # 31 - 12 - 2 = 17
> +   default 26 if PPC_256K_PAGES && 64BIT # 46 - 18 - 2 = 26
> +   default 28 if PPC_64K_PAGES && 64BIT  # 46 - 16 - 2 = 28
> +   default 30 if PPC_16K_PAGES && 64BIT  # 46 - 14 - 2 = 30
> +   default 32 if PPC_4K_PAGES && 64BIT   # 46 - 12 - 2 = 32
> +
> +config ARCH_MMAP_RND_COMPAT_BITS_MIN
> +   default 5 if PPC_256K_PAGES
> +   default 7 if PPC_64K_PAGES
> +   default 9 if PPC_16K_PAGES
> +   default 11
> +
> +config ARCH_MMAP_RND_COMPAT_BITS_MAX
> +   default 11 if PPC_256K_PAGES
> +   default 13 if PPC_64K_PAGES
> +   default 15 if PPC_16K_PAGES
> +   default 17
> +
>  config HAVE_SETUP_PER_CPU_AREA
> def_bool PPC64
>
> @@ -142,6 +184,8 @@ config PPC
> select HAVE_IRQ_EXIT_ON_IRQ_STACK
> select HAVE_KERNEL_GZIP
> select HAVE_KPROBES
> +   select HAVE_ARCH_MMAP_RND_BITS
> +   select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
> select HAVE_KRETPROBES
> select HAVE_LIVEPATCH   if 
> HAVE_DYNAMIC_FTRACE_WITH_REGS
> select HAVE_MEMBLOCK
> diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
> index a5d9ef5..92a9355 100644
> --- a/arch/powerpc/mm/mmap.c
> +++ b/arch/powerpc/mm/mmap.c
> @@ -61,11 +61,12 @@ unsigned long arch_mmap_rnd(void)
>  {
> unsigned long rnd;
>
> -   /* 8MB for 32bit, 1GB for 64bit */
> +#ifdef CONFIG_COMPAT
> if (is_32bit_task())
> -   rnd = get_random_long() % (1<<(23-PAGE_SHIFT));
> +   rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
> else
> -   rnd = get_random_long() % (1UL<<(30-PAGE_SHIFT));
> +#endif
> +   rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
>
> return 

Re: [PATCH V4 6/7] cxl: Isolate few psl8 specific calls

2017-04-10 Thread Frederic Barrat



Le 07/04/2017 à 16:11, Christophe Lombard a écrit :

Point out the specific Coherent Accelerator Interface Architecture,
level 1, registers.
Code and functions specific to PSL8 (CAIA1) must be framed.

Signed-off-by: Christophe Lombard 
---


There are a few changes in native.c which are about splitting long 
strings, but that's minor. And the rest looks ok.


I'll do the last patch tomorrow.

Acked-by: Frederic Barrat 



 drivers/misc/cxl/context.c | 28 +++-
 drivers/misc/cxl/cxl.h | 35 +++--
 drivers/misc/cxl/debugfs.c |  6 +++--
 drivers/misc/cxl/native.c  | 43 +--
 drivers/misc/cxl/pci.c | 64 +++---
 5 files changed, 120 insertions(+), 56 deletions(-)

diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index 2e935ea..ac2531e 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -39,23 +39,26 @@ int cxl_context_init(struct cxl_context *ctx, struct 
cxl_afu *afu, bool master)
 {
int i;

-   spin_lock_init(>sste_lock);
ctx->afu = afu;
ctx->master = master;
ctx->pid = NULL; /* Set in start work ioctl */
mutex_init(>mapping_lock);
ctx->mapping = NULL;

-   /*
-* Allocate the segment table before we put it in the IDR so that we
-* can always access it when dereferenced from IDR. For the same
-* reason, the segment table is only destroyed after the context is
-* removed from the IDR.  Access to this in the IOCTL is protected by
-* Linux filesytem symantics (can't IOCTL until open is complete).
-*/
-   i = cxl_alloc_sst(ctx);
-   if (i)
-   return i;
+   if (cxl_is_psl8(afu)) {
+   spin_lock_init(>sste_lock);
+
+   /*
+* Allocate the segment table before we put it in the IDR so 
that we
+* can always access it when dereferenced from IDR. For the same
+* reason, the segment table is only destroyed after the 
context is
+* removed from the IDR.  Access to this in the IOCTL is 
protected by
+* Linux filesytem symantics (can't IOCTL until open is 
complete).
+*/
+   i = cxl_alloc_sst(ctx);
+   if (i)
+   return i;
+   }

INIT_WORK(>fault_work, cxl_handle_fault);

@@ -308,7 +311,8 @@ static void reclaim_ctx(struct rcu_head *rcu)
 {
struct cxl_context *ctx = container_of(rcu, struct cxl_context, rcu);

-   free_page((u64)ctx->sstp);
+   if (cxl_is_psl8(ctx->afu))
+   free_page((u64)ctx->sstp);
if (ctx->ff_page)
__free_page(ctx->ff_page);
ctx->sstp = NULL;
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index a54c003..82335c0 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -73,7 +73,7 @@ static const cxl_p1_reg_t CXL_PSL_Control = {0x0020};
 static const cxl_p1_reg_t CXL_PSL_DLCNTL  = {0x0060};
 static const cxl_p1_reg_t CXL_PSL_DLADDR  = {0x0068};

-/* PSL Lookaside Buffer Management Area */
+/* PSL Lookaside Buffer Management Area - CAIA 1 */
 static const cxl_p1_reg_t CXL_PSL_LBISEL  = {0x0080};
 static const cxl_p1_reg_t CXL_PSL_SLBIE   = {0x0088};
 static const cxl_p1_reg_t CXL_PSL_SLBIA   = {0x0090};
@@ -82,7 +82,7 @@ static const cxl_p1_reg_t CXL_PSL_TLBIA   = {0x00A8};
 static const cxl_p1_reg_t CXL_PSL_AFUSEL  = {0x00B0};

 /* 0x00C0:7EFF Implementation dependent area */
-/* PSL registers */
+/* PSL registers - CAIA 1 */
 static const cxl_p1_reg_t CXL_PSL_FIR1  = {0x0100};
 static const cxl_p1_reg_t CXL_PSL_FIR2  = {0x0108};
 static const cxl_p1_reg_t CXL_PSL_Timebase  = {0x0110};
@@ -109,7 +109,7 @@ static const cxl_p1n_reg_t CXL_PSL_AMBAR_An   = {0x10};
 static const cxl_p1n_reg_t CXL_PSL_SPOffset_An= {0x18};
 static const cxl_p1n_reg_t CXL_PSL_ID_An  = {0x20};
 static const cxl_p1n_reg_t CXL_PSL_SERR_An= {0x28};
-/* Memory Management and Lookaside Buffer Management */
+/* Memory Management and Lookaside Buffer Management - CAIA 1*/
 static const cxl_p1n_reg_t CXL_PSL_SDR_An = {0x30};
 static const cxl_p1n_reg_t CXL_PSL_AMOR_An= {0x38};
 /* Pointer Area */
@@ -124,6 +124,7 @@ static const cxl_p1n_reg_t CXL_PSL_IVTE_Limit_An  = {0xB8};
 /* 0xC0:FF Implementation Dependent Area */
 static const cxl_p1n_reg_t CXL_PSL_FIR_SLICE_An   = {0xC0};
 static const cxl_p1n_reg_t CXL_AFU_DEBUG_An   = {0xC8};
+/* 0xC0:FF Implementation Dependent Area - CAIA 1 */
 static const cxl_p1n_reg_t CXL_PSL_APCALLOC_A = {0xD0};
 static const cxl_p1n_reg_t CXL_PSL_COALLOC_A  = {0xD8};
 static const cxl_p1n_reg_t CXL_PSL_RXCTL_A= {0xE0};
@@ -133,12 +134,14 @@ static const cxl_p1n_reg_t CXL_PSL_SLICE_TRACE= 
{0xE8};
 /* Configuration and Control Area */
 static 

Re: [PATCH V4 5/7] cxl: Rename some psl8 specific functions

2017-04-10 Thread Frederic Barrat



Le 07/04/2017 à 16:11, Christophe Lombard a écrit :

Rename a few functions, changing the '_psl' suffix to '_psl8', to make
clear that the implementation is psl8 specific.
Those functions will have an equivalent implementation for the psl9 in
a later patch.

Signed-off-by: Christophe Lombard 
---


Acked-by: Frederic Barrat 



 drivers/misc/cxl/cxl.h | 26 ++--
 drivers/misc/cxl/debugfs.c |  6 ++---
 drivers/misc/cxl/guest.c   |  2 +-
 drivers/misc/cxl/irq.c |  2 +-
 drivers/misc/cxl/native.c  | 12 +-
 drivers/misc/cxl/pci.c | 60 +++---
 6 files changed, 54 insertions(+), 54 deletions(-)

diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index 626073d..a54c003 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -813,10 +813,10 @@ int afu_register_irqs(struct cxl_context *ctx, u32 count);
 void afu_release_irqs(struct cxl_context *ctx, void *cookie);
 void afu_irq_name_free(struct cxl_context *ctx);

-int cxl_attach_afu_directed_psl(struct cxl_context *ctx, u64 wed, u64 amr);
-int cxl_activate_dedicated_process_psl(struct cxl_afu *afu);
-int cxl_attach_dedicated_process_psl(struct cxl_context *ctx, u64 wed, u64 
amr);
-void cxl_update_dedicated_ivtes_psl(struct cxl_context *ctx);
+int cxl_attach_afu_directed_psl8(struct cxl_context *ctx, u64 wed, u64 amr);
+int cxl_activate_dedicated_process_psl8(struct cxl_afu *afu);
+int cxl_attach_dedicated_process_psl8(struct cxl_context *ctx, u64 wed, u64 
amr);
+void cxl_update_dedicated_ivtes_psl8(struct cxl_context *ctx);

 #ifdef CONFIG_DEBUG_FS

@@ -826,10 +826,10 @@ int cxl_debugfs_adapter_add(struct cxl *adapter);
 void cxl_debugfs_adapter_remove(struct cxl *adapter);
 int cxl_debugfs_afu_add(struct cxl_afu *afu);
 void cxl_debugfs_afu_remove(struct cxl_afu *afu);
-void cxl_stop_trace_psl(struct cxl *cxl);
-void cxl_debugfs_add_adapter_regs_psl(struct cxl *adapter, struct dentry *dir);
+void cxl_stop_trace_psl8(struct cxl *cxl);
+void cxl_debugfs_add_adapter_regs_psl8(struct cxl *adapter, struct dentry 
*dir);
 void cxl_debugfs_add_adapter_regs_xsl(struct cxl *adapter, struct dentry *dir);
-void cxl_debugfs_add_afu_regs_psl(struct cxl_afu *afu, struct dentry *dir);
+void cxl_debugfs_add_afu_regs_psl8(struct cxl_afu *afu, struct dentry *dir);

 #else /* CONFIG_DEBUG_FS */

@@ -860,11 +860,11 @@ static inline void cxl_debugfs_afu_remove(struct cxl_afu 
*afu)
 {
 }

-static inline void cxl_stop_trace(struct cxl *cxl)
+static inline void cxl_stop_trace_psl8(struct cxl *cxl)
 {
 }

-static inline void cxl_debugfs_add_adapter_regs_psl(struct cxl *adapter,
+static inline void cxl_debugfs_add_adapter_regs_psl8(struct cxl *adapter,
struct dentry *dir)
 {
 }
@@ -874,7 +874,7 @@ static inline void cxl_debugfs_add_adapter_regs_xsl(struct 
cxl *adapter,
 {
 }

-static inline void cxl_debugfs_add_afu_regs_psl(struct cxl_afu *afu, struct 
dentry *dir)
+static inline void cxl_debugfs_add_afu_regs_psl8(struct cxl_afu *afu, struct 
dentry *dir)
 {
 }

@@ -919,8 +919,8 @@ struct cxl_irq_info {
 };

 void cxl_assign_psn_space(struct cxl_context *ctx);
-int cxl_invalidate_all_psl(struct cxl *adapter);
-irqreturn_t cxl_irq_psl(int irq, struct cxl_context *ctx, struct cxl_irq_info 
*irq_info);
+int cxl_invalidate_all_psl8(struct cxl *adapter);
+irqreturn_t cxl_irq_psl8(int irq, struct cxl_context *ctx, struct cxl_irq_info 
*irq_info);
 irqreturn_t cxl_fail_irq_psl(struct cxl_afu *afu, struct cxl_irq_info 
*irq_info);
 int cxl_register_one_irq(struct cxl *adapter, irq_handler_t handler,
void *cookie, irq_hw_number_t *dest_hwirq,
@@ -932,7 +932,7 @@ int cxl_data_cache_flush(struct cxl *adapter);
 int cxl_afu_disable(struct cxl_afu *afu);
 int cxl_psl_purge(struct cxl_afu *afu);

-void cxl_native_irq_dump_regs_psl(struct cxl_context *ctx);
+void cxl_native_irq_dump_regs_psl8(struct cxl_context *ctx);
 void cxl_native_err_irq_dump_regs(struct cxl *adapter);
 int cxl_pci_vphb_add(struct cxl_afu *afu);
 void cxl_pci_vphb_remove(struct cxl_afu *afu);
diff --git a/drivers/misc/cxl/debugfs.c b/drivers/misc/cxl/debugfs.c
index 4848ebf..2ff10a9 100644
--- a/drivers/misc/cxl/debugfs.c
+++ b/drivers/misc/cxl/debugfs.c
@@ -15,7 +15,7 @@

 static struct dentry *cxl_debugfs;

-void cxl_stop_trace_psl(struct cxl *adapter)
+void cxl_stop_trace_psl8(struct cxl *adapter)
 {
int slice;

@@ -53,7 +53,7 @@ static struct dentry *debugfs_create_io_x64(const char *name, 
umode_t mode,
  (void __force *)value, _io_x64);
 }

-void cxl_debugfs_add_adapter_regs_psl(struct cxl *adapter, struct dentry *dir)
+void cxl_debugfs_add_adapter_regs_psl8(struct cxl *adapter, struct dentry *dir)
 {
debugfs_create_io_x64("fir1", S_IRUSR, dir, _cxl_p1_addr(adapter, 
CXL_PSL_FIR1));
debugfs_create_io_x64("fir2", S_IRUSR, dir, 

Re: [PATCH V4 4/7] cxl: Update implementation service layer

2017-04-10 Thread Frederic Barrat


Le 07/04/2017 à 16:11, Christophe Lombard a écrit :

The service layer API (in cxl.h) lists some low-level functions whose
implementation is different on PSL8, PSL9 and XSL:
- Init implementation for the adapter and the afu.
- Invalidate TLB/SLB.
- Attach process for dedicated/directed models.
- Handle psl interrupts.
- Debug registers for the adapter and the afu.
- Traces.
Each environment implements its own functions, and the common code uses
them through function pointers, defined in cxl_service_layer_ops.

Signed-off-by: Christophe Lombard 
---



Acked-by: Frederic Barrat 



 drivers/misc/cxl/cxl.h | 40 +++--
 drivers/misc/cxl/debugfs.c | 16 +++---
 drivers/misc/cxl/guest.c   |  2 +-
 drivers/misc/cxl/irq.c |  2 +-
 drivers/misc/cxl/native.c  | 54 ++---
 drivers/misc/cxl/pci.c | 55 +-
 6 files changed, 110 insertions(+), 59 deletions(-)

diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index 4bcbf7a..626073d 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -553,13 +553,23 @@ struct cxl_context {
struct mm_struct *mm;
 };

+struct cxl_irq_info;
+
 struct cxl_service_layer_ops {
int (*adapter_regs_init)(struct cxl *adapter, struct pci_dev *dev);
+   int (*invalidate_all)(struct cxl *adapter);
int (*afu_regs_init)(struct cxl_afu *afu);
+   int (*sanitise_afu_regs)(struct cxl_afu *afu);
int (*register_serr_irq)(struct cxl_afu *afu);
void (*release_serr_irq)(struct cxl_afu *afu);
-   void (*debugfs_add_adapter_sl_regs)(struct cxl *adapter, struct dentry 
*dir);
-   void (*debugfs_add_afu_sl_regs)(struct cxl_afu *afu, struct dentry 
*dir);
+   irqreturn_t (*handle_interrupt)(int irq, struct cxl_context *ctx, 
struct cxl_irq_info *irq_info);
+   irqreturn_t (*fail_irq)(struct cxl_afu *afu, struct cxl_irq_info 
*irq_info);
+   int (*activate_dedicated_process)(struct cxl_afu *afu);
+   int (*attach_afu_directed)(struct cxl_context *ctx, u64 wed, u64 amr);
+   int (*attach_dedicated_process)(struct cxl_context *ctx, u64 wed, u64 
amr);
+   void (*update_dedicated_ivtes)(struct cxl_context *ctx);
+   void (*debugfs_add_adapter_regs)(struct cxl *adapter, struct dentry 
*dir);
+   void (*debugfs_add_afu_regs)(struct cxl_afu *afu, struct dentry *dir);
void (*psl_irq_dump_registers)(struct cxl_context *ctx);
void (*err_irq_dump_registers)(struct cxl *adapter);
void (*debugfs_stop_trace)(struct cxl *adapter);
@@ -803,6 +813,11 @@ int afu_register_irqs(struct cxl_context *ctx, u32 count);
 void afu_release_irqs(struct cxl_context *ctx, void *cookie);
 void afu_irq_name_free(struct cxl_context *ctx);

+int cxl_attach_afu_directed_psl(struct cxl_context *ctx, u64 wed, u64 amr);
+int cxl_activate_dedicated_process_psl(struct cxl_afu *afu);
+int cxl_attach_dedicated_process_psl(struct cxl_context *ctx, u64 wed, u64 
amr);
+void cxl_update_dedicated_ivtes_psl(struct cxl_context *ctx);
+
 #ifdef CONFIG_DEBUG_FS

 int cxl_debugfs_init(void);
@@ -811,10 +826,10 @@ int cxl_debugfs_adapter_add(struct cxl *adapter);
 void cxl_debugfs_adapter_remove(struct cxl *adapter);
 int cxl_debugfs_afu_add(struct cxl_afu *afu);
 void cxl_debugfs_afu_remove(struct cxl_afu *afu);
-void cxl_stop_trace(struct cxl *cxl);
-void cxl_debugfs_add_adapter_psl_regs(struct cxl *adapter, struct dentry *dir);
-void cxl_debugfs_add_adapter_xsl_regs(struct cxl *adapter, struct dentry *dir);
-void cxl_debugfs_add_afu_psl_regs(struct cxl_afu *afu, struct dentry *dir);
+void cxl_stop_trace_psl(struct cxl *cxl);
+void cxl_debugfs_add_adapter_regs_psl(struct cxl *adapter, struct dentry *dir);
+void cxl_debugfs_add_adapter_regs_xsl(struct cxl *adapter, struct dentry *dir);
+void cxl_debugfs_add_afu_regs_psl(struct cxl_afu *afu, struct dentry *dir);

 #else /* CONFIG_DEBUG_FS */

@@ -849,17 +864,17 @@ static inline void cxl_stop_trace(struct cxl *cxl)
 {
 }

-static inline void cxl_debugfs_add_adapter_psl_regs(struct cxl *adapter,
+static inline void cxl_debugfs_add_adapter_regs_psl(struct cxl *adapter,
struct dentry *dir)
 {
 }

-static inline void cxl_debugfs_add_adapter_xsl_regs(struct cxl *adapter,
+static inline void cxl_debugfs_add_adapter_regs_xsl(struct cxl *adapter,
struct dentry *dir)
 {
 }

-static inline void cxl_debugfs_add_afu_psl_regs(struct cxl_afu *afu, struct 
dentry *dir)
+static inline void cxl_debugfs_add_afu_regs_psl(struct cxl_afu *afu, struct 
dentry *dir)
 {
 }

@@ -904,19 +919,20 @@ struct cxl_irq_info {
 };

 void cxl_assign_psn_space(struct cxl_context *ctx);
-irqreturn_t cxl_irq(int irq, struct cxl_context *ctx, struct cxl_irq_info 
*irq_info);
+int cxl_invalidate_all_psl(struct cxl *adapter);

Re: [PATCH V4 3/7] cxl: Keep track of mm struct associated with a context

2017-04-10 Thread Frederic Barrat



Le 07/04/2017 à 16:11, Christophe Lombard a écrit :

The mm_struct corresponding to the current task is acquired each time
an interrupt is raised. So to simplify the code, we only get the
mm_struct when attaching an AFU context to the process.
The mm_count reference is increased to ensure that the mm_struct can't
be freed. The mm_struct will be released when the context is detached.
A reference on mm_users is not kept to avoid a circular dependency if
the process mmaps its cxl mmio and forgets to unmap before exiting.
The field glpid (pid of the group leader associated with the pid), of
the structure cxl_context, is removed because it's no longer useful.

Signed-off-by: Christophe Lombard 
---


Thanks for the update, I think it looks good now.

Acked-by: Frederic Barrat 



 drivers/misc/cxl/api.c | 17 +--
 drivers/misc/cxl/context.c | 21 +++--
 drivers/misc/cxl/cxl.h | 10 --
 drivers/misc/cxl/fault.c   | 76 --
 drivers/misc/cxl/file.c| 15 +++--
 drivers/misc/cxl/main.c| 12 ++--
 6 files changed, 61 insertions(+), 90 deletions(-)

diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c
index bcc030e..1a138c8 100644
--- a/drivers/misc/cxl/api.c
+++ b/drivers/misc/cxl/api.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 

 #include "cxl.h"

@@ -321,19 +322,29 @@ int cxl_start_context(struct cxl_context *ctx, u64 wed,

if (task) {
ctx->pid = get_task_pid(task, PIDTYPE_PID);
-   ctx->glpid = get_task_pid(task->group_leader, PIDTYPE_PID);
kernel = false;
ctx->real_mode = false;
+
+   /* acquire a reference to the task's mm */
+   ctx->mm = get_task_mm(current);
+
+   /* ensure this mm_struct can't be freed */
+   cxl_context_mm_count_get(ctx);
+
+   /* decrement the use count */
+   if (ctx->mm)
+   mmput(ctx->mm);
}

cxl_ctx_get();

if ((rc = cxl_ops->attach_process(ctx, kernel, wed, 0))) {
-   put_pid(ctx->glpid);
put_pid(ctx->pid);
-   ctx->glpid = ctx->pid = NULL;
+   ctx->pid = NULL;
cxl_adapter_context_put(ctx->afu->adapter);
cxl_ctx_put();
+   if (task)
+   cxl_context_mm_count_put(ctx);
goto out;
}

diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index 062bf6c..2e935ea 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -41,7 +42,7 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu 
*afu, bool master)
spin_lock_init(>sste_lock);
ctx->afu = afu;
ctx->master = master;
-   ctx->pid = ctx->glpid = NULL; /* Set in start work ioctl */
+   ctx->pid = NULL; /* Set in start work ioctl */
mutex_init(>mapping_lock);
ctx->mapping = NULL;

@@ -242,12 +243,16 @@ int __detach_context(struct cxl_context *ctx)

/* release the reference to the group leader and mm handling pid */
put_pid(ctx->pid);
-   put_pid(ctx->glpid);

cxl_ctx_put();

/* Decrease the attached context count on the adapter */
cxl_adapter_context_put(ctx->afu->adapter);
+
+   /* Decrease the mm count on the context */
+   cxl_context_mm_count_put(ctx);
+   ctx->mm = NULL;
+
return 0;
 }

@@ -325,3 +330,15 @@ void cxl_context_free(struct cxl_context *ctx)
mutex_unlock(>afu->contexts_lock);
call_rcu(>rcu, reclaim_ctx);
 }
+
+void cxl_context_mm_count_get(struct cxl_context *ctx)
+{
+   if (ctx->mm)
+   atomic_inc(>mm->mm_count);
+}
+
+void cxl_context_mm_count_put(struct cxl_context *ctx)
+{
+   if (ctx->mm)
+   mmdrop(ctx->mm);
+}
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index 36bc213..4bcbf7a 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -482,8 +482,6 @@ struct cxl_context {
unsigned int sst_size, sst_lru;

wait_queue_head_t wq;
-   /* pid of the group leader associated with the pid */
-   struct pid *glpid;
/* use mm context associated with this pid for ds faults */
struct pid *pid;
spinlock_t lock; /* Protects pending_irq_mask, pending_fault and 
fault_addr */
@@ -551,6 +549,8 @@ struct cxl_context {
 * CX4 only:
 */
struct list_head extra_irq_contexts;
+
+   struct mm_struct *mm;
 };

 struct cxl_service_layer_ops {
@@ -1012,4 +1012,10 @@ int cxl_adapter_context_lock(struct cxl *adapter);
 /* Unlock the contexts-lock if taken. Warn and force unlock otherwise */
 void cxl_adapter_context_unlock(struct cxl *adapter);

+/* Increases the reference 

Re: [PATCH V4 2/7] cxl: Remove unused values in bare-metal environment.

2017-04-10 Thread Frederic Barrat



Le 07/04/2017 à 16:11, Christophe Lombard a écrit :

The two fields pid and tid, located in the structure
cxl_irq_info, are only used in the guest environment. To avoid confusion,
it's not necessary to fill the fields in the bare-metal environment.
Pid_tid is now renamed to 'reserved' to avoid undefined behavior on
bare-metal. The PSL Process and Thread Identification Register
(CXL_PSL_PID_TID_An) is only used when attaching a dedicated process
for PSL8 only. This register goes away in CAIA2.

Signed-off-by: Christophe Lombard 
---


Acked-by: Frederic Barrat 




 drivers/misc/cxl/cxl.h| 20 
 drivers/misc/cxl/hcalls.c |  6 +++---
 drivers/misc/cxl/native.c |  5 -
 3 files changed, 7 insertions(+), 24 deletions(-)

diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index 79e60ec..36bc213 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -888,27 +888,15 @@ int __detach_context(struct cxl_context *ctx);
 /*
  * This must match the layout of the H_COLLECT_CA_INT_INFO retbuf defined
  * in PAPR.
- * A word about endianness: a pointer to this structure is passed when
- * calling the hcall. However, it is not a block of memory filled up by
- * the hypervisor. The return values are found in registers, and copied
- * one by one when returning from the hcall. See the end of the call to
- * plpar_hcall9() in hvCall.S
- * As a consequence:
- * - we don't need to do any endianness conversion
- * - the pid and tid are an exception. They are 32-bit values returned in
- *   the same 64-bit register. So we do need to worry about byte ordering.
+ * Field pid_tid is now 'reserved' because it's no more used on bare-metal.
+ * On a guest environment, PSL_PID_An is located on the upper 32 bits and
+ * PSL_TID_An register in the lower 32 bits.
  */
 struct cxl_irq_info {
u64 dsisr;
u64 dar;
u64 dsr;
-#ifndef CONFIG_CPU_LITTLE_ENDIAN
-   u32 pid;
-   u32 tid;
-#else
-   u32 tid;
-   u32 pid;
-#endif
+   u64 reserved;
u64 afu_err;
u64 errstat;
u64 proc_handle;
diff --git a/drivers/misc/cxl/hcalls.c b/drivers/misc/cxl/hcalls.c
index d6d11f4..9b8bb0f 100644
--- a/drivers/misc/cxl/hcalls.c
+++ b/drivers/misc/cxl/hcalls.c
@@ -413,9 +413,9 @@ long cxl_h_collect_int_info(u64 unit_address, u64 
process_token,

switch (rc) {
case H_SUCCESS: /* The interrupt info is returned in return 
registers. */
-   pr_devel("dsisr:%#llx, dar:%#llx, dsr:%#llx, pid:%u, tid:%u, 
afu_err:%#llx, errstat:%#llx\n",
-   info->dsisr, info->dar, info->dsr, info->pid,
-   info->tid, info->afu_err, info->errstat);
+   pr_devel("dsisr:%#llx, dar:%#llx, dsr:%#llx, pid_tid:%#llx, 
afu_err:%#llx, errstat:%#llx\n",
+   info->dsisr, info->dar, info->dsr, info->reserved,
+   info->afu_err, info->errstat);
return 0;
case H_PARAMETER:   /* An incorrect parameter was supplied. */
return -EINVAL;
diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index 7ae7105..7257e8b 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -859,8 +859,6 @@ static int native_detach_process(struct cxl_context *ctx)

 static int native_get_irq_info(struct cxl_afu *afu, struct cxl_irq_info *info)
 {
-   u64 pidtid;
-
/* If the adapter has gone away, we can't get any meaningful
 * information.
 */
@@ -870,9 +868,6 @@ static int native_get_irq_info(struct cxl_afu *afu, struct 
cxl_irq_info *info)
info->dsisr = cxl_p2n_read(afu, CXL_PSL_DSISR_An);
info->dar = cxl_p2n_read(afu, CXL_PSL_DAR_An);
info->dsr = cxl_p2n_read(afu, CXL_PSL_DSR_An);
-   pidtid = cxl_p2n_read(afu, CXL_PSL_PID_TID_An);
-   info->pid = pidtid >> 32;
-   info->tid = pidtid & 0x;
info->afu_err = cxl_p2n_read(afu, CXL_AFU_ERR_An);
info->errstat = cxl_p2n_read(afu, CXL_PSL_ErrStat_An);
info->proc_handle = 0;





Re: [PATCH V4 1/7] cxl: Read vsec perst load image

2017-04-10 Thread Frederic Barrat



Le 07/04/2017 à 16:11, Christophe Lombard a écrit :

This bit is used to cause a flash image load for programmable
CAIA-compliant implementation. If this bit is set to '0', a power
cycle of the adapter is required to load a programmable CAIA-compliant
implementation from flash.
This field will be used by the following patches.

Signed-off-by: Christophe Lombard 
---


Acked-by: Frederic Barrat 



 drivers/misc/cxl/pci.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index b27ea98..1f4c351 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -1332,6 +1332,7 @@ static int cxl_read_vsec(struct cxl *adapter, struct 
pci_dev *dev)
CXL_READ_VSEC_IMAGE_STATE(dev, vsec, _state);
adapter->user_image_loaded = !!(image_state & 
CXL_VSEC_USER_IMAGE_LOADED);
adapter->perst_select_user = !!(image_state & 
CXL_VSEC_USER_IMAGE_LOADED);
+   adapter->perst_loads_image = !!(image_state & 
CXL_VSEC_PERST_LOADS_IMAGE);

CXL_READ_VSEC_NAFUS(dev, vsec, >slices);
CXL_READ_VSEC_AFU_DESC_OFF(dev, vsec, _desc_off);





Re: [PATCH] ppc64/kprobe: Fix oops when kprobed on 'stdu' instruction

2017-04-10 Thread Naveen N. Rao

Excerpts from Ravi Bangoria's message of April 10, 2017 20:50:
If we set a kprobe on a 'stdu' instruction on powerpc64, we see a kernel 
OOPS:


  [ 1275.165932] Bad kernel stack pointer cd93c840 at c0009868
  [ 1275.166378] Oops: Bad kernel stack pointer, sig: 6 [#1]
  ...
  GPR00: c01fcd93cb30 cd93c840 c15c5e00 cd93c840
  ...
  [ 1275.178305] NIP [c0009868] resume_kernel+0x2c/0x58
  [ 1275.178594] LR [c0006208] program_check_common+0x108/0x180

Basically, on 64 bit system, when user probes on 'stdu' instruction,
kernel does not emulate actual store in emulate_step itself because it
may corrupt exception frame. So kernel does actual store operation in
exception return code i.e. resume_kernel().

resume_kernel() loads the saved stack pointer from memory using lwz,
effectively loading a corrupt (32bit) address, causing the kernel crash.

Fix this by loading the 64bit value instead.


Thanks for fixing this!



Fixes: 8e9f69371536 ("powerpc/kprobe: Don't emulate store when kprobe stwu r1")


I think this should really be:
Fixes: be96f63375a1 ("powerpc: Split out instruction analysis part of 
emulate_step()")


...since the original commit just handled stwu on powerpc64 as well. In 
some ways, the 64-bit part of that commit wasn't that useful, but it 
never addressed stdu directly.



Signed-off-by: Ravi Bangoria 
---
History:
  Commit 8e9f69371536 ("powerpc/kprobe: Don't emulate store when kprobe
  stwu r1") fixed exception frame corruption for 32 bit system which uses
  'stwu' instruction for stack frame allocation. This commit also added
  code for 64 bit system but did not enabled it for 'stdu' instruction.
  So 'stdu' instruction on 64 bit machine was emulating actual store in
  emulate_step() itself until...

  Commit be96f63375a1 ("powerpc: Split out instruction analysis part of
  emulate_step()"), enabled it for 'stdu' instruction on 64 bit machine.

  Since then it's broken. So this should also go into stable.


Hmm... so I think kprobe on 'stdu' has always been broken on powerpc64.  
We haven't noticed since most stdu operations were probably landing in 
the red zone so the exception frame never got corrupted. In that sense, 
this fix is needed for BE ever since load/store emulation was added.


For LE, this is only getting exposed now due to your recent patch to 
enable load/store emulation on LE.




 arch/powerpc/kernel/entry_64.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 6432d4b..530f6e9 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -689,7 +689,7 @@ resume_kernel:

addir8,r1,INT_FRAME_SIZE/* Get the kprobed function entry */

-   lwz r3,GPR1(r1)
+   ld  r3,GPR1(r1)
subir3,r3,INT_FRAME_SIZE/* dst: Allocate a trampoline exception 
frame */
mr  r4,r1   /* src:  current exception frame */
mr  r1,r3   /* Reroute the trampoline frame to r1 */
@@ -704,7 +704,7 @@ resume_kernel:
bdnz2b

/* Do real store operation to complete stwu */


Can you also update the above comment to refer to 'stdu'?
Apart from that, for this patch:
Reviewed-by: Naveen N. Rao 

- Naveen



-   lwz r5,GPR1(r1)
+   ld  r5,GPR1(r1)
std r8,0(r5)

/* Clear _TIF_EMULATE_STACK_STORE flag */
--
1.9.3






Re: [PATCH v5 01/15] stacktrace/x86: add function for detecting reliable stack traces

2017-04-10 Thread Petr Mladek
On Mon 2017-02-13 19:42:28, Josh Poimboeuf wrote:
> For live patching and possibly other use cases, a stack trace is only
> useful if it can be assured that it's completely reliable.  Add a new
> save_stack_trace_tsk_reliable() function to achieve that.
> 
> Note that if the target task isn't the current task, and the target task
> is allowed to run, then it could be writing the stack while the unwinder
> is reading it, resulting in possible corruption.  So the caller of
> save_stack_trace_tsk_reliable() must ensure that the task is either
> 'current' or inactive.
> 
> save_stack_trace_tsk_reliable() relies on the x86 unwinder's detection
> of pt_regs on the stack.  If the pt_regs are not user-mode registers
> from a syscall, then they indicate an in-kernel interrupt or exception
> (e.g. preemption or a page fault), in which case the stack is considered
> unreliable due to the nature of frame pointers.
> 
> It also relies on the x86 unwinder's detection of other issues, such as:
> 
> - corrupted stack data
> - stack grows the wrong way
> - stack walk doesn't reach the bottom
> - user didn't provide a large enough entries array
> 
> Such issues are reported by checking unwind_error() and !unwind_done().
> 
> Also add CONFIG_HAVE_RELIABLE_STACKTRACE so arch-independent code can
> determine at build time whether the function is implemented.
> 
> Signed-off-by: Josh Poimboeuf 

Just for record, this version looks fine to me:

Reviewed-by: Petr Mladek 

Best Regards,
Petr

PS: I was on the sick leave longer then expected. The patch set
has been pushed into the for-4.12 branch in jikos/livepatching.git
in the meantime. I check it there just for completeness. You do not
need to add my Reviewed-by tags.


[PATCH] ppc64/kprobe: Fix oops when kprobed on 'stdu' instruction

2017-04-10 Thread Ravi Bangoria
If we set a kprobe on a 'stdu' instruction on powerpc64, we see a kernel 
OOPS:

  [ 1275.165932] Bad kernel stack pointer cd93c840 at c0009868
  [ 1275.166378] Oops: Bad kernel stack pointer, sig: 6 [#1]
  ...
  GPR00: c01fcd93cb30 cd93c840 c15c5e00 cd93c840
  ...
  [ 1275.178305] NIP [c0009868] resume_kernel+0x2c/0x58
  [ 1275.178594] LR [c0006208] program_check_common+0x108/0x180

Basically, on 64 bit system, when user probes on 'stdu' instruction,
kernel does not emulate actual store in emulate_step itself because it
may corrupt exception frame. So kernel does actual store operation in
exception return code i.e. resume_kernel().

resume_kernel() loads the saved stack pointer from memory using lwz,
effectively loading a corrupt (32bit) address, causing the kernel crash.

Fix this by loading the 64bit value instead.

Fixes: 8e9f69371536 ("powerpc/kprobe: Don't emulate store when kprobe stwu r1")
Signed-off-by: Ravi Bangoria 
---
History:
  Commit 8e9f69371536 ("powerpc/kprobe: Don't emulate store when kprobe
  stwu r1") fixed exception frame corruption for 32 bit system which uses
  'stwu' instruction for stack frame allocation. This commit also added
  code for 64 bit system but did not enabled it for 'stdu' instruction.
  So 'stdu' instruction on 64 bit machine was emulating actual store in
  emulate_step() itself until...

  Commit be96f63375a1 ("powerpc: Split out instruction analysis part of
  emulate_step()"), enabled it for 'stdu' instruction on 64 bit machine.

  Since then it's broken. So this should also go into stable.

 arch/powerpc/kernel/entry_64.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 6432d4b..530f6e9 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -689,7 +689,7 @@ resume_kernel:
 
addir8,r1,INT_FRAME_SIZE/* Get the kprobed function entry */
 
-   lwz r3,GPR1(r1)
+   ld  r3,GPR1(r1)
subir3,r3,INT_FRAME_SIZE/* dst: Allocate a trampoline exception 
frame */
mr  r4,r1   /* src:  current exception frame */
mr  r1,r3   /* Reroute the trampoline frame to r1 */
@@ -704,7 +704,7 @@ resume_kernel:
bdnz2b
 
/* Do real store operation to complete stwu */
-   lwz r5,GPR1(r1)
+   ld  r5,GPR1(r1)
std r8,0(r5)
 
/* Clear _TIF_EMULATE_STACK_STORE flag */
-- 
1.9.3



Re: [PATCH 2/3] of/fdt: introduce of_scan_flat_dt_subnodes and of_get_flat_dt_phandle

2017-04-10 Thread Rob Herring
On Mon, Apr 10, 2017 at 12:43 AM, Nicholas Piggin  wrote:
> On Thu, 6 Apr 2017 09:09:41 -0500
> Rob Herring  wrote:
>
>> On Wed, Apr 5, 2017 at 7:38 PM, Nicholas Piggin  wrote:
>> > Given that it's quite a small addition to of/fdt code, hopefully
>> > that gives you a reasonable justification to accept it.
>> >
>> > If you prefer not to, that's okay, but I think we would have to carry
>> > it in arch/powerpc at least for a time, because of the schedule we're
>> > working to for POWER9 enablement. As a longer term item I agree with you
>> > and Ben, it would be worth considering unflattening earlier.
>>
>> As I mentioned, keeping it in arch/powerpc I like even less. So this is fine.
>
> Here is the patch with the change you suggested. Can I add your
> ack and send it via the powerpc tree with the change that uses
> these interfaces?

Acked-by: Rob Herring 


EEH error in doing DMA with PEX 8619

2017-04-10 Thread IanJiang
Hi all!

I am porting PLX driver for PEX 8619 to a power8 machine with CentOS-7.3.
The PEX 8619 is used as an NTB (Non-Transparent Bridge).

First, two DMA buffer are allocated with dma_alloc_coherent() and the
physical address are:
src: 0x _6060
dst: 0x _6080
Then, a DMA transfer is started and an EEH is reported in dmesg.

This DMA test is OK at an x86_64 platform.

Here are the details. Any suggestion is appreciated! 

[root@localhost ~]# uname -r
3.10.0-514.10.2.el7.ppc64le
[root@localhost ~]# cat /etc/system-release
CentOS Linux release 7.3.1611 (AltArch)
[root@localhost ~]# dmesg --clear
[root@localhost ~]# dmesg -w
[72579.982217] usb 1-1.3: USB disconnect, device number 61
[72581.516186] usb 1-1.3: new low-speed USB device number 62 using xhci_hcd
[72581.643767] usb 1-1.3: New USB device found, idVendor=04ca,
idProduct=0061
[72581.644045] usb 1-1.3: New USB device strings: Mfr=1, Product=2,
SerialNumber  =0
[72581.644135] usb 1-1.3: Product: USB Optical Mouse
[72581.644184] usb 1-1.3: Manufacturer: PixArt
[72581.680383] input: PixArt USB Optical Mouse as
/devices/pci0003:00/0003:00:00 
.0/0003:01:00.0/0003:02:09.0/0003:0d:00.0/usb1/1-1/1-1.3/1-1.3:1.0/input/input12
 
46
[72581.680806] hid-generic 0003:04CA:0061.04DF: input,hidraw1: USB HID v1.11
Mou  se [PixArt USB Optical Mouse]
on usb-0003:0d:00.0-1.3/input0

[72582.424769] Plx8000_NT:
<   
  
>
[72582.425013] Plx8000_NT: PLX 8000_NT driver v7.25 (64-bit)
[72582.425058] Plx8000_NT: Supports Linux kernel
v3.10.0-514.10.2.el7.ppc64le
[72582.425115] Plx8000_NT: Allocated global driver object (c03c8427cc00)
[72582.425120] Plx8000_NT: Registered driver (MajorID = 247)
[72582.425161] Plx8000_NT:
[72582.425167] Plx8000_NT: Probe: 8619 10B5 [D1 01:00.1]
[72582.425180] Plx8000_NT: Probe: -- Unsupported Device --
[72582.425204] Plx8000_NT:
[72582.425206] Plx8000_NT: Probe: 8619 10B5 [D1 02:01.0]
[72582.425222] Plx8000_NT: Enabled PCI device
[72582.425233] Plx8000_NT: Created Device (Plx8000_NT-0)
[72582.425235] Plx8000_NT: Start: 8619 10B5 [D1 02:01.0]
[72582.425237] Debug StartDevice 723: Reading PCI header command...
[72582.425385] Debug StartDevice 725: Reading PCI header command... =
0x100146
[72582.425445] Plx8000_NT:Resource 00
[72582.425447] Plx8000_NT:  Type : Memory
[72582.425452] Plx8000_NT:  PCI BAR 0: 8100
[72582.425454] Plx8000_NT:  Phys Addr: 3FE08100
[72582.425456] Plx8000_NT:  Size : 2h (128KB)
[72582.425458] Plx8000_NT:  Property : Non-Prefetchable 32-bit
[72582.425475] Plx8000_NT:  Kernel VA: d8008148
[72582.425478] Debug StartDevice 841: Read BAR0[0xd8008148] after
map...
[72582.425551] Debug StartDevice 843: Read BAR0[0xd8008148] after
map...   = 0x861910b5
[72582.425621] Plx8000_NT:Resource 01
[72582.425622] Plx8000_NT:  Type : Memory
[72582.425627] Plx8000_NT:  PCI BAR 2: 8000
[72582.425629] Plx8000_NT:  Phys Addr: 3FE08000
[72582.425631] Plx8000_NT:  Size : 40h (4MB)
[72582.425633] Plx8000_NT:  Property : Non-Prefetchable 32-bit
[72582.425639] Plx8000_NT:  Kernel VA: d8008400
[72582.425641] Debug StartDevice 849: Read BAR2[0xd8008400] after
map...
[72582.425727] Debug StartDevice 851: Read BAR2[0xd8008400] after
map...   = 0xf000eef3
[72582.425798] Plx8000_NT:Resource 02
[72582.425799] Plx8000_NT:  Type : Memory
[72582.425804] Plx8000_NT:  PCI BAR 3: 8040
[72582.425806] Plx8000_NT:  Phys Addr: 3FE08040
[72582.425808] Plx8000_NT:  Size : 40h (4MB)
[72582.425809] Plx8000_NT:  Property : Non-Prefetchable 32-bit
[72582.425813] Plx8000_NT:  Kernel VA: d8008480
[72582.425815] Plx8000_NT:Resource 03
[72582.425816] Plx8000_NT:  Type : Memory
[72582.425821] Plx8000_NT:  PCI BAR 4: 8080
[72582.425822] Plx8000_NT:  Phys Addr: 3FE08080
[72582.425824] Plx8000_NT:  Size : 40h (4MB)
[72582.425826] Plx8000_NT:  Property : Non-Prefetchable 32-bit
[72582.425830] Plx8000_NT:  Kernel VA: d8008500
[72582.425831] Plx8000_NT:Resource 04
[72582.425832] Plx8000_NT:  Type : Memory
[72582.425837] Plx8000_NT:  PCI BAR 5: 80C0
[72582.425839] Plx8000_NT:  Phys Addr: 3FE080C0
[72582.425841] Plx8000_NT:  Size : 40h (4MB)
[72582.425842] Plx8000_NT:  Property : Non-Prefetchable 32-bit
[72582.425846] Plx8000_NT:  Kernel VA: d8008580
[72582.425848] Debug StartDevice 862: Reading PCI header command...
[72582.425911] Debug StartDevice 864: Reading PCI header 

[PATCH v10 4/4] PCI: Don't extend device's size when using default alignment for all devices

2017-04-10 Thread Yongji Xie
Currently we reassign the alignment by extending resources' size in
pci_reassigndev_resource_alignment(). This could potentially break
some drivers when the driver uses the size to locate register
whose length is related to the size. Some examples as below:

- misc/Hpilo.c:
off = pci_resource_len(pdev, bar) - 0x2000;

- net/ethernet/chelsio/cxgb4/cxgb4_uld.h:
(pci_resource_len((pdev), 2) - roundup_pow_of_two((vres)->ocq.size))

- infiniband/hw/nes/Nes_hw.c:
num_pds = pci_resource_len(nesdev->pcidev, BAR_1) >> PAGE_SHIFT;

This risk could be easily prevented before because we only had one way
(kernel parameter resource_alignment) to touch those codes. And even
some users may be happy to see the extended size.

But now we introduce pcibios_default_alignment() to set default alignment
for all PCI devices which would also touch those codes. It would be hard
to prevent the risk in this case. So this patch tries to use
START_ALIGNMENT to identify the resource's alignment without extending
the size when the alignment reassigning is caused by the default alignment.

Signed-off-by: Yongji Xie 
---
 drivers/pci/pci.c |   34 --
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 02f1255..358366e 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4959,11 +4959,13 @@ resource_size_t __weak pcibios_default_alignment(struct 
pci_dev *dev)
 /**
  * pci_specified_resource_alignment - get resource alignment specified by user.
  * @dev: the PCI device to get
+ * @resize: whether or not to change resources' size when reassigning alignment
  *
  * RETURNS: Resource alignment if it is specified.
  *  Zero if it is not specified.
  */
-static resource_size_t pci_specified_resource_alignment(struct pci_dev *dev)
+static resource_size_t pci_specified_resource_alignment(struct pci_dev *dev,
+   bool *resize)
 {
int seg, bus, slot, func, align_order, count;
unsigned short vendor, device, subsystem_vendor, subsystem_device;
@@ -5005,6 +5007,7 @@ static resource_size_t 
pci_specified_resource_alignment(struct pci_dev *dev)
(!device || (device == dev->device)) &&
(!subsystem_vendor || (subsystem_vendor == 
dev->subsystem_vendor)) &&
(!subsystem_device || (subsystem_device == 
dev->subsystem_device))) {
+   *resize = true;
if (align_order == -1)
align = PAGE_SIZE;
else
@@ -5030,6 +5033,7 @@ static resource_size_t 
pci_specified_resource_alignment(struct pci_dev *dev)
bus == dev->bus->number &&
slot == PCI_SLOT(dev->devfn) &&
func == PCI_FUNC(dev->devfn)) {
+   *resize = true;
if (align_order == -1)
align = PAGE_SIZE;
else
@@ -5062,6 +5066,7 @@ void pci_reassigndev_resource_alignment(struct pci_dev 
*dev)
struct resource *r;
resource_size_t align, size;
u16 command;
+   bool resize = false;
 
/*
 * VF BARs are read-only zero according to SR-IOV spec r1.1, sec
@@ -5073,7 +5078,7 @@ void pci_reassigndev_resource_alignment(struct pci_dev 
*dev)
return;
 
/* check if specified PCI is target device to reassign */
-   align = pci_specified_resource_alignment(dev);
+   align = pci_specified_resource_alignment(dev, );
if (!align)
return;
 
@@ -5101,15 +5106,24 @@ void pci_reassigndev_resource_alignment(struct pci_dev 
*dev)
}
 
size = resource_size(r);
-   if (size < align) {
-   size = align;
-   dev_info(>dev,
-   "Rounding up size of resource #%d to %#llx.\n",
-   i, (unsigned long long)size);
+   if (resize) {
+   if (size < align) {
+   size = align;
+   dev_info(>dev,
+   "Rounding up size of resource #%d to 
%#llx.\n",
+   i, (unsigned long long)size);
+   }
+   r->flags |= IORESOURCE_UNSET;
+   r->start = 0;
+   } else {
+   if (size < align) {
+   r->flags &= ~IORESOURCE_SIZEALIGN;
+   r->flags |= IORESOURCE_STARTALIGN |
+   IORESOURCE_UNSET;
+   r->start = align;
+   

[PATCH v10 3/4] powerpc/powernv: Override pcibios_default_alignment() to force PCI devices to be page aligned

2017-04-10 Thread Yongji Xie
This overrides pcibios_default_alignment() to set default alignment
to PAGE_SIZE for all PCI devices on PowerNV platform. Thus sub-page
BARs would not share a page and could be mapped into guest when VFIO
passthrough them.

Signed-off-by: Yongji Xie 
---
 arch/powerpc/include/asm/machdep.h|2 ++
 arch/powerpc/kernel/pci-common.c  |8 
 arch/powerpc/platforms/powernv/pci-ioda.c |7 +++
 3 files changed, 17 insertions(+)

diff --git a/arch/powerpc/include/asm/machdep.h 
b/arch/powerpc/include/asm/machdep.h
index 5011b69..a82c192 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -173,6 +173,8 @@ struct machdep_calls {
/* Called after scan and before resource survey */
void (*pcibios_fixup_phb)(struct pci_controller *hose);
 
+   resource_size_t (*pcibios_default_alignment)(struct pci_dev *);
+
 #ifdef CONFIG_PCI_IOV
void (*pcibios_fixup_sriov)(struct pci_dev *pdev);
resource_size_t (*pcibios_iov_resource_alignment)(struct pci_dev *, int 
resno);
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index ffda24a..ceda574 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -233,6 +233,14 @@ void pcibios_reset_secondary_bus(struct pci_dev *dev)
pci_reset_secondary_bus(dev);
 }
 
+resource_size_t pcibios_default_alignment(struct pci_dev *pdev)
+{
+   if (ppc_md.pcibios_default_alignment)
+   return ppc_md.pcibios_default_alignment(pdev);
+
+   return 0;
+}
+
 #ifdef CONFIG_PCI_IOV
 resource_size_t pcibios_iov_resource_alignment(struct pci_dev *pdev, int resno)
 {
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index e367382..354c852 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -3297,6 +3297,11 @@ static void pnv_pci_setup_bridge(struct pci_bus *bus, 
unsigned long type)
}
 }
 
+static resource_size_t pnv_pci_default_alignment(struct pci_dev *pdev)
+{
+   return PAGE_SIZE;
+}
+
 #ifdef CONFIG_PCI_IOV
 static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
  int resno)
@@ -3830,6 +3835,8 @@ static void __init pnv_pci_init_ioda_phb(struct 
device_node *np,
hose->controller_ops = pnv_pci_ioda_controller_ops;
}
 
+   ppc_md.pcibios_default_alignment = pnv_pci_default_alignment;
+
 #ifdef CONFIG_PCI_IOV
ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources;
ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment;
-- 
1.7.9.5



[PATCH v10 2/4] PCI: Add pcibios_default_alignment() for arch-specific alignment control

2017-04-10 Thread Yongji Xie
When VFIO passes through a PCI device to a guest, it does not allow
the guest to mmap BARs that are smaller than PAGE_SIZE unless it
can reserve the rest of the page (see vfio_pci_probe_mmaps()). This
is because a page might contain several small BARs for unrelated
devices and a guest should not be able to access all of them.

VFIO emulates guest accesses to non-mappable BARs, which is functional
but slow. On systems with large page sizes, e.g., PowerNV with 64K pages,
BARs are more likely to share a page and performance is more likely to
be a problem.

Add a weak function to set default alignment for all PCI devices.
An arch can override it to force the PCI core to place memory BARs on
their own pages.

Signed-off-by: Yongji Xie 
---
 drivers/pci/pci.c |   10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 7904d02..02f1255 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4947,6 +4947,11 @@ void pci_ignore_hotplug(struct pci_dev *dev)
 }
 EXPORT_SYMBOL_GPL(pci_ignore_hotplug);
 
+resource_size_t __weak pcibios_default_alignment(struct pci_dev *dev)
+{
+   return 0;
+}
+
 #define RESOURCE_ALIGNMENT_PARAM_SIZE COMMAND_LINE_SIZE
 static char resource_alignment_param[RESOURCE_ALIGNMENT_PARAM_SIZE] = {0};
 static DEFINE_SPINLOCK(resource_alignment_lock);
@@ -4962,14 +4967,15 @@ static resource_size_t 
pci_specified_resource_alignment(struct pci_dev *dev)
 {
int seg, bus, slot, func, align_order, count;
unsigned short vendor, device, subsystem_vendor, subsystem_device;
-   resource_size_t align = 0;
+   resource_size_t align = pcibios_default_alignment(dev);
char *p;
 
spin_lock(_alignment_lock);
p = resource_alignment_param;
-   if (!*p)
+   if (!*p && !align)
goto out;
if (pci_has_flag(PCI_PROBE_ONLY)) {
+   align = 0;
pr_info_once("PCI: Ignoring requested alignments 
(PCI_PROBE_ONLY)\n");
goto out;
}
-- 
1.7.9.5



[PATCH v10 1/4] PCI: A fix for calculating bridge window's size and alignment

2017-04-10 Thread Yongji Xie
In case that one device's alignment is greater than its size,
we may get an incorrect size and alignment for its bus's memory
window in pbus_size_mem(). This patch fixes this case.

Signed-off-by: Yongji Xie 
---
 drivers/pci/setup-bus.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index cb389277..958da7d 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1066,10 +1066,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
r->flags = 0;
continue;
}
-   size += r_size;
+   size += max(r_size, align);
/* Exclude ranges with size > align from
   calculation of the alignment. */
-   if (r_size == align)
+   if (r_size <= align)
aligns[order] += align;
if (order > max_order)
max_order = order;
-- 
1.7.9.5



[PATCH v10 0/4] PCI: Introduce a way to enforce all MMIO BARs not to share PAGE_SIZE

2017-04-10 Thread Yongji Xie
This series introduces a way for PCI resource allocator to force
MMIO BARs not to share PAGE_SIZE. This would make sense to VFIO
driver. Because current VFIO implementation disallows to mmap
sub-page(size < PAGE_SIZE) MMIO BARs which may share the same page
with other BARs for security reasons. Thus, we have to handle mmio
access to these BARs in QEMU emulation rather than in guest which
will cause some performance loss.

In our solution, we try to make use of the existing code path of
resource_alignment kernel parameter and add a macro to set default
alignment for it. Thus we can define this macro by default on some
archs which may easily hit the performance issue because of their
64K page.

In this series, patch 1 fixes a bug related to bridge window
size/alignment calculation; patches 2 and 3 add support for setting the
default alignment of all MMIO BARs.

Changelog v10:
- Introduce an arch-specific function to set default alignment
  for all PCI devices instead of using macro
- Fix some minor comment issues
- Code style improvements

Changelog v9:
- Add a patch to fix the calculation of bridge window's size and alignment
- Remove an unrelated patch
- Rework the patch that fixes the bug of reassigning resources' alignment by
  changing their size

Changelog v8:
- Rebased against v4.10-rc4
- Rework the patch 2
- Change the commit log of patch 1

Changelog v7:
- Rebased against v4.9-rc2
- Drop two merged patches
- Rework the patch which fixes a bug where resources' size is changed when
  using resource_alignment
- Add a patch that fix a bug for IOV BARs when using resource_alignment

Changelog v6:
- Remove the option "noresize@" of resource_alignment

Changelog v5:
- Rebased against v4.8-rc6
- Drop the patch that forbidding disable memory decoding in
  pci_reassigndev_resource_alignment()

Changelog v4:
- Rebased against v4.8-rc1
- Drop one irrelevant patch
- Drop the patch that adding wildcard to resource_alignment to enforce
  the alignment of all MMIO BARs to be at least PAGE_SIZE
- Change the format of option "noresize" of resource_alignment
- Code style improvements

Changelog v3:
- Ignore enforced alignment to fixed BARs
- Fix issue that disabling memory decoding when reassigning the alignment
- Only enable default alignment on PowerNV platform

Changelog v2:
- Ignore enforced alignment to VF BARs on pci_reassigndev_resource_alignment()

Yongji Xie (4):
  PCI: A fix for calculating bridge window's size and alignment
  PCI: Add pcibios_default_alignment() for arch-specific alignment control
  powerpc/powernv: Override pcibios_default_alignment() to force PCI devices to 
be page aligned
  PCI: Don't extend device's size when using default alignment for all devices

 arch/powerpc/include/asm/machdep.h|2 ++
 arch/powerpc/kernel/pci-common.c  |8 ++
 arch/powerpc/platforms/powernv/pci-ioda.c |7 +
 drivers/pci/pci.c |   44 +
 drivers/pci/setup-bus.c   |4 +--
 5 files changed, 51 insertions(+), 14 deletions(-)

-- 
1.7.9.5



Re: [PATCH] powerpc/powernv: Fix powernv Kconfig dependencies

2017-04-10 Thread Anshuman Khandual
On 04/10/2017 10:54 AM, Alistair Popple wrote:
> The patch to introduce address translation services for Nvlink2 uses
> MMU notifiers. However usage of MMU notifiers requires a Kconfig
> option which is not selected by default on powerpc so add it to the
> powernv Kconfig.

We would also need this for HMM enablement on powerpc going forward.



Re: [PATCH v2] powerpc/eeh: Avoid use after free in eeh_handle_special_event()

2017-04-10 Thread Alexey Kardashevskiy
On 10/04/17 17:11, Russell Currey wrote:
> eeh_handle_special_event() is called when an EEH event is detected but
> can't be narrowed down to a specific PE.  This function looks through
> every PE to find one in an erroneous state, then calls the regular event
> handler eeh_handle_normal_event() once it knows which PE has an error.
> 
> However, if eeh_handle_normal_event() found that the PE cannot possibly
> be recovered, it will free it, rendering the passed PE stale.
> This leads to a use after free in eeh_handle_special_event() as it attempts to
> clear the "recovering" state on the PE after eeh_handle_normal_event() 
> returns.
> 
> Thus, make sure the PE is valid when attempting to clear state in
> eeh_handle_special_event().
> 
> Cc:  #3.10+
> Reported-by: Alexey Kardashevskiy 
> Signed-off-by: Russell Currey 
> ---
> V2: check a specific return path instead of looking at the PE itself
> ---
>  arch/powerpc/kernel/eeh_driver.c | 19 +++
>  1 file changed, 15 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/eeh_driver.c 
> b/arch/powerpc/kernel/eeh_driver.c
> index b94887165a10..e510408e08e1 100644
> --- a/arch/powerpc/kernel/eeh_driver.c
> +++ b/arch/powerpc/kernel/eeh_driver.c
> @@ -724,7 +724,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct 
> pci_bus *bus,
>   */
>  #define MAX_WAIT_FOR_RECOVERY 300
>  
> -static void eeh_handle_normal_event(struct eeh_pe *pe)
> +static int eeh_handle_normal_event(struct eeh_pe *pe)
>  {
>   struct pci_bus *frozen_bus;
>   struct eeh_dev *edev, *tmp;
> @@ -736,7 +736,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
>   if (!frozen_bus) {
>   pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
>   __func__, pe->phb->global_number, pe->addr);
> - return;
> + return -EIO;
>   }
>  
>   eeh_pe_update_time_stamp(pe);
> @@ -870,7 +870,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
>   pr_info("EEH: Notify device driver to resume\n");
>   eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
>  
> - return;
> + return rc;


eeh_handle_normal_event() uses "rc" to store return values from different
things:

- eeh_ops->wait_state() (which is pnv_eeh_wait_state) returns mask of
EEH_STATE_MMIO_ACTIVE/etc, errors would be EEH_STATE_UNAVAILABLE or
EEH_STATE_NOT_SUPPORT, both positive and the latter is used everywhere in
EEH code to report errors instead of negative linux errors.

- eeh_reset_device() returns usual linux negative errors, such as -EIO,
-ENODEV.

I'd suggest following one scheme or another here.


>  
>  excess_failures:
>   /*
> @@ -915,8 +915,12 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
>   pci_lock_rescan_remove();
>   pci_hp_remove_devices(frozen_bus);
>   pci_unlock_rescan_remove();
> +
> + /* The passed PE should no longer be used */
> + rc = -EFAULT;


I looked if eeh_handle_normal_event() could return -EFAULT in any other
branch. eeh_pci_enable() could return linux error if it failed in
pnv_eeh_set_option() which at least in theory can get -EFAULT from
unfreeze_pe(). Do we want to rely of the fact that unfreeze_pe() won't
return -EFAULT and we do not end up with non removed device?

Maybe instead of the very popular EFAULT, eeh_handle_normal_event() is better
to return a bool saying that the pe is dead? Or move the whole
if(frozen_bus){
...
pci_hp_remove_devices()
...
}

to a eeh_teardown_frozen_bus() helper, make eeh_handle_normal_event()
return frozen_bus if it needs to be frozen and pass that to
eeh_teardown_frozen_bus() if not NULL?


btw eeh_handle_normal_event() does not really need the excess_failures
label, that "excess_failures: pr_err()" could be moved to "goto
excess_failures" (which will become "goto perm_error") and then perm_error:
could go too :)


>   }
>   }
> + return rc;
>  }
>  
>  static void eeh_handle_special_event(void)
> @@ -982,7 +986,14 @@ static void eeh_handle_special_event(void)
>*/
>   if (rc == EEH_NEXT_ERR_FROZEN_PE ||
>   rc == EEH_NEXT_ERR_FENCED_PHB) {
> - eeh_handle_normal_event(pe);
> + /*
> +  * eeh_handle_normal_event() can make the PE stale if it
> +  * determines that the PE cannot possibly be recovered.
> +  * Don't modify the PE state if that's the case.
> +  */
> + if (eeh_handle_normal_event(pe) == -EFAULT)
> + continue;
> +
>   eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
>   } else {
>   pci_lock_rescan_remove();
> 


-- 
Alexey


Re: kselftest:lost_exception_test failure with 4.11.0-rc5

2017-04-10 Thread Sachin Sant

> On 07-Apr-2017, at 6:06 PM, Michael Ellerman  wrote:
> 
> Sachin Sant  writes:
> 
>> I have run into few instances where the lost_exception_test from
>> powerpc kselftest fails with SIGABRT. Following o/p is against
>> 4.11.0-rc5. The failure is intermittent. 
> 
> What hardware are you on?

I have seen this problem on a POWER8 LPAR.

> 
> How long does it take to run when it fails? I assume ~2 minutes?

Yes somewhere around 2 min.


>> MMCR2 0x
>> EBBHR 0x10003dcc
>> BESCR 0x8001 GE PMAE 
> 
> And that says we have global enable set and events enabled.
> 
> 
> So I think there is a bug here somewhere. I don't really have time to
> dig into it now, neither does Maddy I think. But we should try and get
> to it at some point.
> 

Let me know if I can help with debug.

Thanks
-Sachin


> cheers
> 



Re: [PATCH v2] powerpc/eeh: Avoid use after free in eeh_handle_special_event()

2017-04-10 Thread Andrew Donnellan

On 10/04/17 17:11, Russell Currey wrote:

eeh_handle_special_event() is called when an EEH event is detected but
can't be narrowed down to a specific PE.  This function looks through
every PE to find one in an erroneous state, then calls the regular event
handler eeh_handle_normal_event() once it knows which PE has an error.

However, if eeh_handle_normal_event() found that the PE cannot possibly
be recovered, it will free it, rendering the passed PE stale.
This leads to a use after free in eeh_handle_special_event() as it attempts to
clear the "recovering" state on the PE after eeh_handle_normal_event() returns.

Thus, make sure the PE is valid when attempting to clear state in
eeh_handle_special_event().

Cc:  #3.10+
Reported-by: Alexey Kardashevskiy 
Signed-off-by: Russell Currey 


Reviewed-by: Andrew Donnellan 


--
Andrew Donnellan  OzLabs, ADL Canberra
andrew.donnel...@au1.ibm.com  IBM Australia Limited



Re: [PATCH 3/3] powerpc/64s: cpufeatures: add initial implementation for cpufeatures

2017-04-10 Thread Nicholas Piggin
On Wed,  5 Apr 2017 22:37:06 +1000
Nicholas Piggin  wrote:

> The /cpus/features dt binding describes architected CPU features along
> with some compatibility, privilege, and enablement properties that allow
> flexibility with discovering and enabling capabilities.
> 
> Presence of this feature implies a base level of functionality, then
> additional feature nodes advertise the presence of new features.
> 
> A given feature and its setup procedure is defined once and used by all
> CPUs which are compatible by that feature. Features that follow a
> supported "prescription" can be enabled by a hypervisor or OS that
> does not understand them natively.
> 
> ---
> Since last post:
> - Update to v3.0B ISA.
> - Removed PVR tests for MCE and PMU, and add specific features for those.
> - Fixed CPU state restore.
> - Changed dt bit number specification for hfscr/fscr/aux to LSB0.
> - Broke the of/fdt changes into another patch.
> - Added a proper dependency checker.
> - Resolved most of the register/feature bits differences, made a patch
>   for VRMASD.
> - Didn't think of a better name.

Here's the latest version, which has some build and config cleanups
and updated with some more optional features.

Since last post:
- Fixed 32-bit build
- Made configurable on Book3s (for now)
- Updated base set of features


---
 .../devicetree/bindings/powerpc/cpufeatures.txt| 264 +
 arch/powerpc/Kconfig   |  16 +
 arch/powerpc/include/asm/cpu_has_feature.h |   4 +-
 arch/powerpc/include/asm/cpufeatures.h |  57 ++
 arch/powerpc/include/asm/cputable.h|   1 +
 arch/powerpc/kernel/Makefile   |   1 +
 arch/powerpc/kernel/cpufeatures.c  | 625 +
 arch/powerpc/kernel/cputable.c |  14 +-
 arch/powerpc/kernel/prom.c | 307 +-
 arch/powerpc/kernel/setup-common.c |   2 +-
 arch/powerpc/kernel/setup_64.c |  15 +-
 11 files changed, 1288 insertions(+), 18 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/powerpc/cpufeatures.txt
 create mode 100644 arch/powerpc/include/asm/cpufeatures.h
 create mode 100644 arch/powerpc/kernel/cpufeatures.c

diff --git a/Documentation/devicetree/bindings/powerpc/cpufeatures.txt 
b/Documentation/devicetree/bindings/powerpc/cpufeatures.txt
new file mode 100644
index ..325b263f4cdf
--- /dev/null
+++ b/Documentation/devicetree/bindings/powerpc/cpufeatures.txt
@@ -0,0 +1,264 @@
+powerpc cpu features binding
+
+
+The device tree describes supported CPU features as nodes containing
+compatibility and enablement information as properties.
+
+The binding specifies features common to all CPUs in the system.
+Heterogeneous CPU features are not supported at present (such could be added
+by providing nodes with additional features and linking those to particular
+CPUs).
+
+This binding is intended to provide fine grained control of CPU features at
+all levels of the stack (firmware, hypervisor, OS, userspace), with the
+ability for new CPU features to be used by some components without all
+components being upgraded (e.g., a new floating point instruction could be
+used by userspace math library without upgrading kernel and hypervisor).
+
+The binding is passed to the hypervisor by firmware. The hypervisor must
+remove any features that require hypervisor enablement but that it does not
+enable. It must remove any features that depend on removed features. It may
+pass remaining features usable to the OS and PR to guests, depending on
+configuration policy (not specified here).
+
+The modified binding is passed to the guest by hypervisor, with HV bit
+cleared from the usable-mask and the hv-support and hfscr-bit properties
+removed. The guest must similarly rmeove features that require OS enablement
+that it does not enable. The OS may pass PR usable features to userspace via
+ELF AUX vectors AT_HWCAP, AT_HWCAP2, AT_HWCAP3, etc., or use some other
+method (outside the scope of this specification).
+
+The binding will specify a "base" level of features that will be present
+when the cpu features binding exists. Additional features will be explicitly
+specified.
+
+/cpus/features node binding
+---
+
+Node: features
+
+Description: Container of CPU feature nodes.
+
+The node name must be "features" and it must be a child of the node "/cpus".
+
+The node is optional but should be provided by new firmware.
+
+Each child node of cpufeatures represents an architected CPU feature (e.g.,
+a new set of vector instructions) or an important CPU performance
+characteristic (e.g., fast unaligned memory operations). The specification
+of each feature (instructions, registers, exceptions, etc.) will be
+documented with device tree bindings.
+
+As a rough guide, features should be based on functional groups of changes,
+those 

[PATCH v2] powerpc/eeh: Avoid use after free in eeh_handle_special_event()

2017-04-10 Thread Russell Currey
eeh_handle_special_event() is called when an EEH event is detected but
can't be narrowed down to a specific PE.  This function looks through
every PE to find one in an erroneous state, then calls the regular event
handler eeh_handle_normal_event() once it knows which PE has an error.

However, if eeh_handle_normal_event() found that the PE cannot possibly
be recovered, it will free it, rendering the passed PE stale.
This leads to a use after free in eeh_handle_special_event() as it attempts to
clear the "recovering" state on the PE after eeh_handle_normal_event() returns.

Thus, make sure the PE is valid when attempting to clear state in
eeh_handle_special_event().

Cc:  #3.10+
Reported-by: Alexey Kardashevskiy 
Signed-off-by: Russell Currey 
---
V2: check a specific return path instead of looking at the PE itself
---
 arch/powerpc/kernel/eeh_driver.c | 19 +++
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index b94887165a10..e510408e08e1 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -724,7 +724,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct 
pci_bus *bus,
  */
 #define MAX_WAIT_FOR_RECOVERY 300
 
-static void eeh_handle_normal_event(struct eeh_pe *pe)
+static int eeh_handle_normal_event(struct eeh_pe *pe)
 {
struct pci_bus *frozen_bus;
struct eeh_dev *edev, *tmp;
@@ -736,7 +736,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
if (!frozen_bus) {
pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
__func__, pe->phb->global_number, pe->addr);
-   return;
+   return -EIO;
}
 
eeh_pe_update_time_stamp(pe);
@@ -870,7 +870,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
pr_info("EEH: Notify device driver to resume\n");
eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
 
-   return;
+   return rc;
 
 excess_failures:
/*
@@ -915,8 +915,12 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
pci_lock_rescan_remove();
pci_hp_remove_devices(frozen_bus);
pci_unlock_rescan_remove();
+
+   /* The passed PE should no longer be used */
+   rc = -EFAULT;
}
}
+   return rc;
 }
 
 static void eeh_handle_special_event(void)
@@ -982,7 +986,14 @@ static void eeh_handle_special_event(void)
 */
if (rc == EEH_NEXT_ERR_FROZEN_PE ||
rc == EEH_NEXT_ERR_FENCED_PHB) {
-   eeh_handle_normal_event(pe);
+   /*
+* eeh_handle_normal_event() can make the PE stale if it
+* determines that the PE cannot possibly be recovered.
+* Don't modify the PE state if that's the case.
+*/
+   if (eeh_handle_normal_event(pe) == -EFAULT)
+   continue;
+
eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
} else {
pci_lock_rescan_remove();
-- 
2.12.2



Re: [PATCH V4 4/7] cxl: Update implementation service layer

2017-04-10 Thread Andrew Donnellan



On 08/04/17 00:11, Christophe Lombard wrote:

The service layer API (in cxl.h) lists some low-level functions whose
implementation is different on PSL8, PSL9 and XSL:
- Init implementation for the adapter and the afu.
- Invalidate TLB/SLB.
- Attach process for dedicated/directed models.
- Handle psl interrupts.
- Debug registers for the adapter and the afu.
- Traces.
Each environment implements its own functions, and the common code uses
them through function pointers, defined in cxl_service_layer_ops.

Signed-off-by: Christophe Lombard 


Reviewed-by: Andrew Donnellan 

--
Andrew Donnellan  OzLabs, ADL Canberra
andrew.donnel...@au1.ibm.com  IBM Australia Limited



Re: [v2] raid6/altivec: Add vpermxor implementation for raid6 Q syndrome

2017-04-10 Thread Michael Ellerman
Daniel Axtens  writes:

> Hi Matt,
>
> Thanks for answering my questions and doing those fixes.
>
>
>> Bugs fixed:
>>  - A small bug in pq.h regarding a missing and mismatched
>>ifdef statement
>>  - Fixed test/Makefile to correctly build test on ppc
>>
>
> I think this commit should be labelled:
> Fixes: 4f8c55c5ad49 ("lib/raid6: build proper files on corresponding arch")
>
> mpe can probably add that when he merges - no need to do a new version :)

Please send a separate patch which does that fix.

>>  else
>> -HAS_ALTIVEC := $(shell printf '\#include \nvector int 
>> a;\n' |\
>> - gcc -c -x c - >&/dev/null && \
>> - rm ./-.o && echo yes)
>> -ifeq ($(HAS_ALTIVEC),yes)
>> -OBJS += altivec1.o altivec2.o altivec4.o altivec8.o
>> + HAS_ALTIVEC := $(shell printf '\#include \nvector int a;\n' 
>> |\
>> + gcc -c -x c - >/dev/null && rm ./-.o && echo yes)
>> + ifeq ($(HAS_ALTIVEC),yes)
>> +CFLAGS += -I../../../arch/powerpc/include
>> +CFLAGS += -DCONFIG_ALTIVEC
>> +OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \
>> +vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o
>>  endif
>>  endif
> Looks like vim has replaced spaces with tabs here. Not sure how much we
> care...

We care at least because it makes the diff look bigger than it really
is, if I'm reading it right the first three lines haven't actually
changed.

>> @@ -97,6 +99,18 @@ altivec4.c: altivec.uc ../unroll.awk
>>  altivec8.c: altivec.uc ../unroll.awk
>>  $(AWK) ../unroll.awk -vN=8 < altivec.uc > $@
>>
> ... especially seeing as tabs are already used in the file here!

It's a Makefile! Tabs have meaning :)

>> +# ifdef __KERNEL__
>> +return (cpu_has_feature(CONFIG_ALTIVEC) &&
>> +cpu_has_feature(CPU_FTR_ARCH_207S));
> I think CPU_FTR_ARCH_207S implies Altivec? Again, not a real problem,

It doesn't.

And also CONFIG_ALTIVEC is not a cpu feature!

You should be using CPU_FTR_ALTIVEC_COMP. That copes with the case where
the kernel is compiled without ALTIVEC support.

cheers


[PATCH v2 3/3] powerpc/xive: Extra sanity checks on cpu numbers

2017-04-10 Thread Benjamin Herrenschmidt
When targeting interrupts we do various manipulations of cpu numbers
and CPU masks. This adds some sanity checking to ensure we don't
break assumptions and manipulate cpu numbers that are out of bounds
of the various cpumasks.

Signed-off-by: Benjamin Herrenschmidt 
---
 arch/powerpc/sysdev/xive/common.c | 22 --
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index f37d257..f78a779 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -421,8 +421,10 @@ static void xive_dec_target_count(int cpu)
struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
struct xive_q *q = >queue[xive_irq_priority];
 
-   if (WARN_ON(cpu < 0))
+   if (unlikely(WARN_ON(cpu < 0 || !xc))) {
+   pr_err("%s: cpu=%d xc=%p\n", __func__, cpu, xc);
return;
+   }
 
/*
 * We increment the "pending count" which will be used
@@ -446,8 +448,14 @@ static int xive_find_target_in_mask(const struct cpumask 
*mask,
 
/* Locate it */
cpu = cpumask_first(mask);
-   for (i = 0; i < first; i++)
+   for (i = 0; i < first && cpu < nr_cpu_ids; i++)
cpu = cpumask_next(cpu, mask);
+
+   /* Sanity check */
+   if (WARN_ON(cpu >= nr_cpu_ids))
+   cpu = cpumask_first(cpu_online_mask);
+
+   /* Remember first one to handle wrap-around */
first = cpu;
 
/*
@@ -540,6 +548,12 @@ static unsigned int xive_irq_startup(struct irq_data *d)
pr_warn("XIVE: irq %d started with broken affinity\n",
d->irq);
}
+
+   /* Sanity check */
+   if (WARN_ON(target == XIVE_INVALID_TARGET ||
+   target >= nr_cpu_ids))
+   target = smp_processor_id();
+
xd->target = target;
 
/*
@@ -670,6 +684,10 @@ static int xive_irq_set_affinity(struct irq_data *d,
if (target == XIVE_INVALID_TARGET)
return -ENXIO;
 
+   /* Sanity check */
+   if (WARN_ON(target >= nr_cpu_ids))
+   target = smp_processor_id();
+
old_target = xd->target;
 
/*
-- 
2.9.3



[PATCH v2 2/3] powerpc/xive: Fix irq target selection returning out of bounds cpu#

2017-04-10 Thread Benjamin Herrenschmidt
xive_pick_irq_target() first tries to construct a mask that is
the intersection of the requested affinity, online CPUs, and
the group of CPUs that are on the same chip as the interrupt
source.

If that resulting mask is empty, we were incorrectly returning
nr_cpu_ids as a target.

Signed-off-by: Benjamin Herrenschmidt 
---
 arch/powerpc/sysdev/xive/common.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index dbd0f45..f37d257 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -486,7 +486,7 @@ static int xive_pick_irq_target(struct irq_data *d,
 
/*
 * If we have chip IDs, first we try to build a mask of
-* CPUs matching ther CPU and find a target in there
+* CPUs matching the CPU and find a target in there
 */
if (xd->src_chip != XIVE_INVALID_CHIP_ID &&
zalloc_cpumask_var(, GFP_ATOMIC)) {
@@ -497,7 +497,9 @@ static int xive_pick_irq_target(struct irq_data *d,
cpumask_set_cpu(cpu, mask);
}
/* Try to find a target */
-   if (!cpumask_empty(mask))
+   if (cpumask_empty(mask))
+   cpu = -1;
+   else
cpu = xive_find_target_in_mask(mask, fuzz++);
free_cpumask_var(mask);
if (cpu >= 0)
-- 
2.9.3



[PATCH v2 1/3] powerpc/xive: Don't call cpu_online() on an invalid CPU number

2017-04-10 Thread Benjamin Herrenschmidt
If the interrupt didn't have a selected target yet, we could
call cpu_online() and do other cpumask tests with cpu #-1 which
would result in random outcomes.

Signed-off-by: Benjamin Herrenschmidt 
---
 arch/powerpc/sysdev/xive/common.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 9201819..dbd0f45 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -656,7 +656,8 @@ static int xive_irq_set_affinity(struct irq_data *d,
/* If existing target is already in the new mask, and is
 * online then do nothing.
 */
-   if (cpu_online(xd->target) &&
+   if (xd->target != XIVE_INVALID_TARGET &&
+   cpu_online(xd->target) &&
cpumask_test_cpu(xd->target, cpumask))
return IRQ_SET_MASK_OK;
 
-- 
2.9.3



[PATCH 2/3] powerpc/xive: Fix irq target selection returning out of bounds cpu#

2017-04-10 Thread Benjamin Herrenschmidt
xive_pick_irq_target() first tries to construct a mask that is
the intersection of the requested affinity, online CPUs, and
the group of CPUs that are on the same chip as the interrupt
source.

If that resulting mask is empty, we were incorrectly returning
nr_cpu_ids as a target.

Signed-off-by: Benjamin Herrenschmidt 
---
 arch/powerpc/sysdev/xive/common.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index dbbe446..abda9b2 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -492,7 +492,7 @@ static int xive_pick_irq_target(struct irq_data *d,
 
/*
 * If we have chip IDs, first we try to build a mask of
-* CPUs matching ther CPU and find a target in there
+* CPUs matching the CPU and find a target in there
 */
if (xd->src_chip != XIVE_INVALID_CHIP_ID &&
zalloc_cpumask_var(, GFP_ATOMIC)) {
@@ -503,7 +503,9 @@ static int xive_pick_irq_target(struct irq_data *d,
cpumask_set_cpu(cpu, mask);
}
/* Try to find a target */
-   if (!cpumask_empty(mask))
+   if (cpumask_empty(mask))
+   cpu = -1;
+   else
cpu = xive_find_target_in_mask(mask, fuzz++);
free_cpumask_var(mask);
if (cpu >= 0)
-- 
2.9.3



[PATCH 3/3] powerpc/xive: Extra sanity checks on cpu numbers

2017-04-10 Thread Benjamin Herrenschmidt
When targeting interrupts we do various manipulations of cpu numbers
and CPU masks. This adds some sanity checking to ensure we don't
break assumptions and manipulate cpu numbers that are out of bounds
of the various cpumasks.

Signed-off-by: Benjamin Herrenschmidt 
---
 arch/powerpc/sysdev/xive/common.c | 18 +-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index abda9b2..496036c 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -452,8 +452,14 @@ static int xive_find_target_in_mask(const struct cpumask 
*mask,
 
/* Locate it */
cpu = cpumask_first(mask);
-   for (i = 0; i < first; i++)
+   for (i = 0; i < first && cpu < nr_cpu_ids; i++)
cpu = cpumask_next(cpu, mask);
+
+   /* Sanity check */
+   if (WARN_ON(cpu >= nr_cpu_ids))
+   cpu = cpumask_first(cpu_online_mask);
+
+   /* Remember first one to handle wrap-around */
first = cpu;
 
/*
@@ -545,6 +551,12 @@ static unsigned int xive_irq_startup(struct irq_data *d)
return -ENXIO;
pr_warn("irq %d started with broken affinity\n", d->irq);
}
+
+   /* Sanity check */
+   if (WARN_ON(target == XIVE_INVALID_TARGET ||
+   target >= nr_cpu_ids))
+   target = smp_processor_id();
+
xd->target = target;
 
/*
@@ -676,6 +688,10 @@ static int xive_irq_set_affinity(struct irq_data *d,
if (target == XIVE_INVALID_TARGET)
return -ENXIO;
 
+   /* Sanity check */
+   if (WARN_ON(target >= nr_cpu_ids))
+   target = smp_processor_id();
+
old_target = xd->target;
 
/*
-- 
2.9.3



[PATCH 1/3] powerpc/xive: Don't call cpu_online() on an invalid CPU number

2017-04-10 Thread Benjamin Herrenschmidt
If the interrupt didn't have a selected target yet, we could
call cpu_online() and do other cpumask tests with cpu #-1 which
would result in random outcomes.

Signed-off-by: Benjamin Herrenschmidt 
---
 arch/powerpc/sysdev/xive/common.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 006a53e..dbbe446 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -662,7 +662,8 @@ static int xive_irq_set_affinity(struct irq_data *d,
 * If existing target is already in the new mask, and is
 * online then do nothing.
 */
-   if (cpu_online(xd->target) &&
+   if (xd->target != XIVE_INVALID_TARGET &&
+   cpu_online(xd->target) &&
cpumask_test_cpu(xd->target, cpumask))
return IRQ_SET_MASK_OK;
 
-- 
2.9.3



Re: [PATCH V4 5/7] cxl: Rename some psl8 specific functions

2017-04-10 Thread Andrew Donnellan

On 08/04/17 00:11, Christophe Lombard wrote:

Rename a few functions, changing the '_psl' suffix to '_psl8', to make
clear that the implementation is psl8 specific.
Those functions will have an equivalent implementation for the psl9 in
a later patch.

Signed-off-by: Christophe Lombard 


Reviewed-by: Andrew Donnellan 

--
Andrew Donnellan  OzLabs, ADL Canberra
andrew.donnel...@au1.ibm.com  IBM Australia Limited



[PATCH] powerpc/powernv/pci: Reduce spam when dumping PEST

2017-04-10 Thread Russell Currey
Dumping the PE State Tables (PEST) can be highly verbose if a number of PEs
are affected, especially in the case where the whole PHB is frozen and 255
lines get printed.  Check for duplicates when dumping the PEST to reduce
useless output.

For example:

PE[f8] A/B: 9726 8080d0f8
PE[f9] A/B: 8000 
PE[..fe] A/B: as above
PE[ff] A/B: 8440002b 

instead of:

PE[f8] A/B: 9726 8080d0f8
PE[f9] A/B: 8000 
PE[fa] A/B: 8000 
PE[fb] A/B: 8000 
PE[fc] A/B: 8000 
PE[fd] A/B: 8000 
PE[fe] A/B: 8000 
PE[ff] A/B: 8440002b 

and you can imagine how much worse it can get for 255 PEs.

Signed-off-by: Russell Currey 
---
 arch/powerpc/platforms/powernv/pci.c | 52 ++--
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c
index eb835e977e33..303c9d84d3d4 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -227,11 +227,40 @@ void pnv_teardown_msi_irqs(struct pci_dev *pdev)
 }
 #endif /* CONFIG_PCI_MSI */
 
+/* Nicely print the contents of the PE State Tables (PEST). */
+static void pnv_pci_dump_pest(__be64 pestA[], __be64 pestB[], int pest_size)
+{
+   int i;
+   __be64 prevA, prevB;
+   bool dup = false;
+   prevA = prevB = ~0;
+
+   for (i = 0; i < pest_size; i++) {
+   __be64 peA = be64_to_cpu(pestA[i]);
+   __be64 peB = be64_to_cpu(pestB[i]);
+
+   if (peA != prevA || peB != prevB) {
+   if (dup) {
+   pr_info("PE[..%x] A/B: as above\n", i-1);
+   dup = false;
+   }
+   prevA = peA;
+   prevB = peB;
+   if (peA || peB)
+   pr_info("PE[%2x] A/B: %016llx %016llx\n",
+   i, peA, peB);
+   } else {
+   /* Don't need to track zeroes */
+   if (!dup && (peA || peB))
+   dup = true;
+   }
+   }
+}
+
 static void pnv_pci_dump_p7ioc_diag_data(struct pci_controller *hose,
 struct OpalIoPhbErrorCommon *common)
 {
struct OpalIoP7IOCPhbErrorData *data;
-   int i;
 
data = (struct OpalIoP7IOCPhbErrorData *)common;
pr_info("P7IOC PHB#%x Diag-data (Version: %d)\n",
@@ -308,22 +337,13 @@ static void pnv_pci_dump_p7ioc_diag_data(struct 
pci_controller *hose,
be64_to_cpu(data->dma1ErrorLog0),
be64_to_cpu(data->dma1ErrorLog1));
 
-   for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) {
-   if ((be64_to_cpu(data->pestA[i]) >> 63) == 0 &&
-   (be64_to_cpu(data->pestB[i]) >> 63) == 0)
-   continue;
-
-   pr_info("PE[%3d] A/B: %016llx %016llx\n",
-   i, be64_to_cpu(data->pestA[i]),
-   be64_to_cpu(data->pestB[i]));
-   }
+   pnv_pci_dump_pest(data->pestA, data->pestB, OPAL_P7IOC_NUM_PEST_REGS);
 }
 
 static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose,
struct OpalIoPhbErrorCommon *common)
 {
struct OpalIoPhb3ErrorData *data;
-   int i;
 
data = (struct OpalIoPhb3ErrorData*)common;
pr_info("PHB3 PHB#%x Diag-data (Version: %d)\n",
@@ -404,15 +424,7 @@ static void pnv_pci_dump_phb3_diag_data(struct 
pci_controller *hose,
be64_to_cpu(data->dma1ErrorLog0),
be64_to_cpu(data->dma1ErrorLog1));
 
-   for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) {
-   if ((be64_to_cpu(data->pestA[i]) >> 63) == 0 &&
-   (be64_to_cpu(data->pestB[i]) >> 63) == 0)
-   continue;
-
-   pr_info("PE[%3d] A/B: %016llx %016llx\n",
-   i, be64_to_cpu(data->pestA[i]),
-   be64_to_cpu(data->pestB[i]));
-   }
+   pnv_pci_dump_pest(data->pestA, data->pestB, OPAL_PHB3_NUM_PEST_REGS);
 }
 
 void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
-- 
2.12.2