Re: [PATCH] powerpc/spufs: Remove MAX_USER_PRIO define

2014-02-11 Thread Kamalesh Babulal
* Jeremy Kerr j...@ozlabs.org [2014-02-11 14:05:17]:

 Current ppc64_defconfig fails with:
 
  arch/powerpc/platforms/cell/spufs/sched.c:86:0: error: MAX_USER_PRIO 
 redefined [-Werror]
  cc1: all warnings being treated as errors
 
 6b6350f1 introduced a generic MAX_USER_PRIO macro to sched/prio.h, which
 is causing the conflict. Use that one instead of our own.

You can also use DEFAULT_PRIO from sched/prio.h instead of NORMAL_PRIO.

diff --git a/arch/powerpc/platforms/cell/spufs/sched.c 
b/arch/powerpc/platforms/cell/spufs/sched.c
index 49318385d4fa..014979db2018 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -64,11 +64,6 @@ static struct timer_list spusched_timer;
 static struct timer_list spuloadavg_timer;
 
 /*
- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0).
- */
-#define NORMAL_PRIO120
-
-/*
  * Frequency of the spu scheduler tick.  By default we do one SPU scheduler
  * tick for every 10 CPU scheduler ticks.
  */
@@ -97,7 +92,7 @@ static struct timer_list spuloadavg_timer;
  */
 void spu_set_timeslice(struct spu_context *ctx)
 {
-   if (ctx-prio  NORMAL_PRIO)
+   if (ctx-prio  DEFAULT_PRIO)
ctx-time_slice = SCALE_PRIO(DEF_SPU_TIMESLICE * 4, ctx-prio);
else
ctx-time_slice = SCALE_PRIO(DEF_SPU_TIMESLICE, ctx-prio);

Thanks,
Kamalesh.

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH v1 2/2] powernv, cpufreq: Add per-core locking to serialize frequency transitions

2014-02-11 Thread Preeti U Murthy
Hi Vaidy,

On 02/11/2014 12:32 PM, Vaidyanathan Srinivasan wrote:
 From: Srivatsa S. Bhat srivatsa.b...@linux.vnet.ibm.com
 
 On POWER systems, the CPU frequency is controlled at a core-level and
 hence we need to serialize so that only one of the threads in the core
 switches the core's frequency at a time.
 
 Using a global mutex lock would needlessly serialize _all_ frequency
 transitions in the system (across all cores). So introduce per-core
 locking to enable finer-grained synchronization and thereby enhance
 the speed and responsiveness of the cpufreq driver to varying workload
 demands.
 
 The design of per-core locking is very simple and straight-forward: we
 first define a Per-CPU lock and use the one that belongs to the first
 thread sibling of the core.
 
 cpu_first_thread_sibling() macro is used to find the *common* lock for
 all thread siblings belonging to a core.
 
 Signed-off-by: Srivatsa S. Bhat srivatsa.b...@linux.vnet.ibm.com
 Signed-off-by: Vaidyanathan Srinivasan sva...@linux.vnet.ibm.com
 ---
  drivers/cpufreq/powernv-cpufreq.c |   21 -
  1 file changed, 16 insertions(+), 5 deletions(-)
 
 diff --git a/drivers/cpufreq/powernv-cpufreq.c 
 b/drivers/cpufreq/powernv-cpufreq.c
 index ea3b630..8240e90 100644
 --- a/drivers/cpufreq/powernv-cpufreq.c
 +++ b/drivers/cpufreq/powernv-cpufreq.c
 @@ -24,8 +24,15 @@
  #include linux/of.h
  #include asm/cputhreads.h
 
 -/* FIXME: Make this per-core */
 -static DEFINE_MUTEX(freq_switch_mutex);
 +/* Per-Core locking for frequency transitions */
 +static DEFINE_PER_CPU(struct mutex, freq_switch_lock);
 +
 +#define lock_core_freq(cpu)  \
 + mutex_lock(per_cpu(freq_switch_lock,\
 + cpu_first_thread_sibling(cpu)));
 +#define unlock_core_freq(cpu)\
 + mutex_unlock(per_cpu(freq_switch_lock,\
 + cpu_first_thread_sibling(cpu)));
 
  #define POWERNV_MAX_PSTATES  256
 
 @@ -219,7 +226,7 @@ static int powernv_cpufreq_target(struct cpufreq_policy 
 *policy,
   freqs.new = powernv_freqs[new_index].frequency;
   freqs.cpu = policy-cpu;
 
 - mutex_lock(freq_switch_mutex);
 + lock_core_freq(policy-cpu);
   cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE);
 
   pr_debug(setting frequency for cpu %d to %d kHz index %d pstate %d,
 @@ -231,7 +238,7 @@ static int powernv_cpufreq_target(struct cpufreq_policy 
 *policy,
   rc = powernv_set_freq(policy-cpus, new_index);
 
   cpufreq_notify_transition(policy, freqs, CPUFREQ_POSTCHANGE);
 - mutex_unlock(freq_switch_mutex);
 + unlock_core_freq(policy-cpu);
 
   return rc;
  }
 @@ -248,7 +255,7 @@ static struct cpufreq_driver powernv_cpufreq_driver = {
 
  static int __init powernv_cpufreq_init(void)
  {
 - int rc = 0;
 + int cpu, rc = 0;
 
   /* Discover pstates from device tree and init */
 
 @@ -258,6 +265,10 @@ static int __init powernv_cpufreq_init(void)
   pr_info(powernv-cpufreq disabled\n);
   return rc;
   }
 + /* Init per-core mutex */
 + for_each_possible_cpu(cpu) {
 + mutex_init(per_cpu(freq_switch_lock, cpu));
 + }
 
   rc = cpufreq_register_driver(powernv_cpufreq_driver);
   return rc;

This looks good to me.

Reviewed-by: Preeti U Murthy pre...@linux.vnet.ibm.com

Thanks

Regards
Preeti U Murthy
 
 ___
 Linuxppc-dev mailing list
 Linuxppc-dev@lists.ozlabs.org
 https://lists.ozlabs.org/listinfo/linuxppc-dev
 

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH v1 1/2] powernv: cpufreq driver for powernv platform

2014-02-11 Thread Preeti U Murthy
Hi Vaidy,

On 02/11/2014 12:32 PM, Vaidyanathan Srinivasan wrote:
 Backend driver to dynamically set voltage and frequency on
 IBM POWER non-virtualized platforms.  Power management SPRs
 are used to set the required PState.
 
 This driver works in conjunction with cpufreq governors
 like 'ondemand' to provide a demand based frequency and
 voltage setting on IBM POWER non-virtualized platforms.
 
 PState table is obtained from OPAL v3 firmware through device
 tree.
 
 powernv_cpufreq back-end driver would parse the relevant device-tree
 nodes and initialise the cpufreq subsystem on powernv platform.
 
 Signed-off-by: Vaidyanathan Srinivasan sva...@linux.vnet.ibm.com
 Signed-off-by: Srivatsa S. Bhat srivatsa.b...@linux.vnet.ibm.com
 Signed-off-by: Anton Blanchard an...@samba.org
 ---
snip

 +static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
 +{
 + int base, i;
 +
 +#ifdef CONFIG_SMP
 + base = cpu_first_thread_sibling(policy-cpu);
 +
 + for (i = 0; i  threads_per_core; i++)
 + cpumask_set_cpu(base + i, policy-cpus);
 +#endif
 + policy-cpuinfo.transition_latency = 25000;

Is it ok to hard code this field? How about getting this also from the
device tree?

 +
 + /* Print frequency table */
 + for (i = 0; powernv_freqs[i].frequency != CPUFREQ_TABLE_END; i++)
 + pr_debug(%d: %d\n, i, powernv_freqs[i].frequency);

The frequency table as a result will be printed on every cpu when
cpufreq gets initialized. Considering this information will not vary
across CPUs, can we print this during powernv_cpufreq_init() after
parsing the device tree for the pstates?

Thanks

Regards
Preeti U Murthy

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH] powerpc/spufs: Fix duplicate definition of MAX_USER_PRIO

2014-02-11 Thread Peter Zijlstra
On Tue, Feb 11, 2014 at 03:24:17AM +0800, kbuild test robot wrote:
  arch/powerpc/platforms/cell/spufs/sched.c:86:0: warning: MAX_USER_PRIO 
  redefined [enabled by default]
 #define MAX_USER_PRIO  (MAX_PRIO - MAX_RT_PRIO)
 ^
In file included from include/linux/sched.h:6:0,
 from arch/powerpc/platforms/cell/spufs/sched.c:26:
include/linux/sched/prio.h:39:0: note: this is the location of the 
 previous definition
 #define MAX_USER_PRIO  (USER_PRIO(MAX_PRIO))
 ^

Since USER_PRIO(p) is ((p)-MAX_RT_PRIO) the above two definitions are
the same and we can simply remove the spufs one.

Fixes: 6b6350f155af (sched: Expose some macros related to priority)
Reported-by: Fengguang Wu fengguang...@intel.com
Signed-off-by: Peter Zijlstra pet...@infradead.org
---
 arch/powerpc/platforms/cell/spufs/sched.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/platforms/cell/spufs/sched.c 
b/arch/powerpc/platforms/cell/spufs/sched.c
index 49318385d4fa..4a0a64fe25df 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -83,7 +83,6 @@ static struct timer_list spuloadavg_timer;
 #define MIN_SPU_TIMESLICE  max(5 * HZ / (1000 * SPUSCHED_TICK), 1)
 #define DEF_SPU_TIMESLICE  (100 * HZ / (1000 * SPUSCHED_TICK))
 
-#define MAX_USER_PRIO  (MAX_PRIO - MAX_RT_PRIO)
 #define SCALE_PRIO(x, prio) \
max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE)
 
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH V4 2/3] tick/cpuidle: Initialize hrtimer mode of broadcast

2014-02-11 Thread Daniel Lezcano

On 02/07/2014 09:06 AM, Preeti U Murthy wrote:

From: Thomas Gleixner t...@linutronix.de

On some architectures, in certain CPU deep idle states the local timers stop.
An external clock device is used to wakeup these CPUs. The kernel support for 
the
wakeup of these CPUs is provided by the tick broadcast framework by using the
external clock device as the wakeup source.

However not all implementations of architectures provide such an external
clock device. This patch includes support in the broadcast framework to handle
the wakeup of the CPUs in deep idle states on such systems by queuing a hrtimer
on one of the CPUs, which is meant to handle the wakeup of CPUs in deep idle 
states.

This patchset introduces a pseudo clock device which can be registered by the
archs as tick_broadcast_device in the absence of a real external clock
device. Once registered, the broadcast framework will work as is for these
architectures as long as the archs take care of the BROADCAST_ENTER
notification failing for one of the CPUs. This CPU is made the stand by CPU to
handle wakeup of the CPUs in deep idle and it *must not enter deep idle states*.

The CPU with the earliest wakeup is chosen to be this CPU. Hence this way the
stand by CPU dynamically moves around and so does the hrtimer which is queued
to trigger at the next earliest wakeup time. This is consistent with the case 
where
an external clock device is present. The smp affinity of this clock device is
set to the CPU with the earliest wakeup.


Hi Preeti,

jumping a bit late in the thread...

Setting the smp affinity on the earliest timer should be handled 
automatically with the CLOCK_EVT_FEAT_DYNIRQ flag. Did you look at using 
this flag ?


Another comment is the overall approach. We enter the cpuidle idle 
framework with a specific state to go to and it is the tick framework 
telling us we mustn't go to this state. IMO the logic is wrong, the 
decision to not enter this state should be moved somewhere else.


Why don't you create a cpuidle driver with the shallow idle states 
assigned to a cpu (let's say cpu0) and another one with all the deeper 
idle states for the rest of the cpus ? Using the multiple cpuidle driver 
support makes it possible. The timer won't be moving around and a cpu 
will be dedicated to act as the broadcast timer.


Wouldn't that make sense and be less intrusive than the patchset you proposed?



This patchset handles the hotplug of
the stand by CPU as well by moving the hrtimer on to the CPU handling the 
CPU_DEAD
notification.

Signed-off-by: Preeti U Murthy pre...@linux.vnet.ibm.com
[Added Changelog and code to handle reprogramming of hrtimer]
---

  include/linux/clockchips.h   |9 +++
  kernel/time/Makefile |2 -
  kernel/time/tick-broadcast-hrtimer.c |  105 ++
  kernel/time/tick-broadcast.c |   54 +
  4 files changed, 166 insertions(+), 4 deletions(-)
  create mode 100644 kernel/time/tick-broadcast-hrtimer.c

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index e0c5a6c..dbe9e14 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -62,6 +62,11 @@ enum clock_event_mode {
  #define CLOCK_EVT_FEAT_DYNIRQ 0x20
  #define CLOCK_EVT_FEAT_PERCPU 0x40

+/*
+ * Clockevent device is based on a hrtimer for broadcast
+ */
+#define CLOCK_EVT_FEAT_HRTIMER 0x80
+
  /**
   * struct clock_event_device - clock event device descriptor
   * @event_handler:Assigned by the framework to be called by the low
@@ -83,6 +88,7 @@ enum clock_event_mode {
   * @name: ptr to clock event name
   * @rating:   variable to rate clock event devices
   * @irq:  IRQ number (only for non CPU local devices)
+ * @bound_on:  Bound on CPU
   * @cpumask:  cpumask to indicate for which CPUs this device works
   * @list: list head for the management code
   * @owner:module reference
@@ -113,6 +119,7 @@ struct clock_event_device {
const char  *name;
int rating;
int irq;
+   int bound_on;
const struct cpumask*cpumask;
struct list_headlist;
struct module   *owner;
@@ -180,9 +187,11 @@ extern int tick_receive_broadcast(void);
  #endif

  #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)  
defined(CONFIG_TICK_ONESHOT)
+extern void tick_setup_hrtimer_broadcast(void);
  extern int tick_check_broadcast_expired(void);
  #else
  static inline int tick_check_broadcast_expired(void) { return 0; }
+static void tick_setup_hrtimer_broadcast(void) {};
  #endif

  #ifdef CONFIG_GENERIC_CLOCKEVENTS
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 9250130..06151ef 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -3,7 +3,7 @@ obj-y += timeconv.o posix-clock.o alarmtimer.o

  

Re: [PATCH v2] powerpc ticket locks

2014-02-11 Thread Raghavendra KT
On Fri, Feb 7, 2014 at 10:28 PM, Torsten Duwe d...@lst.de wrote:
 Ticket locks for ppc, version 2. Changes since v1:
 * The atomically exchanged entity is always 32 bits.
 * asm inline string variations thus removed.
 * Carry the additional holder hint only #if defined(CONFIG_PPC_SPLPAR)

 Signed-off-by: Torsten Duwe d...@suse.de
 --
[...]
 +static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
  {
 +   register struct __raw_tickets old, tmp,
 +   inc = { .tail = TICKET_LOCK_INC };
 +
 CLEAR_IO_SYNC;
 -   while (1) {
 -   if (likely(__arch_spin_trylock(lock) == 0))
 -   break;
 +   __asm__ __volatile__(
 +1:lwarx   %0,0,%4 # arch_spin_lock\n
 +  add %1,%3,%0\n
 +   PPC405_ERR77(0, %4)
 +  stwcx.  %1,0,%4\n
 +  bne-1b
 +   : =r (old), =r (tmp), +m (lock-tickets)
 +   : r (inc), r (lock-tickets)
 +   : cc);
 +
 +   if (likely(old.head == old.tail))
 +   goto out;
 +
 +   for (;;) {
 +   unsigned count = 100;

I am sure you wanted to tune the total loops to typical lock holding time ...
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH v2] powerpc ticket locks

2014-02-11 Thread Raghavendra KT
On Mon, Feb 10, 2014 at 8:40 AM, Benjamin Herrenschmidt
b...@kernel.crashing.org wrote:
 On Fri, 2014-02-07 at 17:58 +0100, Torsten Duwe wrote:
  typedef struct {
 -   volatile unsigned int slock;
 -} arch_spinlock_t;
 +   union {
 +   __ticketpair_t head_tail;
 +   struct __raw_tickets {
 +#ifdef __BIG_ENDIAN__  /* The tail part should be in the MSBs */
 +   __ticket_t tail, head;
 +#else
 +   __ticket_t head, tail;
 +#endif
 +   } tickets;
 +   };
 +#if defined(CONFIG_PPC_SPLPAR)
 +   u32 holder;
 +#endif
 +} arch_spinlock_t __aligned(4);

 That's still broken with lockref (which we just merged).

 We must have the arch_spinlock_t and the ref in the same 64-bit word
 otherwise it will break.

 We can make it work in theory since the holder doesn't have to be
 accessed atomically, but the practicals are a complete mess ...
 lockref would essentially have to re-implement the holder handling
 of the spinlocks and use lower level ticket stuff.


This is probably a very basic and stupid question from me.
How important is it to have holder information for PPC? In my
previous experiments
on x86, it was lock-waiter preemption that was problematic rather than
lock-holder preemption.
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 1/3] powerpc: mm: Add new set flag argument to pte/pmd update function

2014-02-11 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

We will use this later to set the _PAGE_NUMA bit.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/hugetlb.h   |  2 +-
 arch/powerpc/include/asm/pgtable-ppc64.h | 26 +++---
 arch/powerpc/mm/pgtable_64.c | 12 +++-
 arch/powerpc/mm/subpage-prot.c   |  2 +-
 4 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/hugetlb.h 
b/arch/powerpc/include/asm/hugetlb.h
index d750336b171d..623f2971ce0e 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -127,7 +127,7 @@ static inline pte_t huge_ptep_get_and_clear(struct 
mm_struct *mm,
unsigned long addr, pte_t *ptep)
 {
 #ifdef CONFIG_PPC64
-   return __pte(pte_update(mm, addr, ptep, ~0UL, 1));
+   return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1));
 #else
return __pte(pte_update(ptep, ~0UL, 0));
 #endif
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
b/arch/powerpc/include/asm/pgtable-ppc64.h
index bc141c950b1e..eb9261024f51 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -195,6 +195,7 @@ extern void hpte_need_flush(struct mm_struct *mm, unsigned 
long addr,
 static inline unsigned long pte_update(struct mm_struct *mm,
   unsigned long addr,
   pte_t *ptep, unsigned long clr,
+  unsigned long set,
   int huge)
 {
 #ifdef PTE_ATOMIC_UPDATES
@@ -205,14 +206,15 @@ static inline unsigned long pte_update(struct mm_struct 
*mm,
andi.   %1,%0,%6\n\
bne-1b \n\
andc%1,%0,%4 \n\
+   or  %1,%1,%7\n\
stdcx.  %1,0,%3 \n\
bne-1b
: =r (old), =r (tmp), =m (*ptep)
-   : r (ptep), r (clr), m (*ptep), i (_PAGE_BUSY)
+   : r (ptep), r (clr), m (*ptep), i (_PAGE_BUSY), r (set)
: cc );
 #else
unsigned long old = pte_val(*ptep);
-   *ptep = __pte(old  ~clr);
+   *ptep = __pte((old  ~clr) | set);
 #endif
/* huge pages use the old page table lock */
if (!huge)
@@ -231,9 +233,9 @@ static inline int __ptep_test_and_clear_young(struct 
mm_struct *mm,
 {
unsigned long old;
 
-   if ((pte_val(*ptep)  (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
+   if ((pte_val(*ptep)  (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
return 0;
-   old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0);
+   old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0);
return (old  _PAGE_ACCESSED) != 0;
 }
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
@@ -252,7 +254,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, 
unsigned long addr,
if ((pte_val(*ptep)  _PAGE_RW) == 0)
return;
 
-   pte_update(mm, addr, ptep, _PAGE_RW, 0);
+   pte_update(mm, addr, ptep, _PAGE_RW, 0, 0);
 }
 
 static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
@@ -261,7 +263,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct 
*mm,
if ((pte_val(*ptep)  _PAGE_RW) == 0)
return;
 
-   pte_update(mm, addr, ptep, _PAGE_RW, 1);
+   pte_update(mm, addr, ptep, _PAGE_RW, 0, 1);
 }
 
 /*
@@ -284,14 +286,14 @@ static inline void huge_ptep_set_wrprotect(struct 
mm_struct *mm,
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
   unsigned long addr, pte_t *ptep)
 {
-   unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0);
+   unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0);
return __pte(old);
 }
 
 static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
 pte_t * ptep)
 {
-   pte_update(mm, addr, ptep, ~0UL, 0);
+   pte_update(mm, addr, ptep, ~0UL, 0, 0);
 }
 
 
@@ -506,7 +508,9 @@ extern int pmdp_set_access_flags(struct vm_area_struct *vma,
 
 extern unsigned long pmd_hugepage_update(struct mm_struct *mm,
 unsigned long addr,
-pmd_t *pmdp, unsigned long clr);
+pmd_t *pmdp,
+unsigned long clr,
+unsigned long set);
 
 static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
  unsigned long addr, pmd_t *pmdp)
@@ -515,7 +519,7 @@ static inline int __pmdp_test_and_clear_young(struct 
mm_struct *mm,
 
if ((pmd_val(*pmdp)  (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
return 0;
-   old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED);
+   old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0);
return ((old  

[PATCH 2/3] mm: dirty accountable change only apply to non prot numa case

2014-02-11 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

So move it inside the if block.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 mm/mprotect.c | 21 +++--
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/mm/mprotect.c b/mm/mprotect.c
index 7332c1785744..33eab902f10e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -58,6 +58,13 @@ static unsigned long change_pte_range(struct vm_area_struct 
*vma, pmd_t *pmd,
if (pte_numa(ptent))
ptent = pte_mknonnuma(ptent);
ptent = pte_modify(ptent, newprot);
+   /*
+* Avoid taking write faults for pages we
+* know to be dirty.
+*/
+   if (dirty_accountable  pte_dirty(ptent))
+   ptent = pte_mkwrite(ptent);
+   ptep_modify_prot_commit(mm, addr, pte, ptent);
updated = true;
} else {
struct page *page;
@@ -72,22 +79,8 @@ static unsigned long change_pte_range(struct vm_area_struct 
*vma, pmd_t *pmd,
}
}
}
-
-   /*
-* Avoid taking write faults for pages we know to be
-* dirty.
-*/
-   if (dirty_accountable  pte_dirty(ptent)) {
-   ptent = pte_mkwrite(ptent);
-   updated = true;
-   }
-
if (updated)
pages++;
-
-   /* Only !prot_numa always clears the pte */
-   if (!prot_numa)
-   ptep_modify_prot_commit(mm, addr, pte, ptent);
} else if (IS_ENABLED(CONFIG_MIGRATION)  !pte_file(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
 
-- 
1.8.3.2

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 3/3] mm: Use ptep/pmdp_set_numa for updating _PAGE_NUMA bit

2014-02-11 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

Archs like ppc64 don't do a tlb flush in the set_pte/pmd functions. ppc64
also doesn't implement flush_tlb_range. ppc64 requires the tlb flushing to
be batched within ptl locks. The reason to do that is to ensure that the
hash page table is in sync with the linux page table. We track the hpte
index in the linux pte, and if we clear them without flushing the hash and
drop the ptl lock, we can have another cpu update the pte and can end up
with a double hash. We also want to keep set_pte_at simpler by not
requiring it to do a hash flush, for performance reasons. Hence we cannot
use them while updating the _PAGE_NUMA bit. Add new functions for marking
pte/pmd numa.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/pgtable.h | 22 ++
 include/asm-generic/pgtable.h  | 24 
 mm/huge_memory.c   |  9 ++---
 mm/mprotect.c  |  4 +---
 4 files changed, 49 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index f83b6f3e1b39..3ebb188c3ff5 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -75,12 +75,34 @@ static inline pte_t pte_mknuma(pte_t pte)
return pte;
 }
 
+#define ptep_set_numa ptep_set_numa
+static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
+pte_t *ptep)
+{
+   if ((pte_val(*ptep)  _PAGE_PRESENT) == 0)
+   VM_BUG_ON(1);
+
+   pte_update(mm, addr, ptep, _PAGE_PRESENT, _PAGE_NUMA, 0);
+   return;
+}
+
 #define pmd_numa pmd_numa
 static inline int pmd_numa(pmd_t pmd)
 {
return pte_numa(pmd_pte(pmd));
 }
 
+#define pmdp_set_numa pmdp_set_numa
+static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
+pmd_t *pmdp)
+{
+   if ((pmd_val(*pmdp)  _PAGE_PRESENT) == 0)
+   VM_BUG_ON(1);
+
+   pmd_hugepage_update(mm, addr, pmdp, _PAGE_PRESENT, _PAGE_NUMA);
+   return;
+}
+
 #define pmd_mknonnuma pmd_mknonnuma
 static inline pmd_t pmd_mknonnuma(pmd_t pmd)
 {
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 8e4f41d9af4d..93fdb5315a0d 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -669,6 +669,18 @@ static inline int pmd_numa(pmd_t pmd)
 }
 #endif
 
+#ifndef pmdp_set_numa
+static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
+pmd_t *pmdp)
+{
+   pmd_t pmd = *pmdp;
+
+   pmd = pmd_mknuma(entry);
+   set_pmd_at(mm, addr, pmdp, pmd);
+   return;
+}
+#endif
+
 /*
  * pte/pmd_mknuma sets the _PAGE_ACCESSED bitflag automatically
  * because they're called by the NUMA hinting minor page fault. If we
@@ -701,6 +713,18 @@ static inline pte_t pte_mknuma(pte_t pte)
 }
 #endif
 
+#ifndef ptep_set_numa
+static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
+pte_t *ptep)
+{
+   pte_t ptent = *ptep;
+
+   ptent = pte_mknuma(ptent);
+   set_pte_at(mm, addr, ptep, ptent);
+   return;
+}
+#endif
+
 #ifndef pmd_mknuma
 static inline pmd_t pmd_mknuma(pmd_t pmd)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 82166bf974e1..da23eb96779f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1545,6 +1545,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t 
*pmd,
entry = pmd_mknonnuma(entry);
entry = pmd_modify(entry, newprot);
ret = HPAGE_PMD_NR;
+   set_pmd_at(mm, addr, pmd, entry);
BUG_ON(pmd_write(entry));
} else {
struct page *page = pmd_page(*pmd);
@@ -1557,16 +1558,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t 
*pmd,
 */
if (!is_huge_zero_page(page) 
!pmd_numa(*pmd)) {
-   entry = *pmd;
-   entry = pmd_mknuma(entry);
+   pmdp_set_numa(mm, addr, pmd);
ret = HPAGE_PMD_NR;
}
}
-
-   /* Set PMD if cleared earlier */
-   if (ret == HPAGE_PMD_NR)
-   set_pmd_at(mm, addr, pmd, entry);
-
spin_unlock(ptl);
}
 
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 33eab902f10e..769a67a15803 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -69,12 +69,10 @@ static unsigned long change_pte_range(struct vm_area_struct 
*vma, pmd_t *pmd,
} else {
struct page *page;
 
-   ptent = *pte;
page = vm_normal_page(vma, addr, 

[PATCH 0/3] powerpc: Fix random application crashes with NUMA_BALANCING enabled

2014-02-11 Thread Aneesh Kumar K.V
Hello,

This patch series fixes random application crashes observed on ppc64 with numa
balancing enabled. Without the patches we see crashes like:

anacron[14551]: unhandled signal 11 at 0041 nip 3cfd54b4 lr 
3cfd5464 code 30001
anacron[14599]: unhandled signal 11 at 0041 nip 3efc54b4 lr 
3efc5464 code 30001

-aneesh

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH v2] powerpc ticket locks

2014-02-11 Thread Torsten Duwe
On Tue, Feb 11, 2014 at 03:23:51PM +0530, Raghavendra KT wrote:
 How much important to have holder information for PPC? From my
 previous experiment
 on x86, it was lock-waiter preemption which is problematic rather than
 lock-holder preemption.

It's something very special to IBM pSeries: the hypervisor can assign
fractions of physical CPUs to guests. Sometimes a guest with 4 quarter
CPUs will be faster than 1 monoprocessor. (correct me if I'm wrong).

The directed yield resolves the silly situation when holder and waiter
reside on the same physical CPU, as I understand it.

x86 has nothing comparable.

Torsten

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH v2 0/2] Support of the kmcoge4 board

2014-02-11 Thread Valentin Longchamp
This series adds support for Keymile's COGE4 board, called kmcoge4. This
board is the reference design for further designs at Keymile around the
P2040/P2041 SoCs from Freescale. This reference design is internally
called kmp204x.

Changes in v2:
- add a patch so that the Zarlink vendor prefix is defined
- add some nodes on the localbus CS when possible
- only use the corenet_generic machine and add kmcoge4 to the supported
  boards instead of defining a new kmp204x machine
- set better and more precise device nodes for the spi devices
- remove the partition layout for the spi_flash@0

Valentin Longchamp (2):
  devicetree: bindings: add Zarlink to the vendor prefixes
  powerpc/mpc85xx: add support for Keymile's kmcoge4 board

 .../devicetree/bindings/vendor-prefixes.txt|   1 +
 arch/powerpc/boot/dts/kmcoge4.dts  | 161 +++
 arch/powerpc/configs/85xx/kmp204x_defconfig| 227 +
 arch/powerpc/platforms/85xx/Kconfig|   2 +-
 arch/powerpc/platforms/85xx/corenet_generic.c  |   3 +-
 5 files changed, 392 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/boot/dts/kmcoge4.dts
 create mode 100644 arch/powerpc/configs/85xx/kmp204x_defconfig

-- 
1.8.0.1

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH v2 1/2] devicetree: bindings: add Zarlink to the vendor prefixes

2014-02-11 Thread Valentin Longchamp
Even though the company belongs to Microsemi, many chips are still
labeled as Zarlink. Among them is the family of network clock generators,
the zl3034x.

Signed-off-by: Valentin Longchamp valentin.longch...@keymile.com


---
Changes in v2:
- add a patch so that the Zarlink vendor prefix is defined

 Documentation/devicetree/bindings/vendor-prefixes.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt 
b/Documentation/devicetree/bindings/vendor-prefixes.txt
index edbb8d8..35f1c9b 100644
--- a/Documentation/devicetree/bindings/vendor-prefixes.txt
+++ b/Documentation/devicetree/bindings/vendor-prefixes.txt
@@ -81,3 +81,4 @@ winbond Winbond Electronics corp.
 wlfWolfson Microelectronics
 wm Wondermedia Technologies, Inc.
 xlnx   Xilinx
+zarlinkZarlink Semiconductor
-- 
1.8.0.1

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH v2 2/2] powerpc/mpc85xx: add support for Keymile's kmcoge4 board

2014-02-11 Thread Valentin Longchamp
This patch introduces the support for Keymile's kmcoge4 board which is
the internal reference design for boards based on Freescale's
P2040/P2041 SoCs. This internal reference design is named kmp204x.

The peripherals used on this board are:
- SPI NOR Flash as bootloader medium
- NAND Flash with a ubi partition
- 2 PCIe busses (hosts 1 and 3)
- 3 FMAN Ethernet devices (FMAN1 DTSEC1/2/5)
- 4 Local Bus windows, with one dedicated to the QRIO reset/power mgmt
  CPLD
- 2 I2C busses
- last but not least, the mandatory serial port

The patch also adds a defconfig file for this reference design that is
necessary because of the lowmem option that must be set higher due to
the number of PCIe devices with big ioremapped mem ranges on the board.

Signed-off-by: Valentin Longchamp valentin.longch...@keymile.com

---
Changes in v2:
- add some nodes on the localbus CS when possible
- only use the corenet_generic machine and add kmcoge4 to the supported
  boards instead of defining a new kmp204x machine
- set better and more precise device nodes for the spi devices
- remove the partition layout for the spi_flash@0

 arch/powerpc/boot/dts/kmcoge4.dts | 161 ++
 arch/powerpc/configs/85xx/kmp204x_defconfig   | 227 ++
 arch/powerpc/platforms/85xx/Kconfig   |   2 +-
 arch/powerpc/platforms/85xx/corenet_generic.c |   3 +-
 4 files changed, 391 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/boot/dts/kmcoge4.dts
 create mode 100644 arch/powerpc/configs/85xx/kmp204x_defconfig

diff --git a/arch/powerpc/boot/dts/kmcoge4.dts 
b/arch/powerpc/boot/dts/kmcoge4.dts
new file mode 100644
index 000..5eab9df
--- /dev/null
+++ b/arch/powerpc/boot/dts/kmcoge4.dts
@@ -0,0 +1,161 @@
+/*
+ * Keymile kmcoge4 Device Tree Source, based on the P2041RDB DTS
+ *
+ * (C) Copyright 2014
+ * Valentin Longchamp, Keymile AG, valentin.longch...@keymile.com
+ *
+ * Copyright 2011 Freescale Semiconductor Inc.
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ */
+
+/include/ fsl/p2041si-pre.dtsi
+
+/ {
+   model = keymile,kmcoge4;
+   compatible = keymile,kmcoge4, keymile,kmp204x;
+   #address-cells = 2;
+   #size-cells = 2;
+   interrupt-parent = mpic;
+
+   memory {
+   device_type = memory;
+   };
+
+   dcsr: dcsr@f {
+   ranges = 0x 0xf 0x 0x01008000;
+   };
+
+   soc: soc@ffe00 {
+   ranges = 0x 0xf 0xfe00 0x100;
+   reg = 0xf 0xfe00 0 0x1000;
+   spi@11 {
+   flash@0 {
+   #address-cells = 1;
+   #size-cells = 1;
+   compatible = spansion,s25fl256s1;
+   reg = 0;
+   spi-max-frequency = 2000; /* input clock 
*/
+   };
+
+   network_clock@1 {
+   compatible = zarlink,zl30343;
+   reg = 1;
+   spi-max-frequency = 800;
+   };
+
+   flash@2 {
+   #address-cells = 1;
+   #size-cells = 1;
+   compatible = micron,m25p32;
+   reg = 2;
+   spi-max-frequency = 1500;
+   };
+   };
+
+   i2c@119000 {
+   status = disabled;
+   };
+
+   i2c@119100 {
+   status = disabled;
+   };
+
+   usb0: usb@21 {
+   status = disabled;
+   };
+
+   usb1: usb@211000 {
+   status = disabled;
+   };
+
+   sata@22 {
+   status = disabled;
+   };
+
+   sata@221000 {
+   status = disabled;
+   };
+   };
+
+   rio: rapidio@ffe0c {
+   status = disabled;
+   };
+
+   lbc: localbus@ffe124000 {
+   reg = 0xf 0xfe124000 0 0x1000;
+   ranges = 0 0 0xf 0xffa0 0x0004 /* LB 0 */
+ 1 0 0xf 0xfb00 0x0001 /* LB 1 */
+ 2 0 0xf 0xd000 0x1000 /* LB 2 */
+ 3 0 0xf 0xe000 0x1000;   /* LB 3 */
+
+   nand@0,0 {
+   #address-cells = 1;
+   #size-cells = 1;
+   compatible = fsl,elbc-fcm-nand;
+   reg = 0 0 0x4;
+
+

Re: [PATCH 2/3] mm: dirty accountable change only apply to non prot numa case

2014-02-11 Thread Rik van Riel
On 02/11/2014 05:34 AM, Aneesh Kumar K.V wrote:
 From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 
 So move it within the if loop
 
 Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

Reviewed-by: Rik van Riel r...@redhat.com

-- 
All rights reversed
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 3/3] mm: Use ptep/pmdp_set_numa for updating _PAGE_NUMA bit

2014-02-11 Thread Rik van Riel
On 02/11/2014 05:34 AM, Aneesh Kumar K.V wrote:
 From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 
 Archs like ppc64 doesn't do tlb flush in set_pte/pmd functions. ppc64 also 
 doesn't implement
 flush_tlb_range. ppc64 require the tlb flushing to be batched within ptl 
 locks. The reason
 to do that is to ensure that the hash page table is in sync with linux page 
 table.
 We track the hpte index in linux pte and if we clear them without flushing 
 hash and drop the
 ptl lock, we can have another cpu update the pte and can end up with double 
 hash. We also want
 to keep set_pte_at simpler by not requiring them to do hash flush for 
 performance reason.
 Hence cannot use them while updating _PAGE_NUMA bit. Add new functions for 
 marking pte/pmd numa
 
 Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

Reviewed-by: Rik van Riel r...@redhat.com

-- 
All rights reversed
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 1/3] powerpc: mm: Add new set flag argument to pte/pmd update function

2014-02-11 Thread Rik van Riel
On 02/11/2014 05:34 AM, Aneesh Kumar K.V wrote:
 From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 
 We will use this later to set the _PAGE_NUMA bit.
 
 Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

Acked-by: Rik van Riel r...@redhat.com

-- 
All rights reversed
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH V4 2/3] tick/cpuidle: Initialize hrtimer mode of broadcast

2014-02-11 Thread Thomas Gleixner
On Tue, 11 Feb 2014, Daniel Lezcano wrote:
 On 02/07/2014 09:06 AM, Preeti U Murthy wrote:
 Setting the smp affinity on the earliest timer should be handled automatically
 with the CLOCK_EVT_FEAT_DYNIRQ flag. Did you look at using this flag ?

How should this flag help? Not at all, because the hrtimer based
broadcast device cannot assign affinities.
 
 Another comment is the overall approach. We enter the cpuidle idle framework
 with a specific state to go to and it is the tick framework telling us we
 mustn't go to this state. IMO the logic is wrong, the decision to not enter
 this state should be moved somewhere else.
 
 Why don't you create a cpuidle driver with the shallow idle states assigned to
 a cpu (let's say cpu0) and another one with all the deeper idle states for the
 rest of the cpus ? Using the multiple cpuidle driver support makes it
 possible. The timer won't be moving around and a cpu will be dedicated to act
 as the broadcast timer.
 
 Wouldn't make sense and be less intrusive than the patchset you proposed ?

How do you arm the broadcast timer on CPU0 from CPU1? You can't!

You cannot access the cpu local timer on a different cpu. So you would
have to send an IPI over to CPU0 so that it can reevaluate and
schedule the broadcast. That's even more backwards than telling the
cpuidle code that the CPU is not in a state to go deep.

Thanks,

tglx


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH V4 2/3] tick/cpuidle: Initialize hrtimer mode of broadcast

2014-02-11 Thread Preeti U Murthy
Hi Daniel,

Thank you very much for the review.

On 02/11/2014 03:46 PM, Daniel Lezcano wrote:
 On 02/07/2014 09:06 AM, Preeti U Murthy wrote:
 From: Thomas Gleixner t...@linutronix.de

 On some architectures, in certain CPU deep idle states the local
 timers stop.
 An external clock device is used to wakeup these CPUs. The kernel
 support for the
 wakeup of these CPUs is provided by the tick broadcast framework by
 using the
 external clock device as the wakeup source.

 However not all implementations of architectures provide such an external
 clock device. This patch includes support in the broadcast framework
 to handle
 the wakeup of the CPUs in deep idle states on such systems by queuing
 a hrtimer
 on one of the CPUs, which is meant to handle the wakeup of CPUs in
 deep idle states.

 This patchset introduces a pseudo clock device which can be registered
 by the
 archs as tick_broadcast_device in the absence of a real external clock
 device. Once registered, the broadcast framework will work as is for
 these
 architectures as long as the archs take care of the BROADCAST_ENTER
 notification failing for one of the CPUs. This CPU is made the stand
 by CPU to
 handle wakeup of the CPUs in deep idle and it *must not enter deep
 idle states*.

 The CPU with the earliest wakeup is chosen to be this CPU. Hence this
 way the
 stand by CPU dynamically moves around and so does the hrtimer which is
 queued
 to trigger at the next earliest wakeup time. This is consistent with
 the case where
 an external clock device is present. The smp affinity of this clock
 device is
 set to the CPU with the earliest wakeup.
 
 Hi Preeti,
 
 jumping a bit late in the thread...
 
 Setting the smp affinity on the earliest timer should be handled
 automatically with the CLOCK_EVT_FEAT_DYNIRQ flag. Did you look at using
 this flag ?

This patch is not setting the smp affinity of the pseudo clock device at
all. It's not required to, for the reason that it does not exist.

I mentioned this point because we assign a CPU with the earliest wakeup
as standby. I compared this logic to the one used by the tick broadcast
framework for archs which have an external clock device to set the smp
affinity of the device.

If these archs do not have the flag CLOCK_EVT_FEAT_DYNIRQ set for the
external clock device, the tick broadcast framework sets the smp
affinity of this device to the CPU with the earliest wakeup. We are
using the same logic in this patchset as well to assign the stand by CPU.

 
 Another comment is the overall approach. We enter the cpuidle idle
 framework with a specific state to go to and it is the tick framework
 telling us we mustn't go to this state. IMO the logic is wrong, the
 decision to not enter this state should be moved somewhere else.

It's not the tick framework which tells us that we cannot enter deep idle
state, it's the *tick broadcast* framework specifically. The tick
broadcast framework was introduced with the primary intention of
handling wakeup of CPUs in deep idle states when the local timers become
non-functional. Therefore there is a co-operation between this tick
broadcast framework and cpuidle. This has always been the case.

That is why just before cpus go into deep idle, they call into the
broadcast framework. Till now it was assumed that the tick broadcast
framework would find no problems with the cpus entering deep idle.
Therefore cpuidle would simply assume that all is well and go ahead and
enter deep idle state.
  But today there is a scenario when there could be problems if all cpus
enter deep idle states and the tick broadcast framework now notifies the
cpuidle framework to hold back one cpu. This is just a simple extension
of the current interaction between cpuidle and tick broadcast framework.

 
 Why don't you create a cpuidle driver with the shallow idle states
 assigned to a cpu (let's say cpu0) and another one with all the deeper
 idle states for the rest of the cpus ? Using the multiple cpuidle driver
 support makes it possible. The timer won't be moving around and a cpu
 will be dedicated to act as the broadcast timer.
 

Having a dedicated stand by cpu for broadcasting has some issues which
were pointed to when I posted the initial versions of this patchset.
https://lkml.org/lkml/2013/7/27/14

1. This could create power/thermal imbalance on the chip since only the
standby cpu cannot enter deep idle state at all times.

2. If it is cpu0 it is fine, else with the logic that you suggest,
hot-plugging out the dedicated stand by cpu would mean moving the work
of broadcasting to another cpu and modifying the cpuidle state table for
it. Even with cpu0, if support to hotplug it out is enabled (maybe it is
already), we will face the same issue and this gets very messy.

 Wouldn't make sense and be less intrusive than the patchset you proposed ?

Actually this patchset brings in a solution that is as unintrusive as
possible. It makes the problem nearly invisible except for a failed
return from a 

Re: [PATCH 2/3] mm: dirty accountable change only apply to non prot numa case

2014-02-11 Thread Mel Gorman
On Tue, Feb 11, 2014 at 04:04:54PM +0530, Aneesh Kumar K.V wrote:
 From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 
 So move it within the if loop
 
 Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

Acked-by: Mel Gorman mgor...@suse.de

-- 
Mel Gorman
SUSE Labs
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 3/3] mm: Use ptep/pmdp_set_numa for updating _PAGE_NUMA bit

2014-02-11 Thread Mel Gorman
On Tue, Feb 11, 2014 at 04:04:55PM +0530, Aneesh Kumar K.V wrote:
 From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 
 Archs like ppc64 doesn't do tlb flush in set_pte/pmd functions. ppc64 also 
 doesn't implement
 flush_tlb_range. ppc64 require the tlb flushing to be batched within ptl 
 locks. The reason
 to do that is to ensure that the hash page table is in sync with linux page 
 table.
 We track the hpte index in linux pte and if we clear them without flushing 
 hash and drop the
 ptl lock, we can have another cpu update the pte and can end up with double 
 hash. We also want
 to keep set_pte_at simpler by not requiring them to do hash flush for 
 performance reason.
 Hence cannot use them while updating _PAGE_NUMA bit. Add new functions for 
 marking pte/pmd numa
 
 Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

Acked-by: Mel Gorman mgor...@suse.de

-- 
Mel Gorman
SUSE Labs
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 1/3] powerpc: mm: Add new set flag argument to pte/pmd update function

2014-02-11 Thread Mel Gorman
On Tue, Feb 11, 2014 at 04:04:53PM +0530, Aneesh Kumar K.V wrote:
 From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 
 We will use this later to set the _PAGE_NUMA bit.
 
 Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

Acked-by: Mel Gorman mgor...@suse.de

-- 
Mel Gorman
SUSE Labs
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH V2] powerpc: thp: Fix crash on mremap

2014-02-11 Thread Greg KH
On Fri, Feb 07, 2014 at 07:21:57PM +0530, Aneesh Kumar K.V wrote:
 From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 
 This patch fix the below crash
 
 NIP [c004cee4] .__hash_page_thp+0x2a4/0x440
 LR [c00439ac] .hash_page+0x18c/0x5e0
 ...
 Call Trace:
 [c00736103c40] [1b00] 0x1b00(unreliable)
 [437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0
 [437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58
 
 On ppc64 we use the pgtable for storing the hpte slot information and
 store address to the pgtable at a constant offset (PTRS_PER_PMD) from
 pmd. On mremap, when we switch the pmd, we need to withdraw and deposit
 the pgtable again, so that we find the pgtable at PTRS_PER_PMD offset
 from new pmd.
 
 We also want to move the withdraw and deposit before the set_pmd so
 that, when page fault find the pmd as trans huge we can be sure that
 pgtable can be located at the offset.
 
 variant of upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f
 for 3.12 stable series

This doesn't look like a variant, it looks totally different.  Why
can't I just take the b3084f4db3aeb991c507ca774337c7e7893ed04f patch
(and follow-on fix) for 3.12?

I _REALLY_ dislike patches that are totally different from Linus's tree
in stable trees, it has caused nothing but problems in the past.

greg k-h
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH v2] powerpc ticket locks

2014-02-11 Thread Scott Wood
On Tue, 2014-02-11 at 11:40 +0100, Torsten Duwe wrote:
 On Tue, Feb 11, 2014 at 03:23:51PM +0530, Raghavendra KT wrote:
  How much important to have holder information for PPC? From my
  previous experiment
  on x86, it was lock-waiter preemption which is problematic rather than
  lock-holder preemption.
 
 It's something very special to IBM pSeries: the hypervisor can assign
 fractions of physical CPUs to guests. Sometimes a guest with 4 quarter
 CPUs will be faster than 1 monoprocessor. (correct me if I'm wrong).
 
 The directed yield resolves the silly situation when holder and waiter
 reside on the same physical CPU, as I understand it.
 
 x86 has nothing comparable.

How is this different from the very ordinary case of an SMP KVM guest
whose vcpus are not bound to host cpus, and thus you could have multiple
vcpus running on the same host cpu?

-Scott


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [RFC PATCH 2/3] topology: support node_numa_mem() for determining the fallback node

2014-02-11 Thread Christoph Lameter
On Mon, 10 Feb 2014, Joonsoo Kim wrote:

 On Fri, Feb 07, 2014 at 12:51:07PM -0600, Christoph Lameter wrote:
  Here is a draft of a patch to make this work with memoryless nodes.
 
  The first thing is that we modify node_match to also match if we hit an
  empty node. In that case we simply take the current slab if its there.

 Why not inspecting whether we can get the page on the best node such as
 numa_mem_id() node?

It's expensive to do so.

 empty_node cannot be set on memoryless node, since page allocation would
 succeed on different node.

Ok then we need to add a check for being on the right node there too.

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 3/3] mm: Use ptep/pmdp_set_numa for updating _PAGE_NUMA bit

2014-02-11 Thread Benjamin Herrenschmidt
On Tue, 2014-02-11 at 17:07 +, Mel Gorman wrote:
 On Tue, Feb 11, 2014 at 04:04:55PM +0530, Aneesh Kumar K.V wrote:
  From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
  
  Archs like ppc64 doesn't do tlb flush in set_pte/pmd functions. ppc64 also 
  doesn't implement
  flush_tlb_range. ppc64 require the tlb flushing to be batched within ptl 
  locks. The reason
  to do that is to ensure that the hash page table is in sync with linux page 
  table.
  We track the hpte index in linux pte and if we clear them without flushing 
  hash and drop the
  ptl lock, we can have another cpu update the pte and can end up with double 
  hash. We also want
  to keep set_pte_at simpler by not requiring them to do hash flush for 
  performance reason.
  Hence cannot use them while updating _PAGE_NUMA bit. Add new functions for 
  marking pte/pmd numa
  
  Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 
 Acked-by: Mel Gorman mgor...@suse.de
 

How do you guys want me to proceed ? Will you (or Andrew) send these to
Linus or should I do it myself ?

Cheers,
Ben.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH V2] powerpc: thp: Fix crash on mremap

2014-02-11 Thread Benjamin Herrenschmidt
On Tue, 2014-02-11 at 09:31 -0800, Greg KH wrote:
 On Fri, Feb 07, 2014 at 07:21:57PM +0530, Aneesh Kumar K.V wrote:
  From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
  
  This patch fix the below crash
  
  NIP [c004cee4] .__hash_page_thp+0x2a4/0x440
  LR [c00439ac] .hash_page+0x18c/0x5e0
  ...
  Call Trace:
  [c00736103c40] [1b00] 0x1b00(unreliable)
  [437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0
  [437908.479699] [c00736103e30] [c000924c] 
  .do_hash_page+0x4c/0x58
  
  On ppc64 we use the pgtable for storing the hpte slot information and
  store address to the pgtable at a constant offset (PTRS_PER_PMD) from
  pmd. On mremap, when we switch the pmd, we need to withdraw and deposit
  the pgtable again, so that we find the pgtable at PTRS_PER_PMD offset
  from new pmd.
  
  We also want to move the withdraw and deposit before the set_pmd so
  that, when page fault find the pmd as trans huge we can be sure that
  pgtable can be located at the offset.
  
  variant of upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f
  for 3.12 stable series
 
 This doesn't look like a variant, it looks totally different.  Why
 can't I just take the b3084f4db3aeb991c507ca774337c7e7893ed04f patch
 (and follow-on fix) for 3.12?
 
 I _REALLY_ dislike patches that are totally different from Linus's tree
 in stable trees, it has caused nothing but problems in the past.

I don't think it applies... (I tried on an internal tree) but the
affected function changed in 3.13 in various ways. Aneesh, please
provide a more detailed explanation and whether we should backport those
other changes too or whether this is not necessary.

BTW. Aneesh, we need a 3.11.x one too

Cheers,
Ben.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH v2] powerpc ticket locks

2014-02-11 Thread Benjamin Herrenschmidt
On Tue, 2014-02-11 at 12:30 -0600, Scott Wood wrote:
  It's something very special to IBM pSeries: the hypervisor can assign
  fractions of physical CPUs to guests. Sometimes a guest with 4 quarter
  CPUs will be faster than 1 monoprocessor. (correct me if I'm wrong).
  
  The directed yield resolves the silly situation when holder and waiter
  reside on the same physical CPU, as I understand it.
  
  x86 has nothing comparable.
 
 How is this different from the very ordinary case of an SMP KVM guest
 whose vcpus are not bound to host cpus, and thus you could have multiple
 vcpus running on the same host cpu?

It's not really ... though I can see drawbacks with the scheme as well
and I think in KVM we should be careful to only confer if the owner
vcpu last scheduled on the same physical cpu where the waiter is,
otherwise there's too much chance of us bouncing things around the machine
for minor contention cases.

Paul, what's your policy today ?

Cheers,
Ben.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Linux-3.14-rc2: Order of serial node compatibles in DTS files.

2014-02-11 Thread Stephen N Chivers
I have been trial booting a 3.14-rc2 kernel for a 85xx platform 
(dtbImage).

After mounting the root filesystem there are no messages from the init 
scripts
and the serial console is not available for login.

In the kernel log messages there is:

of_serial f1004500.serial: Unknown serial port found, ignored.

The serial nodes in the board's dts file are specified as:

serial0: serial@4500 {
cell-index = 0;
device_type = serial;
compatible = fsl,ns16550, ns16550;
reg = 0x4500 0x100;
clock-frequency = 0;
interrupts = 0x2a 0x2;
interrupt-parent = mpic;
};

Reversing the order of the compatible:

compatible = ns16550, fsl,ns16550;

restores the serial console.

Linux-3.13 does not have this behaviour.

There are 49 dts files in Linux-3.14-rc2 that have the fsl,ns16550 
compatible first.

Stephen Chivers,
CSC Australia Pty. Ltd.
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH V4 2/3] tick/cpuidle: Initialize hrtimer mode of broadcast

2014-02-11 Thread Daniel Lezcano

On 02/11/2014 04:58 PM, Thomas Gleixner wrote:

On Tue, 11 Feb 2014, Daniel Lezcano wrote:

On 02/07/2014 09:06 AM, Preeti U Murthy wrote:
Setting the smp affinity on the earliest timer should be handled automatically
with the CLOCK_EVT_FEAT_DYNIRQ flag. Did you look at using this flag ?


How should this flag help? Not at all, because the hrtimer based
broadcast device cannot assign affinities.


Another comment is the overall approach. We enter the cpuidle idle framework
with a specific state to go to and it is the tick framework telling us we
mustn't go to this state. IMO the logic is wrong, the decision to not enter
this state should be moved somewhere else.

Why don't you create a cpuidle driver with the shallow idle states assigned to
a cpu (let's say cpu0) and another one with all the deeper idle states for the
rest of the cpus ? Using the multiple cpuidle driver support makes it
possible. The timer won't be moving around and a cpu will be dedicated to act
as the broadcast timer.

Wouldn't make sense and be less intrusive than the patchset you proposed ?


How do you arm the broadcast timer on CPU0 from CPU1? You can't!

You cannot access the cpu local timer on a different cpu. So you would
have to send an IPI over to CPU0 so that it can reevaluate and
schedule the broadcast. That's even more backwards than telling the
cpuidle code that the CPU is not in a state to go deep.


Indeed :)

Thanks for the clarification.

  -- Daniel

--
 http://www.linaro.org/ Linaro.org │ Open source software for ARM SoCs

Follow Linaro:  http://www.facebook.com/pages/Linaro Facebook |
http://twitter.com/#!/linaroorg Twitter |
http://www.linaro.org/linaro-blog/ Blog

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH V4 2/3] tick/cpuidle: Initialize hrtimer mode of broadcast

2014-02-11 Thread Daniel Lezcano

On 02/11/2014 05:09 PM, Preeti U Murthy wrote:

Hi Daniel,

Thank you very much for the review.

On 02/11/2014 03:46 PM, Daniel Lezcano wrote:

On 02/07/2014 09:06 AM, Preeti U Murthy wrote:

From: Thomas Gleixner t...@linutronix.de

On some architectures, in certain CPU deep idle states the local
timers stop.
An external clock device is used to wakeup these CPUs. The kernel
support for the
wakeup of these CPUs is provided by the tick broadcast framework by
using the
external clock device as the wakeup source.

However not all implementations of architectures provide such an external
clock device. This patch includes support in the broadcast framework
to handle
the wakeup of the CPUs in deep idle states on such systems by queuing
a hrtimer
on one of the CPUs, which is meant to handle the wakeup of CPUs in
deep idle states.

This patchset introduces a pseudo clock device which can be registered
by the
archs as tick_broadcast_device in the absence of a real external clock
device. Once registered, the broadcast framework will work as is for
these
architectures as long as the archs take care of the BROADCAST_ENTER
notification failing for one of the CPUs. This CPU is made the stand
by CPU to
handle wakeup of the CPUs in deep idle and it *must not enter deep
idle states*.

The CPU with the earliest wakeup is chosen to be this CPU. Hence this
way the
stand by CPU dynamically moves around and so does the hrtimer which is
queued
to trigger at the next earliest wakeup time. This is consistent with
the case where
an external clock device is present. The smp affinity of this clock
device is
set to the CPU with the earliest wakeup.


Hi Preeti,

jumping a bit late in the thread...

Setting the smp affinity on the earliest timer should be handled
automatically with the CLOCK_EVT_FEAT_DYNIRQ flag. Did you look at using
this flag ?


This patch is not setting the smp affinity of the pseudo clock device at
all. Its not required to for the reason that it does not exist.

I mentioned this point because we assign a CPU with the earliest wakeup
as standby. I compared this logic to the one used by the tick broadcast
framework for archs which have an external clock device to set the smp
affinity of the device.

If these archs do not have the flag CLOCK_EVT_FEAT_DYNIRQ set for the
external clock device, the tick broadcast framework sets the smp
affinity of this device to the CPU with the earliest wakeup. We are
using the same logic in this patchset as well to assign the stand by CPU.



Another comment is the overall approach. We enter the cpuidle idle
framework with a specific state to go to and it is the tick framework
telling us we mustn't go to this state. IMO the logic is wrong, the
decision to not enter this state should be moved somewhere else.


It's not the tick framework which tells us that we cannot enter deep idle
state, it's the *tick broadcast* framework specifically. The tick
broadcast framework was introduced with the primary intention of
handling wakeup of CPUs in deep idle states when the local timers become
non-functional. Therefore there is a co-operation between this tick
broadcast framework and cpuidle. This has always been the case.

That is why just before cpus go into deep idle, they call into the
broadcast framework. Till now it was assumed that the tick broadcast
framework would find no problems with the cpus entering deep idle.
Therefore cpuidle would simply assume that all is well and go ahead and
enter deep idle state.
   But today there is a scenario when there could be problems if all cpus
enter deep idle states and the tick broadcast framework now notifies the
cpuidle framework to hold back one cpu. This is just a simple extension
of the current interaction between cpuidle and tick broadcast framework.



Why don't you create a cpuidle driver with the shallow idle states
assigned to a cpu (let's say cpu0) and another one with all the deeper
idle states for the rest of the cpus ? Using the multiple cpuidle driver
support makes it possible. The timer won't be moving around and a cpu
will be dedicated to act as the broadcast timer.



Having a dedicated stand by cpu for broadcasting has some issues which
were pointed out when I posted the initial versions of this patchset.
https://lkml.org/lkml/2013/7/27/14

1. This could create a power/thermal imbalance on the chip, since the
standby cpu alone can never enter deep idle state.

2. If it is cpu0 it is fine, else with the logic that you suggest,
hot-plugging out the dedicated stand by cpu would mean moving the work
of broadcasting to another cpu and modifying the cpuidle state table for
it. Even with cpu0, if support to hotplug it out is enabled (maybe it is
already), we will face the same issue and this gets very messy.


Wouldn't that make sense and be less intrusive than the patchset you proposed ?


Actually this patchset brings in a solution that is as unintrusive as
possible. It makes the problem nearly invisible except for a failed
return 

Re: Linux-3.14-rc2: Order of serial node compatibles in DTS files.

2014-02-11 Thread Kumar Gala

On Feb 11, 2014, at 2:57 PM, Stephen N Chivers schiv...@csc.com.au wrote:

 I have been trial booting a 3.14-rc2 kernel for a 85xx platform 
 (dtbImage).
 
 After mounting the root filesystem there are no messages from the init 
 scripts
 and the serial console is not available for login.
 
 In the kernel log messages there is:
 
 of_serial f1004500.serial: Unknown serial port found, ignored.
 
 The serial nodes in boards dts file are specified as:
 
serial0: serial@4500 {
cell-index = <0>;
device_type = "serial";
compatible = "fsl,ns16550", "ns16550";
reg = <0x4500 0x100>;
clock-frequency = <0>;
interrupts = <0x2a 0x2>;
interrupt-parent = <&mpic>;
};
 
 Reversing the order of the compatible:
 
compatible = ns16550, fsl,ns16550;
 
 restores the serial console.
 
 Linux-3.13 does not have this behaviour.
 
 There are 49 dts files in Linux-3.14-rc2 that have the fsl,ns16550 
 compatible first.

Hmm,

Wondering if this caused the issue:

commit 105353145eafb3ea919f5cdeb652a9d8f270228e
Author: Sebastian Hesselbarth sebastian.hesselba...@gmail.com
Date:   Tue Dec 3 14:52:00 2013 +0100

OF: base: match each node compatible against all given matches first


- k
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: Linux-3.14-rc2: Order of serial node compatibles in DTS files.

2014-02-11 Thread Sebastian Hesselbarth

On 02/11/2014 11:33 PM, Kumar Gala wrote:


On Feb 11, 2014, at 2:57 PM, Stephen N Chivers schiv...@csc.com.au wrote:


I have been trial booting a 3.14-rc2 kernel for a 85xx platform
(dtbImage).

After mounting the root filesystem there are no messages from the init
scripts
and the serial console is not available for login.

In the kernel log messages there is:

of_serial f1004500.serial: Unknown serial port found, ignored.

The serial nodes in boards dts file are specified as:

serial0: serial@4500 {
cell-index = 0;
device_type = serial;
compatible = fsl,ns16550, ns16550;
reg = 0x4500 0x100;
clock-frequency = 0;
interrupts = 0x2a 0x2;
interrupt-parent = mpic;
};

Reversing the order of the compatible:

compatible = ns16550, fsl,ns16550;

restores the serial console.

Linux-3.13 does not have this behaviour.

There are 49 dts files in Linux-3.14-rc2 that have the fsl,ns16550
compatible first.


Hmm,

Wondering if this caused the issue:

commit 105353145eafb3ea919f5cdeb652a9d8f270228e
Author: Sebastian Hesselbarth sebastian.hesselba...@gmail.com
Date:   Tue Dec 3 14:52:00 2013 +0100

 OF: base: match each node compatible against all given matches first


[adding Arnd on Cc]

Could be. I checked tty/serial/of_serial.c and it does not provide a
compatible for fsl,ns16550. Does reverting the patch fix the issue
observed?

I don't think the missing compatible is causing it, but of_serial
provides a DT match for .type = serial just to fail later on
with the error seen above.

The commit in question reorders of_match_device in a way that match
table order is not relevant anymore. This can cause it to match
.type = serial first here.

Rather than touching the commit, I suggest to remove the problematic
.type = serial from the match table. It is of no use anyway.

Sebastian
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: Linux-3.14-rc2: Order of serial node compatibles in DTS files.

2014-02-11 Thread Stephen N Chivers
Sebastian Hesselbarth sebastian.hesselba...@gmail.com wrote on 
02/12/2014 09:51:43 AM:

 From: Sebastian Hesselbarth sebastian.hesselba...@gmail.com
 To: Kumar Gala ga...@kernel.crashing.org, Stephen N Chivers 
 schiv...@csc.com.au
 Cc: linuxppc-dev@lists.ozlabs.org, Chris Proctor 
 cproc...@csc.com.au, devicetree devicet...@vger.kernel.org, Arnd
 Bergmann a...@arndb.de
 Date: 02/12/2014 09:51 AM
 Subject: Re: Linux-3.14-rc2: Order of serial node compatibles in DTS 
files.
 
 On 02/11/2014 11:33 PM, Kumar Gala wrote:
 
  On Feb 11, 2014, at 2:57 PM, Stephen N Chivers schiv...@csc.com.au 
wrote:
 
  I have been trial booting a 3.14-rc2 kernel for a 85xx platform
  (dtbImage).
 
  After mounting the root filesystem there are no messages from the 
init
  scripts
  and the serial console is not available for login.
 
  In the kernel log messages there is:
 
  of_serial f1004500.serial: Unknown serial port found, ignored.
 
  The serial nodes in boards dts file are specified as:
 
  serial0: serial@4500 {
  cell-index = 0;
  device_type = serial;
  compatible = fsl,ns16550, ns16550;
  reg = 0x4500 0x100;
  clock-frequency = 0;
  interrupts = 0x2a 0x2;
  interrupt-parent = mpic;
  };
 
  Reversing the order of the compatible:
 
  compatible = ns16550, fsl,ns16550;
 
  restores the serial console.
 
  Linux-3.13 does not have this behaviour.
 
  There are 49 dts files in Linux-3.14-rc2 that have the fsl,ns16550
  compatible first.
 
  Hmm,
 
  Wondering if this caused the issue:
 
  commit 105353145eafb3ea919f5cdeb652a9d8f270228e
  Author: Sebastian Hesselbarth sebastian.hesselba...@gmail.com
  Date:   Tue Dec 3 14:52:00 2013 +0100
 
   OF: base: match each node compatible against all given matches 
first
 
 [adding Arnd on Cc]
 
 Could be. I checked tty/serial/of_serial.c and it does not provide a
 compatible for fsl,ns16550. Does reverting the patch fix the issue
 observed?
 
 I don't think the missing compatible is causing it, but of_serial
 provides a DT match for .type = serial just to fail later on
 with the error seen above.
 
 The commit in question reorders of_match_device in a way that match
 table order is not relevant anymore. This can cause it to match
 .type = serial first here.
 
 Rather than touching the commit, I suggest to remove the problematic
 .type = serial from the match table. It is of no use anyway.
Deleting the serial line from the match table fixes the problem.
I tested it for both orderings of compatible.
 
 Sebastian

Thanks,
Stephen Chivers,
CSC Australia Pty. Ltd.

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: Linux-3.14-rc2: Order of serial node compatibles in DTS files.

2014-02-11 Thread Scott Wood
On Tue, 2014-02-11 at 23:51 +0100, Sebastian Hesselbarth wrote:
 On 02/11/2014 11:33 PM, Kumar Gala wrote:
  Hmm,
 
  Wondering if this caused the issue:
 
  commit 105353145eafb3ea919f5cdeb652a9d8f270228e
  Author: Sebastian Hesselbarth sebastian.hesselba...@gmail.com
  Date:   Tue Dec 3 14:52:00 2013 +0100
 
   OF: base: match each node compatible against all given matches first
 
 [adding Arnd on Cc]
 
 Could be. I checked tty/serial/of_serial.c and it does not provide a
 compatible for fsl,ns16550. Does reverting the patch fix the issue
 observed?
 
 I don't think the missing compatible is causing it, but of_serial
 provides a DT match for .type = serial just to fail later on
 with the error seen above.
 
 The commit in question reorders of_match_device in a way that match
 table order is not relevant anymore. This can cause it to match
 .type = serial first here.
 
 Rather than touching the commit, I suggest to remove the problematic
 .type = serial from the match table. It is of no use anyway.

Regardless of whether .type = serial gets removed, it seems wrong for
of_match_node() to accept a .type-only match (or .name, or anything else
that doesn't involve .compatible) before it accepts a compatible match
other than the first in the compatible property.

-Scott


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: Linux-3.14-rc2: Order of serial node compatibles in DTS files.

2014-02-11 Thread Sebastian Hesselbarth

On 02/12/2014 12:38 AM, Stephen N Chivers wrote:

Sebastian Hesselbarth sebastian.hesselba...@gmail.com wrote on

On 02/11/2014 11:33 PM, Kumar Gala wrote:

On Feb 11, 2014, at 2:57 PM, Stephen N Chivers schiv...@csc.com.au wrote:

I have been trial booting a 3.14-rc2 kernel for a 85xx platform
(dtbImage).

[...]


of_serial f1004500.serial: Unknown serial port found, ignored.

The serial nodes in boards dts file are specified as:

 serial0: serial@4500 {
 cell-index = 0;
 device_type = serial;
 compatible = fsl,ns16550, ns16550;
 reg = 0x4500 0x100;
 clock-frequency = 0;
 interrupts = 0x2a 0x2;
 interrupt-parent = mpic;
 };


Wondering if this caused the issue:

commit 105353145eafb3ea919f5cdeb652a9d8f270228e
Author: Sebastian Hesselbarth sebastian.hesselba...@gmail.com
Date:   Tue Dec 3 14:52:00 2013 +0100

  OF: base: match each node compatible against all given matches first



[...]


I don't think the missing compatible is causing it, but of_serial
provides a DT match for .type = serial just to fail later on
with the error seen above.

The commit in question reorders of_match_device in a way that match
table order is not relevant anymore. This can cause it to match
.type = serial first here.

Rather than touching the commit, I suggest to remove the problematic
.type = serial from the match table. It is of no use anyway.

Deleting the serial line from the match table fixes the problem.
I tested it for both orderings of compatible.


I retract my statement about removing anything from of_serial.c. Instead
we should try to prefer matches with compatibles over type/name without
compatibles, something like the patch below (compile tested only):





diff --git a/drivers/of/base.c b/drivers/of/base.c
index ff85450d5683..60da53b385ff 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -734,6 +734,7 @@ static
 const struct of_device_id *__of_match_node(const struct of_device_id *matches,
 	   const struct device_node *node)
 {
+	const struct of_device_id *m;
 	const char *cp;
 	int cplen, l;
 
@@ -742,15 +743,15 @@ const struct of_device_id *__of_match_node(const struct of_device_id *matches,
 
 	cp = __of_get_property(node, "compatible", &cplen);
 	do {
-		const struct of_device_id *m = matches;
+		m = matches;
 
 		/* Check against matches with current compatible string */
 		while (m->name[0] || m->type[0] || m->compatible[0]) {
 			int match = 1;
-			if (m->name[0])
+			if (m->name[0] && m->compatible[0])
 				match &= node->name
 					&& !strcmp(m->name, node->name);
-			if (m->type[0])
+			if (m->type[0] && m->compatible[0])
 				match &= node->type
 					&& !strcmp(m->type, node->type);
 			if (m->compatible[0])
@@ -770,6 +771,21 @@ const struct of_device_id *__of_match_node(const struct of_device_id *matches,
 		}
 	} while (cp && (cplen > 0));
 
+	/* Check against matches without compatible string */
+	m = matches;
+	while (m->name[0] || m->type[0]) {
+		int match = 1;
+		if (m->name[0])
+			match &= node->name
+				&& !strcmp(m->name, node->name);
+		if (m->type[0])
+			match &= node->type
+				&& !strcmp(m->type, node->type);
+		if (match)
+			return m;
+		m++;
+	}
+
 	return NULL;
 }
 
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: Linux-3.14-rc2: Order of serial node compatibles in DTS files.

2014-02-11 Thread Sebastian Hesselbarth

On 02/12/2014 12:41 AM, Scott Wood wrote:

On Tue, 2014-02-11 at 23:51 +0100, Sebastian Hesselbarth wrote:

On 02/11/2014 11:33 PM, Kumar Gala wrote:

Hmm,

Wondering if this caused the issue:

commit 105353145eafb3ea919f5cdeb652a9d8f270228e
Author: Sebastian Hesselbarth sebastian.hesselba...@gmail.com
Date:   Tue Dec 3 14:52:00 2013 +0100

  OF: base: match each node compatible against all given matches first


[adding Arnd on Cc]

Could be. I checked tty/serial/of_serial.c and it does not provide a
compatible for fsl,ns16550. Does reverting the patch fix the issue
observed?

I don't think the missing compatible is causing it, but of_serial
provides a DT match for .type = serial just to fail later on
with the error seen above.

The commit in question reorders of_match_device in a way that match
table order is not relevant anymore. This can cause it to match
.type = serial first here.

Rather than touching the commit, I suggest to remove the problematic
.type = serial from the match table. It is of no use anyway.


Regardless of whether .type = serial gets removed, it seems wrong for
of_match_node() to accept a .type-only match (or .name, or anything else
that doesn't involve .compatible) before it accepts a compatible match
other than the first in the compatible property.


Right, I thought about it and came to the same conclusion. I sent a
patch a second ago to prefer .compatible != NULL matches over those
with .compatible == NULL.

Would be great if Stephen can re-test that. If it solves the issue, I
can send a patch tomorrow.

Sebastian

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: Linux-3.14-rc2: Order of serial node compatibles in DTS files.

2014-02-11 Thread Stephen N Chivers
Sebastian Hesselbarth sebastian.hesselba...@gmail.com wrote on 
02/12/2014 10:46:36 AM:

 From: Sebastian Hesselbarth sebastian.hesselba...@gmail.com
 To: Scott Wood scottw...@freescale.com
 Cc: Kumar Gala ga...@kernel.crashing.org, Stephen N Chivers 
 schiv...@csc.com.au, Chris Proctor cproc...@csc.com.au, 
 linuxppc-dev@lists.ozlabs.org, Arnd Bergmann a...@arndb.de, 
 devicetree devicet...@vger.kernel.org
 Date: 02/12/2014 11:04 AM
 Subject: Re: Linux-3.14-rc2: Order of serial node compatibles in DTS 
files.
 
 On 02/12/2014 12:41 AM, Scott Wood wrote:
  On Tue, 2014-02-11 at 23:51 +0100, Sebastian Hesselbarth wrote:
  On 02/11/2014 11:33 PM, Kumar Gala wrote:
  Hmm,
 
  Wondering if this caused the issue:
 
  commit 105353145eafb3ea919f5cdeb652a9d8f270228e
  Author: Sebastian Hesselbarth sebastian.hesselba...@gmail.com
  Date:   Tue Dec 3 14:52:00 2013 +0100
 
OF: base: match each node compatible against all given matches 
first
 
  [adding Arnd on Cc]
 
  Could be. I checked tty/serial/of_serial.c and it does not provide a
  compatible for fsl,ns16550. Does reverting the patch fix the issue
  observed?
 
  I don't think the missing compatible is causing it, but of_serial
  provides a DT match for .type = serial just to fail later on
  with the error seen above.
 
  The commit in question reorders of_match_device in a way that match
  table order is not relevant anymore. This can cause it to match
  .type = serial first here.
 
  Rather than touching the commit, I suggest to remove the problematic
  .type = serial from the match table. It is of no use anyway.
 
  Regardless of whether .type = serial gets removed, it seems wrong 
for
  of_match_node() to accept a .type-only match (or .name, or anything 
else
  that doesn't involve .compatible) before it accepts a compatible match
  other than the first in the compatible property.
 
 Right, I thought about it and came to the same conclusion. I sent a
 patch a second ago to prefer .compatible != NULL matches over those
 with .compatible == NULL.
 
 Would be great if Stephen can re-test that. If it solves the issue, I
 can send a patch tomorrow.
Done.

But, the Interrupt Controller (MPIC)
goes AWOL and it is down hill from there.

The MPIC is specified in the DTS as:

mpic: pic@4 {
interrupt-controller;
#address-cells = <0>;
#interrupt-cells = <2>;
reg = <0x4 0x4>;
compatible = "chrp,open-pic";
device_type = "open-pic";
big-endian;
};

The board support file has the standard mechanism for allocating
the PIC:

struct mpic *mpic;

mpic = mpic_alloc(NULL, 0, 0, 0, 256, " OpenPIC  ");
BUG_ON(mpic == NULL);

mpic_init(mpic);

I checked for damage in applying the patch and it has applied
correctly.

Stephen Chivers,
CSC Australia Pty. Ltd.

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH V2] powerpc: thp: Fix crash on mremap

2014-02-11 Thread Aneesh Kumar K.V
Greg KH gre...@linuxfoundation.org writes:

 On Fri, Feb 07, 2014 at 07:21:57PM +0530, Aneesh Kumar K.V wrote:
 From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 
 This patch fix the below crash
 
 NIP [c004cee4] .__hash_page_thp+0x2a4/0x440
 LR [c00439ac] .hash_page+0x18c/0x5e0
 ...
 Call Trace:
 [c00736103c40] [1b00] 0x1b00(unreliable)
 [437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0
 [437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58
 
 On ppc64 we use the pgtable for storing the hpte slot information and
 store address to the pgtable at a constant offset (PTRS_PER_PMD) from
 pmd. On mremap, when we switch the pmd, we need to withdraw and deposit
 the pgtable again, so that we find the pgtable at PTRS_PER_PMD offset
 from new pmd.
 
 We also want to move the withdraw and deposit before the set_pmd so
 that, when page fault find the pmd as trans huge we can be sure that
 pgtable can be located at the offset.
 
 variant of upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f
 for 3.12 stable series

 This doesn't look like a variant, it looks totally different.  Why
 can't I just take the b3084f4db3aeb991c507ca774337c7e7893ed04f patch
 (and follow-on fix) for 3.12?

Because the code in that function changed in 3.13. Kirill added split
ptl locks for huge pte, and we decide whether to withdraw and
deposit again based on the ptl locks in 3.13. In 3.12 we do that only
for ppc64 using #ifdef



 I _REALLY_ dislike patches that are totally different from Linus's tree
 in stable trees, it has caused nothing but problems in the past.


-aneesh

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH V2] powerpc: thp: Fix crash on mremap

2014-02-11 Thread Aneesh Kumar K.V
Benjamin Herrenschmidt b...@kernel.crashing.org writes:

 On Tue, 2014-02-11 at 09:31 -0800, Greg KH wrote:
 On Fri, Feb 07, 2014 at 07:21:57PM +0530, Aneesh Kumar K.V wrote:
  From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
  
  This patch fix the below crash
  
  NIP [c004cee4] .__hash_page_thp+0x2a4/0x440
  LR [c00439ac] .hash_page+0x18c/0x5e0
  ...
  Call Trace:
  [c00736103c40] [1b00] 0x1b00(unreliable)
  [437908.479693] [c00736103d50] [c00439ac] 
  .hash_page+0x18c/0x5e0
  [437908.479699] [c00736103e30] [c000924c] 
  .do_hash_page+0x4c/0x58
  
  On ppc64 we use the pgtable for storing the hpte slot information and
  store address to the pgtable at a constant offset (PTRS_PER_PMD) from
  pmd. On mremap, when we switch the pmd, we need to withdraw and deposit
  the pgtable again, so that we find the pgtable at PTRS_PER_PMD offset
  from new pmd.
  
  We also want to move the withdraw and deposit before the set_pmd so
  that, when page fault find the pmd as trans huge we can be sure that
  pgtable can be located at the offset.
  
  variant of upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f
  for 3.12 stable series
 
 This doesn't look like a variant, it looks totally different.  Why
 can't I just take the b3084f4db3aeb991c507ca774337c7e7893ed04f patch
 (and follow-on fix) for 3.12?
 
 I _REALLY_ dislike patches that are totally different from Linus's tree
 in stable trees, it has caused nothing but problems in the past.

 I don't think it applies... (I tried on an internal tree) but the
 affected function changed in 3.13 in various ways. Aneesh, please
 provide a more detailed explanation and whether we should backport those
 other changes too or whether this is not necessary

Yes, the affected function gained support for split ptl locks for huge
ptes. I don't think that is stable material.

.

 BTW. Aneesh, we need a 3.11.x one too


3.11.x it is already applied.

-aneesh

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH V2 1/3] powerpc: mm: Add new set flag argument to pte/pmd update function

2014-02-11 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

We will use this later to set the _PAGE_NUMA bit.

Acked-by: Mel Gorman mgor...@suse.de
Acked-by: Rik van Riel r...@redhat.com
Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/hugetlb.h   |  2 +-
 arch/powerpc/include/asm/pgtable-ppc64.h | 26 +++---
 arch/powerpc/mm/pgtable_64.c | 12 +++-
 arch/powerpc/mm/subpage-prot.c   |  2 +-
 4 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/hugetlb.h 
b/arch/powerpc/include/asm/hugetlb.h
index d750336b171d..623f2971ce0e 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -127,7 +127,7 @@ static inline pte_t huge_ptep_get_and_clear(struct 
mm_struct *mm,
unsigned long addr, pte_t *ptep)
 {
 #ifdef CONFIG_PPC64
-   return __pte(pte_update(mm, addr, ptep, ~0UL, 1));
+   return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1));
 #else
return __pte(pte_update(ptep, ~0UL, 0));
 #endif
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
b/arch/powerpc/include/asm/pgtable-ppc64.h
index bc141c950b1e..eb9261024f51 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -195,6 +195,7 @@ extern void hpte_need_flush(struct mm_struct *mm, unsigned 
long addr,
 static inline unsigned long pte_update(struct mm_struct *mm,
   unsigned long addr,
   pte_t *ptep, unsigned long clr,
+  unsigned long set,
   int huge)
 {
 #ifdef PTE_ATOMIC_UPDATES
@@ -205,14 +206,15 @@ static inline unsigned long pte_update(struct mm_struct 
*mm,
andi.   %1,%0,%6\n\
bne-1b \n\
andc%1,%0,%4 \n\
+   or  %1,%1,%7\n\
stdcx.  %1,0,%3 \n\
bne-1b
: =r (old), =r (tmp), =m (*ptep)
-   : r (ptep), r (clr), m (*ptep), i (_PAGE_BUSY)
+   : r (ptep), r (clr), m (*ptep), i (_PAGE_BUSY), r (set)
: cc );
 #else
unsigned long old = pte_val(*ptep);
-   *ptep = __pte(old & ~clr);
+   *ptep = __pte((old & ~clr) | set);
 #endif
/* huge pages use the old page table lock */
if (!huge)
@@ -231,9 +233,9 @@ static inline int __ptep_test_and_clear_young(struct 
mm_struct *mm,
 {
unsigned long old;
 
-   if ((pte_val(*ptep)  (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
+   if ((pte_val(*ptep)  (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
return 0;
-   old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0);
+   old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0);
return (old  _PAGE_ACCESSED) != 0;
 }
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
@@ -252,7 +254,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, 
unsigned long addr,
if ((pte_val(*ptep)  _PAGE_RW) == 0)
return;
 
-   pte_update(mm, addr, ptep, _PAGE_RW, 0);
+   pte_update(mm, addr, ptep, _PAGE_RW, 0, 0);
 }
 
 static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
@@ -261,7 +263,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct 
*mm,
if ((pte_val(*ptep)  _PAGE_RW) == 0)
return;
 
-   pte_update(mm, addr, ptep, _PAGE_RW, 1);
+   pte_update(mm, addr, ptep, _PAGE_RW, 0, 1);
 }
 
 /*
@@ -284,14 +286,14 @@ static inline void huge_ptep_set_wrprotect(struct 
mm_struct *mm,
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
   unsigned long addr, pte_t *ptep)
 {
-   unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0);
+   unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0);
return __pte(old);
 }
 
 static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
 pte_t * ptep)
 {
-   pte_update(mm, addr, ptep, ~0UL, 0);
+   pte_update(mm, addr, ptep, ~0UL, 0, 0);
 }
 
 
@@ -506,7 +508,9 @@ extern int pmdp_set_access_flags(struct vm_area_struct *vma,
 
 extern unsigned long pmd_hugepage_update(struct mm_struct *mm,
 unsigned long addr,
-pmd_t *pmdp, unsigned long clr);
+pmd_t *pmdp,
+unsigned long clr,
+unsigned long set);
 
 static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
  unsigned long addr, pmd_t *pmdp)
@@ -515,7 +519,7 @@ static inline int __pmdp_test_and_clear_young(struct 
mm_struct *mm,
 
if ((pmd_val(*pmdp)  (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
return 0;
-   old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED);
+   old = 

[PATCH V2 0/3] powerpc: Fix random application crashes with NUMA_BALANCING enabled

2014-02-11 Thread Aneesh Kumar K.V
Hello,

This patch series fixes random application crashes observed on ppc64 with numa
balancing enabled. Without the patches we see crashes like

anacron[14551]: unhandled signal 11 at 0041 nip 3cfd54b4 lr 
3cfd5464 code 30001
anacron[14599]: unhandled signal 11 at 0041 nip 3efc54b4 lr 
3efc5464 code 30001

Changes from V1:
* Build fix for CONFIG_NUMA_BALANCING disabled

-aneesh

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH V2 2/3] mm: dirty accountable change only apply to non prot numa case

2014-02-11 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

So move it within the if loop

Acked-by: Mel Gorman mgor...@suse.de
Reviewed-by: Rik van Riel r...@redhat.com
Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 mm/mprotect.c | 21 +++--
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/mm/mprotect.c b/mm/mprotect.c
index 7332c1785744..33eab902f10e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -58,6 +58,13 @@ static unsigned long change_pte_range(struct vm_area_struct 
*vma, pmd_t *pmd,
if (pte_numa(ptent))
ptent = pte_mknonnuma(ptent);
ptent = pte_modify(ptent, newprot);
+   /*
+* Avoid taking write faults for pages we
+* know to be dirty.
+*/
+   if (dirty_accountable && pte_dirty(ptent))
+   ptent = pte_mkwrite(ptent);
+   ptep_modify_prot_commit(mm, addr, pte, ptent);
updated = true;
} else {
struct page *page;
@@ -72,22 +79,8 @@ static unsigned long change_pte_range(struct vm_area_struct 
*vma, pmd_t *pmd,
}
}
}
-
-   /*
-* Avoid taking write faults for pages we know to be
-* dirty.
-*/
-   if (dirty_accountable && pte_dirty(ptent)) {
-   ptent = pte_mkwrite(ptent);
-   updated = true;
-   }
-
if (updated)
pages++;
-
-   /* Only !prot_numa always clears the pte */
-   if (!prot_numa)
-   ptep_modify_prot_commit(mm, addr, pte, ptent);
} else if (IS_ENABLED(CONFIG_MIGRATION)  !pte_file(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
 
-- 
1.8.3.2

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH V2 3/3] mm: Use ptep/pmdp_set_numa for updating _PAGE_NUMA bit

2014-02-11 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

Archs like ppc64 don't do a tlb flush in the set_pte/pmd functions. ppc64
also doesn't implement flush_tlb_range. ppc64 requires the tlb flushing to
be batched within ptl locks. The reason to do that is to ensure that the
hash page table is in sync with the linux page table. We track the hpte
index in the linux pte, and if we clear it without flushing the hash and
drop the ptl lock, another cpu can update the pte and we can end up with a
double hash. We also want to keep set_pte_at simpler by not requiring it to
do a hash flush, for performance reasons. Hence we cannot use it while
updating the _PAGE_NUMA bit. Add new functions for marking pte/pmd numa.

Acked-by: Mel Gorman mgor...@suse.de
Reviewed-by: Rik van Riel r...@redhat.com
Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
Changes from V1:
 * Build fix for non numa balancing config
 
 arch/powerpc/include/asm/pgtable.h | 22 +
 include/asm-generic/pgtable.h  | 39 ++
 mm/huge_memory.c   |  9 ++---
 mm/mprotect.c  |  4 +---
 4 files changed, 64 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index f83b6f3e1b39..3ebb188c3ff5 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -75,12 +75,34 @@ static inline pte_t pte_mknuma(pte_t pte)
return pte;
 }
 
+#define ptep_set_numa ptep_set_numa
+static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
+pte_t *ptep)
+{
+   if ((pte_val(*ptep) & _PAGE_PRESENT) == 0)
+   VM_BUG_ON(1);
+
+   pte_update(mm, addr, ptep, _PAGE_PRESENT, _PAGE_NUMA, 0);
+   return;
+}
+
 #define pmd_numa pmd_numa
 static inline int pmd_numa(pmd_t pmd)
 {
return pte_numa(pmd_pte(pmd));
 }
 
+#define pmdp_set_numa pmdp_set_numa
+static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
+pmd_t *pmdp)
+{
+   if ((pmd_val(*pmdp) & _PAGE_PRESENT) == 0)
+   VM_BUG_ON(1);
+
+   pmd_hugepage_update(mm, addr, pmdp, _PAGE_PRESENT, _PAGE_NUMA);
+   return;
+}
+
 #define pmd_mknonnuma pmd_mknonnuma
 static inline pmd_t pmd_mknonnuma(pmd_t pmd)
 {
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 8e4f41d9af4d..34c7bdc06014 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -701,6 +701,18 @@ static inline pte_t pte_mknuma(pte_t pte)
 }
 #endif
 
+#ifndef ptep_set_numa
+static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
+pte_t *ptep)
+{
+   pte_t ptent = *ptep;
+
+   ptent = pte_mknuma(ptent);
+   set_pte_at(mm, addr, ptep, ptent);
+   return;
+}
+#endif
+
 #ifndef pmd_mknuma
 static inline pmd_t pmd_mknuma(pmd_t pmd)
 {
@@ -708,6 +720,18 @@ static inline pmd_t pmd_mknuma(pmd_t pmd)
return pmd_clear_flags(pmd, _PAGE_PRESENT);
 }
 #endif
+
+#ifndef pmdp_set_numa
+static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
+pmd_t *pmdp)
+{
+   pmd_t pmd = *pmdp;
+
+   pmd = pmd_mknuma(pmd);
+   set_pmd_at(mm, addr, pmdp, pmd);
+   return;
+}
+#endif
 #else
 extern int pte_numa(pte_t pte);
 extern int pmd_numa(pmd_t pmd);
@@ -715,6 +739,8 @@ extern pte_t pte_mknonnuma(pte_t pte);
 extern pmd_t pmd_mknonnuma(pmd_t pmd);
 extern pte_t pte_mknuma(pte_t pte);
 extern pmd_t pmd_mknuma(pmd_t pmd);
+extern void ptep_set_numa(struct mm_struct *mm, unsigned long addr, pte_t 
*ptep);
+extern void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, pmd_t 
*pmdp);
 #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
 #else
 static inline int pmd_numa(pmd_t pmd)
@@ -742,10 +768,23 @@ static inline pte_t pte_mknuma(pte_t pte)
return pte;
 }
 
+static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
+pte_t *ptep)
+{
+   return;
+}
+
+
 static inline pmd_t pmd_mknuma(pmd_t pmd)
 {
return pmd;
 }
+
+static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
+pmd_t *pmdp)
+{
+   return ;
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #endif /* CONFIG_MMU */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 82166bf974e1..da23eb96779f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1545,6 +1545,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t 
*pmd,
entry = pmd_mknonnuma(entry);
entry = pmd_modify(entry, newprot);
ret = HPAGE_PMD_NR;
+   set_pmd_at(mm, addr, pmd, entry);
BUG_ON(pmd_write(entry));
} else {
struct page *page = pmd_page(*pmd);
@@ -1557,16 

[git pull] Please pull powerpc.git merge branch

2014-02-11 Thread Benjamin Herrenschmidt
Hi Linus !

Here is some powerpc goodness for -rc2. Arguably -rc1 material more than
-rc2 but I was travelling (again !)

It's mostly bug fixes including regressions, but there are a couple of
new things that I decided to drop-in.

One is a straightforward patch from Michael to add a bunch of P8 cache
events to perf.

The other one is a patch by myself to add the direct DMA (iommu bypass)
for PCIe on Power8 for 64-bit capable devices. This has been around for
a while, I had lost track of it. However it's been in our internal
kernels we use for testing P8 already and it affects only P8 related
code. Since P8 is still unreleased the risk is pretty much nil at this
point.

Cheers,
Ben.

The following changes since commit b28a960c42fcd9cfc987441fa6d1c1a471f0f9ed:

  Linux 3.14-rc2 (2014-02-09 18:15:47 -0800)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc.git merge

for you to fetch changes up to cd15b048445d0a54f7147c35a86c5a16ef231554:

  powerpc/powernv: Add iommu DMA bypass support for IODA2 (2014-02-11 16:07:37 
+1100)


Anshuman Khandual (1):
  powerpc/perf: Configure BHRB filter before enabling PMU interrupts

Anton Blanchard (1):
  powerpc: Fix endian issues in kexec and crash dump code

Benjamin Herrenschmidt (1):
  powerpc/powernv: Add iommu DMA bypass support for IODA2

Kevin Hao (1):
  powerpc/ppc32: Fix the bug in the init of non-base exception stack for UP

Laurent Dufour (1):
  powerpc/relocate fix relocate processing in LE mode

Mahesh Salgaonkar (2):
  powerpc/pseries: Disable relocation on exception while going down during 
crash.
  powerpc: Fix kdump hang issue on p8 with relocation on exception enabled.

Michael Ellerman (5):
  powerpc/perf: Add Power8 cache & TLB events
  powerpc/pseries: Select ARCH_RANDOM on pseries
  powerpc/xmon: Don't loop forever in get_output_lock()
  powerpc/xmon: Fix timeout loop in get_output_lock()
  powerpc/xmon: Don't signal we've entered until we're finished printing

Nathan Fontenot (1):
  crypto/nx/nx-842: Fix handling of vmalloc addresses

Paul Gortmaker (1):
  powerpc: Fix build failure in sysdev/mpic.c for MPIC_WEIRD=y

Thadeu Lima de Souza Cascardo (1):
  powerpc/eeh: Drop taken reference to driver on eeh_rmv_device

 arch/powerpc/include/asm/dma-mapping.h|   1 +
 arch/powerpc/include/asm/iommu.h  |   1 +
 arch/powerpc/include/asm/sections.h   |  12 +++
 arch/powerpc/kernel/dma.c |  10 ++-
 arch/powerpc/kernel/eeh_driver.c  |   8 +-
 arch/powerpc/kernel/iommu.c   |  12 +++
 arch/powerpc/kernel/irq.c |   5 ++
 arch/powerpc/kernel/machine_kexec.c   |  14 ++-
 arch/powerpc/kernel/machine_kexec_64.c|   6 +-
 arch/powerpc/kernel/reloc_64.S|   4 +-
 arch/powerpc/kernel/setup_32.c|   5 ++
 arch/powerpc/mm/hash_utils_64.c   |  14 +++
 arch/powerpc/perf/core-book3s.c   |   5 +-
 arch/powerpc/perf/power8-pmu.c| 144 ++
 arch/powerpc/platforms/powernv/pci-ioda.c |  84 +
 arch/powerpc/platforms/powernv/pci.c  |  10 +++
 arch/powerpc/platforms/powernv/pci.h  |   6 +-
 arch/powerpc/platforms/powernv/powernv.h  |   8 ++
 arch/powerpc/platforms/powernv/setup.c|   9 ++
 arch/powerpc/platforms/pseries/Kconfig|   1 +
 arch/powerpc/platforms/pseries/setup.c|   3 +-
 arch/powerpc/sysdev/mpic.c|  38 
 arch/powerpc/xmon/xmon.c  |  24 +++--
 drivers/crypto/nx/nx-842.c|  29 +++---
 24 files changed, 398 insertions(+), 55 deletions(-)


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH] powerpc: Fix attempt to move .org backwards error

2014-02-11 Thread Stephen Rothwell
Hi all,

On Tue, 10 Dec 2013 10:26:10 +1100 Benjamin Herrenschmidt 
b...@kernel.crashing.org wrote:

 On Tue, 2013-12-10 at 10:10 +1100, Stephen Rothwell wrote:
  Reported-by: Stephen Rothwell s...@canb.auug.org.au
  Tested-by: Stephen Rothwell s...@canb.auug.org.au
  
  Works for me.  Thanks.  I will add this to linux-next today if Ben
  doesn't add it to his tree.
 
 I will but probably not soon enough for your cut today

As noted elsewhere, this did not completely fix the problem and I have
been still getting this error from my allyesconfig builds for some time:

arch/powerpc/kernel/exceptions-64s.S: Assembler messages:
arch/powerpc/kernel/exceptions-64s.S:1312: Error: attempt to move .org backwards

Could someone please fix this?
-- 
Cheers,
Stephen Rothwell s...@canb.auug.org.au


pgpb14He1UsDa.pgp
Description: PGP signature
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: Linux-3.14-rc2: Order of serial node compatibles in DTS files.

2014-02-11 Thread Kevin Hao
On Wed, Feb 12, 2014 at 10:21:58AM +1000, Stephen N Chivers wrote:
 But, the Interrupt Controller (MPIC)
 goes AWOL and it is down hill from there.
 
 The MPIC is specified in the DTS as:
 
 mpic: pic@4 {
 interrupt-controller;
 #address-cells = 0;
 #interrupt-cells = 2;
 reg = 0x4 0x4;
 compatible = chrp,open-pic;
 device_type = open-pic;
 big-endian;
 };
 
 The board support file has the standard mechanism for allocating
 the PIC:
 
 struct mpic *mpic;
 
 mpic = mpic_alloc(NULL, 0, 0, 0, 256,  OpenPIC  );
 BUG_ON(mpic == NULL);
 
 mpic_init(mpic);
 
 I checked for damage in applying the patch and it has applied
 correctly.

How about the following fix?

diff --git a/drivers/of/base.c b/drivers/of/base.c
index ff85450d5683..ca91984d3c4b 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -730,32 +730,40 @@ out:
 }
 EXPORT_SYMBOL(of_find_node_with_property);
 
+static int of_match_type_name(const struct device_node *node,
+   const struct of_device_id *m)
+{
+   int match = 1;
+
+   if (m-name[0])
+   match = node-name  !strcmp(m-name, node-name);
+
+   if (m-type[0])
+   match = node-type  !strcmp(m-type, node-type);
+
+   return match;
+}
+
 static
 const struct of_device_id *__of_match_node(const struct of_device_id *matches,
   const struct device_node *node)
 {
const char *cp;
int cplen, l;
+   const struct of_device_id *m;
+   int match;
 
if (!matches)
return NULL;
 
cp = __of_get_property(node, compatible, cplen);
do {
-   const struct of_device_id *m = matches;
+   m = matches;
 
/* Check against matches with current compatible string */
-   while (m-name[0] || m-type[0] || m-compatible[0]) {
-   int match = 1;
-   if (m-name[0])
-   match = node-name
-!strcmp(m-name, node-name);
-   if (m-type[0])
-   match = node-type
-!strcmp(m-type, node-type);
-   if (m-compatible[0])
-   match = cp
-!of_compat_cmp(m-compatible, cp,
+   while (m-compatible[0]) {
+   match = of_match_type_name(node, m);
+   match = cp  !of_compat_cmp(m-compatible, cp,
strlen(m-compatible));
if (match)
return m;
@@ -770,6 +778,15 @@ const struct of_device_id *__of_match_node(const struct 
of_device_id *matches,
}
} while (cp  (cplen  0));
 
+   /* Check against matches without compatible string */
+   m = matches;
+   while (!m-compatible[0]  (m-name[0] || m-type[0])) {
+   match = of_match_type_name(node, m);
+   if (match)
+   return m;
+   m++;
+   }
+
return NULL;
 }


Thanks,
Kevin
 
 Stephen Chivers,
 CSC Australia Pty. Ltd.
 
 ___
 Linuxppc-dev mailing list
 Linuxppc-dev@lists.ozlabs.org
 https://lists.ozlabs.org/listinfo/linuxppc-dev


pgp8VuWp0SQuQ.pgp
Description: PGP signature
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH] powerpc: set the correct ksp_limit on ppc32 when switching to irq stack

2014-02-11 Thread Kevin Hao
On Wed, Jan 22, 2014 at 08:48:48AM +1100, Benjamin Herrenschmidt wrote:
 It will be merged when I come back from vacation. It was too late for
 3.13 so I'll send it to Linus next week and will CC -stable.

Hi Ben,

Any reason why this is still not merged yet?

Thanks,
Kevin
 
 Cheers,
 Ben.
 
 


pgpgNwRn08JKN.pgp
Description: PGP signature
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 1/2] powerpc: Link VDSOs at 0x0

2014-02-11 Thread Anton Blanchard

perf is failing to resolve symbols in the VDSO. A while (1)
gettimeofday() loop shows:

93.99%  [vdso]  [.] 0x05e0
 3.12%  test[.] 0037.plt_call.gettimeofday@@GLIBC_2.18
 2.81%  test[.] main

The reason for this is that we are linking our VDSO shared libraries
at 1MB, which is a little weird. Even though this is uncommon, Alan
points out that it is valid and we should probably fix perf userspace.

Regardless, I can't see a reason why we are doing this. The code
is all position independent and we never rely on the VDSO ending
up at 1M (and we never place it there on 64bit tasks).

Changing our link address to 0x0 fixes perf VDSO symbol resolution:

73.18%  [vdso]  [.] 0x060c
12.39%  [vdso]  [.] __kernel_gettimeofday
 3.58%  test[.] 0037.plt_call.gettimeofday@@GLIBC_2.18
 2.94%  [vdso]  [.] __kernel_datapage_offset
 2.90%  test[.] main

We still have some local symbol resolution issues that will be
fixed in a subsequent patch.

Signed-off-by: Anton Blanchard an...@samba.org
---

diff --git a/arch/powerpc/include/asm/vdso.h b/arch/powerpc/include/asm/vdso.h
index 0d9cecd..c53f5f6 100644
--- a/arch/powerpc/include/asm/vdso.h
+++ b/arch/powerpc/include/asm/vdso.h
@@ -4,11 +4,11 @@
 #ifdef __KERNEL__
 
 /* Default link addresses for the vDSOs */
-#define VDSO32_LBASE   0x10
-#define VDSO64_LBASE   0x10
+#define VDSO32_LBASE   0x0
+#define VDSO64_LBASE   0x0
 
 /* Default map addresses for 32bit vDSO */
-#define VDSO32_MBASE   VDSO32_LBASE
+#define VDSO32_MBASE   0x10
 
 #define VDSO_VERSION_STRINGLINUX_2.6.15
 

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 2/2] powerpc: Use unstripped VDSO image for more accurate profiling data

2014-02-11 Thread Anton Blanchard

We are seeing a lot of hits in the VDSO that are not resolved by perf.
A while(1) gettimeofday() loop shows the issue:

27.64%  [vdso]  [.] 0x060c
22.57%  [vdso]  [.] 0x0628
16.88%  [vdso]  [.] 0x0610
12.39%  [vdso]  [.] __kernel_gettimeofday 
 6.09%  [vdso]  [.] 0x05f8
 3.58%  test[.] 0037.plt_call.gettimeofday@@GLIBC_2.18
 2.94%  [vdso]  [.] __kernel_datapage_offset  
 2.90%  test[.] main  

We are using a stripped VDSO image which means only symbols with
relocation info can be resolved. There isn't a lot of point to
stripping the VDSO, the debug info is only about 1kB:

4680 arch/powerpc/kernel/vdso64/vdso64.so
5815 arch/powerpc/kernel/vdso64/vdso64.so.dbg

By using the unstripped image, we can resolve all the symbols in the
VDSO and the perf profile data looks much better:

76.53%  [vdso]  [.] __do_get_tspec
12.20%  [vdso]  [.] __kernel_gettimeofday 
 5.05%  [vdso]  [.] __get_datapage
 3.20%  test[.] main  
 2.92%  test[.] 0037.plt_call.gettimeofday@@GLIBC_2.18

Signed-off-by: Anton Blanchard an...@samba.org
---

diff --git a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S 
b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S
index 6e8f507..37e2e13 100644
--- a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S
+++ b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S
@@ -7,7 +7,7 @@
.globl vdso32_start, vdso32_end
.balign PAGE_SIZE
 vdso32_start:
-   .incbin arch/powerpc/kernel/vdso32/vdso32.so
+   .incbin arch/powerpc/kernel/vdso32/vdso32.so.dbg
.balign PAGE_SIZE
 vdso32_end:
 
diff --git a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S 
b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S
index b8553d6..01e7799 100644
--- a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S
+++ b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S
@@ -7,7 +7,7 @@
.globl vdso64_start, vdso64_end
.balign PAGE_SIZE
 vdso64_start:
-   .incbin arch/powerpc/kernel/vdso64/vdso64.so
+   .incbin arch/powerpc/kernel/vdso64/vdso64.so.dbg
.balign PAGE_SIZE
 vdso64_end:
 

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 2/3] powerpc/eeh: Cleanup on eeh_subsystem_enabled

2014-02-11 Thread Gavin Shan
The patch cleans up the variable eeh_subsystem_enabled so that external
code needn't refer to the variable directly. Instead, we will use the
functions eeh_enabled() and eeh_set_enable() to read and update the variable.

Signed-off-by: Gavin Shan sha...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/eeh.h   |   21 +++--
 arch/powerpc/kernel/eeh.c|   12 ++--
 arch/powerpc/platforms/powernv/eeh-powernv.c |2 +-
 arch/powerpc/platforms/pseries/eeh_pseries.c |2 +-
 4 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 9e39ceb..d4dd41f 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -172,10 +172,20 @@ struct eeh_ops {
 };
 
 extern struct eeh_ops *eeh_ops;
-extern int eeh_subsystem_enabled;
+extern bool eeh_subsystem_enabled;
 extern raw_spinlock_t confirm_error_lock;
 extern int eeh_probe_mode;
 
+static inline bool eeh_enabled(void)
+{
+   return eeh_subsystem_enabled;
+}
+
+static inline void eeh_set_enable(bool mode)
+{
+   eeh_subsystem_enabled = mode;
+}
+
 #define EEH_PROBE_MODE_DEV (10)  /* From PCI device  */
 #define EEH_PROBE_MODE_DEVTREE (11)  /* From device tree */
 
@@ -246,7 +256,7 @@ void eeh_remove_device(struct pci_dev *);
  * If this macro yields TRUE, the caller relays to eeh_check_failure()
  * which does further tests out of line.
  */
-#define EEH_POSSIBLE_ERROR(val, type)  ((val) == (type)~0  
eeh_subsystem_enabled)
+#define EEH_POSSIBLE_ERROR(val, type)  ((val) == (type)~0  eeh_enabled())
 
 /*
  * Reads from a device which has been isolated by EEH will return
@@ -257,6 +267,13 @@ void eeh_remove_device(struct pci_dev *);
 
 #else /* !CONFIG_EEH */
 
+static inline bool eeh_enabled(void)
+{
+return false;
+}
+
+static inline void eeh_set_enable(bool mode) { }
+
 static inline int eeh_init(void)
 {
return 0;
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 148db72..f22f7b6 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -89,7 +89,7 @@
 /* Platform dependent EEH operations */
 struct eeh_ops *eeh_ops = NULL;
 
-int eeh_subsystem_enabled;
+bool eeh_subsystem_enabled = false;
 EXPORT_SYMBOL(eeh_subsystem_enabled);
 
 /*
@@ -364,7 +364,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 
eeh_stats.total_mmio_ffs++;
 
-   if (!eeh_subsystem_enabled)
+   if (!eeh_enabled())
return 0;
 
if (!edev) {
@@ -822,7 +822,7 @@ int eeh_init(void)
return ret;
}
 
-   if (eeh_subsystem_enabled)
+   if (eeh_enabled())
pr_info(EEH: PCI Enhanced I/O Error Handling Enabled\n);
else
pr_warning(EEH: No capable adapters found\n);
@@ -897,7 +897,7 @@ void eeh_add_device_late(struct pci_dev *dev)
struct device_node *dn;
struct eeh_dev *edev;
 
-   if (!dev || !eeh_subsystem_enabled)
+   if (!dev || !eeh_enabled())
return;
 
pr_debug(EEH: Adding device %s\n, pci_name(dev));
@@ -1005,7 +1005,7 @@ void eeh_remove_device(struct pci_dev *dev)
 {
struct eeh_dev *edev;
 
-   if (!dev || !eeh_subsystem_enabled)
+   if (!dev || !eeh_enabled())
return;
edev = pci_dev_to_eeh_dev(dev);
 
@@ -1045,7 +1045,7 @@ void eeh_remove_device(struct pci_dev *dev)
 
 static int proc_eeh_show(struct seq_file *m, void *v)
 {
-   if (0 == eeh_subsystem_enabled) {
+   if (!eeh_enabled()) {
seq_printf(m, EEH Subsystem is globally disabled\n);
seq_printf(m, eeh_total_mmio_ffs=%llu\n, 
eeh_stats.total_mmio_ffs);
} else {
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index a79fddc..a59788e 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -145,7 +145,7 @@ static int powernv_eeh_dev_probe(struct pci_dev *dev, void 
*flag)
 * Enable EEH explicitly so that we will do EEH check
 * while accessing I/O stuff
 */
-   eeh_subsystem_enabled = 1;
+   eeh_set_enable(true);
 
/* Save memory bars */
eeh_save_bars(edev);
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c 
b/arch/powerpc/platforms/pseries/eeh_pseries.c
index 9ef3cc8..8a8f047 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -265,7 +265,7 @@ static void *pseries_eeh_of_probe(struct device_node *dn, 
void *flag)
enable = 1;
 
if (enable) {
-   eeh_subsystem_enabled = 1;
+   eeh_set_enable(true);
eeh_add_to_parent_pe(edev);
 
pr_debug(%s: EEH enabled on %s PHB#%d-PE#%x, config 
addr#%x\n,
-- 
1.7.10.4


[PATCH 1/3] powerpc/powernv: Rework EEH reset

2014-02-11 Thread Gavin Shan
When doing a reset in order to recover the affected PE, we issue a
hot reset on the PE's primary bus if it's not the root bus. Otherwise,
we issue a hot or fundamental reset on the root port or PHB accordingly.
For the latter case, we didn't cover the situation where the PE only
includes the root port, which potentially causes a kernel crash upon
an EEH error on the PE.

The patch reworks the logic of EEH reset to improve the code
readability and also avoid the kernel crash.

Cc: sta...@vger.kernel.org
Reported-by: Thadeu Lima de Souza Cascardo casca...@linux.vnet.ibm.com
Signed-off-by: Gavin Shan sha...@linux.vnet.ibm.com
---
 arch/powerpc/platforms/powernv/eeh-ioda.c |   29 -
 1 file changed, 4 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c 
b/arch/powerpc/platforms/powernv/eeh-ioda.c
index e1e7161..fcb79cf 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -489,8 +489,7 @@ static int ioda_eeh_bridge_reset(struct pci_controller 
*hose,
 static int ioda_eeh_reset(struct eeh_pe *pe, int option)
 {
struct pci_controller *hose = pe-phb;
-   struct eeh_dev *edev;
-   struct pci_dev *dev;
+   struct pci_bus *bus;
int ret;
 
/*
@@ -519,31 +518,11 @@ static int ioda_eeh_reset(struct eeh_pe *pe, int option)
if (pe-type  EEH_PE_PHB) {
ret = ioda_eeh_phb_reset(hose, option);
} else {
-   if (pe-type  EEH_PE_DEVICE) {
-   /*
-* If it's device PE, we didn't refer to the parent
-* PCI bus yet. So we have to figure it out indirectly.
-*/
-   edev = list_first_entry(pe-edevs,
-   struct eeh_dev, list);
-   dev = eeh_dev_to_pci_dev(edev);
-   dev = dev-bus-self;
-   } else {
-   /*
-* If it's bus PE, the parent PCI bus is already there
-* and just pick it up.
-*/
-   dev = pe-bus-self;
-   }
-
-   /*
-* Do reset based on the fact that the direct upstream bridge
-* is root bridge (port) or not.
-*/
-   if (dev-bus-number == 0)
+   bus = eeh_pe_bus_get(pe);
+   if (pci_is_root_bus(bus))
ret = ioda_eeh_root_reset(hose, option);
else
-   ret = ioda_eeh_bridge_reset(hose, dev, option);
+   ret = ioda_eeh_bridge_reset(hose, bus-self, option);
}
 
return ret;
-- 
1.7.10.4

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 3/3] powerpc/eeh: Disable EEH on reboot

2014-02-11 Thread Gavin Shan
We may detect EEH errors during reboot, particularly in the kexec
path, but it's impossible for device drivers and the EEH core to handle
or recover from them properly.

The patch registers a reboot notifier for EEH and disables the EEH
subsystem during reboot. That means the EEH errors are going to be
cleared by a hardware reset or by the second kernel during the early
stage of PCI probe.

Signed-off-by: Gavin Shan sha...@linux.vnet.ibm.com
---
 arch/powerpc/kernel/eeh.c |   20 
 arch/powerpc/platforms/powernv/eeh-ioda.c |3 ++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index f22f7b6..e7b76a6 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -28,6 +28,7 @@
 #include linux/pci.h
 #include linux/proc_fs.h
 #include linux/rbtree.h
+#include linux/reboot.h
 #include linux/seq_file.h
 #include linux/spinlock.h
 #include linux/export.h
@@ -747,6 +748,17 @@ int __exit eeh_ops_unregister(const char *name)
return -EEXIST;
 }
 
+static int eeh_reboot_notifier(struct notifier_block *nb,
+  unsigned long action, void *unused)
+{
+   eeh_set_enable(false);
+   return NOTIFY_DONE;
+}
+
+static struct notifier_block eeh_reboot_nb = {
+   .notifier_call = eeh_reboot_notifier,
+};
+
 /**
  * eeh_init - EEH initialization
  *
@@ -778,6 +790,14 @@ int eeh_init(void)
if (machine_is(powernv)  cnt++ = 0)
return ret;
 
+   /* Register reboot notifier */
+   ret = register_reboot_notifier(eeh_reboot_nb);
+   if (ret) {
+   pr_warn(%s: Failed to register notifier (%d)\n,
+   __func__, ret);
+   return ret;
+   }
+
/* call platform initialization function */
if (!eeh_ops) {
pr_warning(%s: Platform EEH operation not found\n,
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c 
b/arch/powerpc/platforms/powernv/eeh-ioda.c
index fcb79cf..f514743 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -44,7 +44,8 @@ static int ioda_eeh_event(struct notifier_block *nb,
 
/* We simply send special EEH event */
if ((changed_evts  OPAL_EVENT_PCI_ERROR) 
-   (events  OPAL_EVENT_PCI_ERROR))
+   (events  OPAL_EVENT_PCI_ERROR) 
+   eeh_enabled())
eeh_send_failure_event(NULL);
 
return 0;
-- 
1.7.10.4

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev