Re: [RFC][PATCH] Update ppc disassembly in xmon

2016-11-23 Thread Balbir Singh


On 24/11/16 16:14, Andrew Donnellan wrote:
> On 24/11/16 13:05, Balbir Singh wrote:
>> 9. The license for these files is now GPL v3 or later
> 
> As much as I love the GPLv3, isn't this an instant NAK?
> 

That's why I called it out. My bad though, I should have done
a stronger check than just 'git grep "version 3"', which matched
scripts/kconfig/zconf.tab.c_shipped -- derived from bison, which
can itself be licensed differently.

Michael, please hold these patches and don't merge them yet.

Balbir Singh.


[PATCH kernel v6 6/7] vfio/spapr: Reference mm in tce_container

2016-11-23 Thread Alexey Kardashevskiy
In some situations the userspace memory context may live longer than
the userspace process itself, so if we need to do proper memory context
cleanup, we better have tce_container take a reference to mm_struct and
use it later when the process is gone (@current or @current->mm is NULL).

This references mm and stores the pointer in the container; this is done
in a new helper - tce_iommu_mm_set() - when one of the following happens:
- a container is enabled (IOMMU v1);
- a first attempt to pre-register memory is made (IOMMU v2);
- a DMA window is created (IOMMU v2).
The @mm stays referenced till the container is destroyed.

This replaces current->mm with container->mm everywhere except debug
prints.

This adds a check that current->mm is the same as the one stored in
the container to prevent userspace from making changes to a memory
context of other processes.

DMA map/unmap ioctls() do not check for @mm as they already check
for @enabled which is set after tce_iommu_mm_set() is called.

This does not reference a task as multiple threads within the same mm
are allowed to ioctl() to vfio; supposedly they will have the same limits
and capabilities, and if they do not, we'll just fail with no harm done.

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v6:
* updated the commit log about not referencing task

v5:
* postpone referencing of mm

v4:
* added check for container->mm!=current->mm in tce_iommu_ioctl()
for all ioctls and removed other redundant checks
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 159 ++--
 1 file changed, 99 insertions(+), 60 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 88622be..b2fb05ac 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -31,49 +31,49 @@
 static void tce_iommu_detach_group(void *iommu_data,
struct iommu_group *iommu_group);
 
-static long try_increment_locked_vm(long npages)
+static long try_increment_locked_vm(struct mm_struct *mm, long npages)
 {
long ret = 0, locked, lock_limit;
 
-   if (!current || !current->mm)
-   return -ESRCH; /* process exited */
+   if (!mm)
+   return -EPERM;
 
if (!npages)
return 0;
 
-   down_write(&current->mm->mmap_sem);
-   locked = current->mm->locked_vm + npages;
+   down_write(&mm->mmap_sem);
+   locked = mm->locked_vm + npages;
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
ret = -ENOMEM;
else
-   current->mm->locked_vm += npages;
+   mm->locked_vm += npages;
 
pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
npages << PAGE_SHIFT,
-   current->mm->locked_vm << PAGE_SHIFT,
+   mm->locked_vm << PAGE_SHIFT,
rlimit(RLIMIT_MEMLOCK),
ret ? " - exceeded" : "");
 
-   up_write(&current->mm->mmap_sem);
+   up_write(&mm->mmap_sem);
 
return ret;
 }
 
-static void decrement_locked_vm(long npages)
+static void decrement_locked_vm(struct mm_struct *mm, long npages)
 {
-   if (!current || !current->mm || !npages)
-   return; /* process exited */
+   if (!mm || !npages)
+   return;
 
-   down_write(&current->mm->mmap_sem);
-   if (WARN_ON_ONCE(npages > current->mm->locked_vm))
-   npages = current->mm->locked_vm;
-   current->mm->locked_vm -= npages;
+   down_write(&mm->mmap_sem);
+   if (WARN_ON_ONCE(npages > mm->locked_vm))
+   npages = mm->locked_vm;
+   mm->locked_vm -= npages;
pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
npages << PAGE_SHIFT,
-   current->mm->locked_vm << PAGE_SHIFT,
+   mm->locked_vm << PAGE_SHIFT,
rlimit(RLIMIT_MEMLOCK));
-   up_write(&current->mm->mmap_sem);
+   up_write(&mm->mmap_sem);
 }
 
 /*
@@ -99,26 +99,38 @@ struct tce_container {
bool v2;
bool def_window_pending;
unsigned long locked_pages;
+   struct mm_struct *mm;
struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
struct list_head group_list;
 };
 
+static long tce_iommu_mm_set(struct tce_container *container)
+{
+   if (container->mm) {
+   if (container->mm == current->mm)
+   return 0;
+   return -EPERM;
+   }
+   BUG_ON(!current->mm);
+   container->mm = current->mm;
+   atomic_inc(&container->mm->mm_count);
+
+   return 0;
+}
+
 static long tce_iommu_unregister_pages(struct tce_container *container,
__u64 vaddr, __u64 size)
 {
struct mm_iommu_table_group_mem_t *mem;
 
-   if (!current || !current->mm)
-   return -ESRCH; /* process exited */
-
if ((vaddr & 
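
The diff is truncated here. The release side of the mm reference taken in
tce_iommu_mm_set() is not visible above; a minimal sketch of the matching
teardown, using the names from this patch (not a verbatim copy of the
posted or merged code):

static void tce_iommu_release(void *iommu_data)
{
	struct tce_container *container = iommu_data;

	/* ... detach the remaining groups and free the tables ... */

	/*
	 * Pair with atomic_inc(&container->mm->mm_count) in
	 * tce_iommu_mm_set(): drop the mm_struct reference last.
	 */
	if (container->mm)
		mmdrop(container->mm);
	kfree(container);
}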

[PATCH kernel v6 4/7] vfio/spapr: Add a helper to create default DMA window

2016-11-23 Thread Alexey Kardashevskiy
There is already a helper to create a DMA window which allocates
a table and programs it into the IOMMU group. However
tce_iommu_take_ownership_ddw() did not use it and did these 2 calls
itself to simplify the error path.

Since we are going to delay the default window creation till
the default window is accessed/removed or a new window is added,
we need a helper to create a default window that can be called from
all these cases.

This adds tce_iommu_create_default_window(). Since it relies on
a VFIO container to have at least one IOMMU group (for future use),
this changes tce_iommu_attach_group() to add a group to the container
first and then call the new helper.

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v6:
* new to the patchset
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 87 ++---
 1 file changed, 42 insertions(+), 45 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 4efd2b2..a67bbfd 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -710,6 +710,29 @@ static long tce_iommu_remove_window(struct tce_container 
*container,
return 0;
 }
 
+static long tce_iommu_create_default_window(struct tce_container *container)
+{
+   long ret;
+   __u64 start_addr = 0;
+   struct tce_iommu_group *tcegrp;
+   struct iommu_table_group *table_group;
+
+   if (!tce_groups_attached(container))
+   return -ENODEV;
+
+   tcegrp = list_first_entry(&container->group_list,
+   struct tce_iommu_group, next);
+   table_group = iommu_group_get_iommudata(tcegrp->grp);
+   if (!table_group)
+   return -ENODEV;
+
+   ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
+   table_group->tce32_size, 1, &start_addr);
+   WARN_ON_ONCE(!ret && start_addr);
+
+   return ret;
+}
+
 static long tce_iommu_ioctl(void *iommu_data,
 unsigned int cmd, unsigned long arg)
 {
@@ -1100,9 +1123,6 @@ static void tce_iommu_release_ownership_ddw(struct 
tce_container *container,
 static long tce_iommu_take_ownership_ddw(struct tce_container *container,
struct iommu_table_group *table_group)
 {
-   long i, ret = 0;
-   struct iommu_table *tbl = NULL;
-
if (!table_group->ops->create_table || !table_group->ops->set_window ||
!table_group->ops->release_ownership) {
WARN_ON_ONCE(1);
@@ -1111,47 +1131,7 @@ static long tce_iommu_take_ownership_ddw(struct 
tce_container *container,
 
table_group->ops->take_ownership(table_group);
 
-   /*
-* If it the first group attached, check if there is
-* a default DMA window and create one if none as
-* the userspace expects it to exist.
-*/
-   if (!tce_groups_attached(container) && !container->tables[0]) {
-   ret = tce_iommu_create_table(container,
-   table_group,
-   0, /* window number */
-   IOMMU_PAGE_SHIFT_4K,
-   table_group->tce32_size,
-   1, /* default levels */
-   &tbl);
-   if (ret)
-   goto release_exit;
-   else
-   container->tables[0] = tbl;
-   }
-
-   /* Set all windows to the new group */
-   for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
-   tbl = container->tables[i];
-
-   if (!tbl)
-   continue;
-
-   /* Set the default window to a new group */
-   ret = table_group->ops->set_window(table_group, i, tbl);
-   if (ret)
-   goto release_exit;
-   }
-
return 0;
-
-release_exit:
-   for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
-   table_group->ops->unset_window(table_group, i);
-
-   table_group->ops->release_ownership(table_group);
-
-   return ret;
 }
 
 static int tce_iommu_attach_group(void *iommu_data,
@@ -1161,6 +1141,7 @@ static int tce_iommu_attach_group(void *iommu_data,
struct tce_container *container = iommu_data;
struct iommu_table_group *table_group;
struct tce_iommu_group *tcegrp = NULL;
+   bool create_default_window = false;
 
mutex_lock(&container->lock);
 
@@ -1203,14 +1184,30 @@ static int tce_iommu_attach_group(void *iommu_data,
}
 
if (!table_group->ops || !table_group->ops->take_ownership ||
-   !table_group->ops->release_ownership)
+   !table_group->ops->release_ownership) {
ret = tce_iommu_take_ownership(container, table_group);
-   else
+   } else {
ret = tce_iommu_take_ownership_ddw(container, table_group);
+   if (!tce_groups_attached(container) && 

[PATCH kernel v6 7/7] powerpc/mm/iommu, vfio/spapr: Put pages on VFIO container shutdown

2016-11-23 Thread Alexey Kardashevskiy
At the moment the userspace tool is expected to request pinning of
the entire guest RAM when the VFIO IOMMU SPAPR v2 driver is present.
When the userspace process finishes, all the pinned pages need to
be put; this is done as a part of the userspace memory context (MM)
destruction which happens on the very last mmdrop().

This approach has a problem: an MM of the userspace process
may live longer than the userspace process itself, as kernel threads
reuse the MM of the userspace process which was running on the CPU
where the kernel thread got scheduled. If this happens, the MM remains
referenced until that exact kernel thread wakes up again
and releases the very last reference to the MM; on an idle system this
can take hours.

This moves preregistered regions tracking from MM to VFIO; instead of
using mm_iommu_table_group_mem_t::used, tce_container::prereg_list is
added so each container releases regions which it has pre-registered.

This changes the userspace interface to return EBUSY if a memory
region is already registered in a container. However it should not
have any practical effect as the only userspace tool available now
registers a memory region just once per container anyway.

As tce_iommu_register_pages/tce_iommu_unregister_pages are called
under container->lock, this does not need additional locking.

Signed-off-by: Alexey Kardashevskiy 
Reviewed-by: Nicholas Piggin 
---
Changes:
v4:
* changed tce_iommu_register_pages() to call mm_iommu_find() first and
avoid calling mm_iommu_put() if memory is preregistered already

v3:
* moved tce_iommu_prereg_free() call out of list_for_each_entry()

v2:
* updated commit log
---
 arch/powerpc/mm/mmu_context_book3s64.c |  4 ---
 arch/powerpc/mm/mmu_context_iommu.c| 11 ---
 drivers/vfio/vfio_iommu_spapr_tce.c| 58 +-
 3 files changed, 57 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/mm/mmu_context_book3s64.c 
b/arch/powerpc/mm/mmu_context_book3s64.c
index ad82735..1a07969 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -159,10 +159,6 @@ static inline void destroy_pagetable_page(struct mm_struct 
*mm)
 
 void destroy_context(struct mm_struct *mm)
 {
-#ifdef CONFIG_SPAPR_TCE_IOMMU
-   mm_iommu_cleanup(mm);
-#endif
-
 #ifdef CONFIG_PPC_ICSWX
drop_cop(mm->context.acop, mm);
kfree(mm->context.cop_lockp);
diff --git a/arch/powerpc/mm/mmu_context_iommu.c 
b/arch/powerpc/mm/mmu_context_iommu.c
index 4c6db09..104bad0 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -365,14 +365,3 @@ void mm_iommu_init(struct mm_struct *mm)
 {
INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list);
 }
-
-void mm_iommu_cleanup(struct mm_struct *mm)
-{
-   struct mm_iommu_table_group_mem_t *mem, *tmp;
-
-   list_for_each_entry_safe(mem, tmp, &mm->context.iommu_group_mem_list,
-   next) {
-   list_del_rcu(&mem->next);
-   mm_iommu_do_free(mem);
-   }
-}
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index b2fb05ac..86c9348 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -89,6 +89,15 @@ struct tce_iommu_group {
 };
 
 /*
+ * A container needs to remember which preregistered regions it has
+ * referenced to do proper cleanup at the userspace process exit.
+ */
+struct tce_iommu_prereg {
+   struct list_head next;
+   struct mm_iommu_table_group_mem_t *mem;
+};
+
+/*
  * The container descriptor supports only a single group per container.
  * Required by the API as the container is not supplied with the IOMMU group
  * at the moment of initialization.
@@ -102,6 +111,7 @@ struct tce_container {
struct mm_struct *mm;
struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
struct list_head group_list;
+   struct list_head prereg_list;
 };
 
 static long tce_iommu_mm_set(struct tce_container *container)
@@ -118,10 +128,24 @@ static long tce_iommu_mm_set(struct tce_container 
*container)
return 0;
 }
 
+static long tce_iommu_prereg_free(struct tce_container *container,
+   struct tce_iommu_prereg *tcemem)
+{
+   long ret;
+
+   list_del(&tcemem->next);
+   ret = mm_iommu_put(container->mm, tcemem->mem);
+   kfree(tcemem);
+
+   return ret;
+}
+
 static long tce_iommu_unregister_pages(struct tce_container *container,
__u64 vaddr, __u64 size)
 {
struct mm_iommu_table_group_mem_t *mem;
+   struct tce_iommu_prereg *tcemem;
+   bool found = false;
 
if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
return -EINVAL;
@@ -130,7 +154,17 @@ static long tce_iommu_unregister_pages(struct 
tce_container *container,
if (!mem)
return -ENOENT;
 
-   return mm_iommu_put(container->mm, mem);
+   list_for_each_entry(tcemem, 
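
The hunk is cut off here. Based on the commit log (each container now tracks
its own tce_iommu_prereg entries), the rest of the unregistration path looks
roughly like this sketch, not a verbatim copy of the posted patch:

	list_for_each_entry(tcemem, &container->prereg_list, next) {
		if (tcemem->mem == mem) {
			found = true;
			break;
		}
	}

	if (!found)
		return -ENOENT;

	/* Drops the region reference this container took at registration */
	return tce_iommu_prereg_free(container, tcemem);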

[PATCH kernel v6 5/7] vfio/spapr: Postpone default window creation

2016-11-23 Thread Alexey Kardashevskiy
We are going to allow the userspace to configure a container in
one memory context and pass the container fd to another, so
we are postponing memory allocations accounted against
the locked memory limit. One of the previous patches took care of
it_userspace.

At the moment we create the default DMA window when the first group is
attached to a container; this is done for the userspace which is not
DDW-aware but familiar with the SPAPR TCE IOMMU v2 in the part of memory
pre-registration - such a client expects the default DMA window to exist.

This postpones the default DMA window allocation till one of
the following happens:
1. the first map/unmap request arrives;
2. a new window is requested.
This adds a no-op for the case when the userspace requests removal
of the default window which has not been created yet.

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v6:
* new helper tce_iommu_create_default_window() moved to a separate patch;
* creates a default window when a new window is requested; it used to
reset the def_window_pending flag instead;
* def_window_pending handling is (mostly) localized in
tce_iommu_create_default_window() now; the only exception is removal
of a not yet created default window.
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 40 +++--
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index a67bbfd..88622be 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -97,6 +97,7 @@ struct tce_container {
struct mutex lock;
bool enabled;
bool v2;
+   bool def_window_pending;
unsigned long locked_pages;
struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
struct list_head group_list;
@@ -717,6 +718,9 @@ static long tce_iommu_create_default_window(struct 
tce_container *container)
struct tce_iommu_group *tcegrp;
struct iommu_table_group *table_group;
 
+   if (!container->def_window_pending)
+   return 0;
+
if (!tce_groups_attached(container))
return -ENODEV;
 
@@ -730,6 +734,9 @@ static long tce_iommu_create_default_window(struct 
tce_container *container)
table_group->tce32_size, 1, &start_addr);
WARN_ON_ONCE(!ret && start_addr);
 
+   if (!ret)
+   container->def_window_pending = false;
+
return ret;
 }
 
@@ -823,6 +830,10 @@ static long tce_iommu_ioctl(void *iommu_data,
VFIO_DMA_MAP_FLAG_WRITE))
return -EINVAL;
 
+   ret = tce_iommu_create_default_window(container);
+   if (ret)
+   return ret;
+
num = tce_iommu_find_table(container, param.iova, &tbl);
if (num < 0)
return -ENXIO;
@@ -886,6 +897,10 @@ static long tce_iommu_ioctl(void *iommu_data,
if (param.flags)
return -EINVAL;
 
+   ret = tce_iommu_create_default_window(container);
+   if (ret)
+   return ret;
+
num = tce_iommu_find_table(container, param.iova, &tbl);
if (num < 0)
return -ENXIO;
@@ -1012,6 +1027,10 @@ static long tce_iommu_ioctl(void *iommu_data,
 
mutex_lock(&container->lock);
 
+   ret = tce_iommu_create_default_window(container);
+   if (ret)
+   return ret;
+
ret = tce_iommu_create_window(container, create.page_shift,
create.window_size, create.levels,
&start_addr);
@@ -1044,6 +1063,11 @@ static long tce_iommu_ioctl(void *iommu_data,
if (remove.flags)
return -EINVAL;
 
+   if (container->def_window_pending && !remove.start_addr) {
+   container->def_window_pending = false;
+   return 0;
+   }
+
mutex_lock(&container->lock);
 
ret = tce_iommu_remove_window(container, remove.start_addr);
@@ -1141,7 +1165,6 @@ static int tce_iommu_attach_group(void *iommu_data,
struct tce_container *container = iommu_data;
struct iommu_table_group *table_group;
struct tce_iommu_group *tcegrp = NULL;
-   bool create_default_window = false;
 
mutex_lock(&container->lock);
 
@@ -1189,25 +1212,12 @@ static int tce_iommu_attach_group(void *iommu_data,
} else {
ret = tce_iommu_take_ownership_ddw(container, table_group);
if (!tce_groups_attached(container) && !container->tables[0])
-   create_default_window = true;
+   container->def_window_pending = true;
}
 
if (!ret) {
tcegrp->grp = iommu_group;
list_add(&tcegrp->next, &container->group_list);
-   /*
-   

[PATCH kernel v6 3/7] vfio/spapr: Postpone allocation of userspace version of TCE table

2016-11-23 Thread Alexey Kardashevskiy
The iommu_table struct manages a hardware TCE table and a vmalloc'd
table with corresponding userspace addresses. Both are allocated when
the default DMA window is created and this happens when the very first
group is attached to a container.

As we are going to allow the userspace to configure a container in one
memory context and pass the container fd to another, we have to postpone
such allocations till the container fd is passed to the destination
user process so we would account the locked memory limit against the
actual container user's constraints.

This postpones the it_userspace array allocation till it is first used
for mapping. The unmapping path already checks if the array is
allocated.

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v6:
* moved missing hunk from the next patch: tce_iommu_create_table()
would decrement locked_vm while the new caller - tce_iommu_build_v2() -
will not; this adds a new return code to the DMA mapping path but
this seems to be a minor change.
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 20 +++-
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index d0c38b2..4efd2b2 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -515,6 +515,12 @@ static long tce_iommu_build_v2(struct tce_container 
*container,
unsigned long hpa;
enum dma_data_direction dirtmp;
 
+   if (!tbl->it_userspace) {
+   ret = tce_iommu_userspace_view_alloc(tbl);
+   if (ret)
+   return ret;
+   }
+
for (i = 0; i < pages; ++i) {
struct mm_iommu_table_group_mem_t *mem = NULL;
unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl,
@@ -588,15 +594,6 @@ static long tce_iommu_create_table(struct tce_container 
*container,
WARN_ON(!ret && !(*ptbl)->it_ops->free);
WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size));
 
-   if (!ret && container->v2) {
-   ret = tce_iommu_userspace_view_alloc(*ptbl);
-   if (ret)
-   (*ptbl)->it_ops->free(*ptbl);
-   }
-
-   if (ret)
-   decrement_locked_vm(table_size >> PAGE_SHIFT);
-
return ret;
 }
 
@@ -1068,10 +1065,7 @@ static int tce_iommu_take_ownership(struct tce_container 
*container,
if (!tbl || !tbl->it_map)
continue;
 
-   rc = tce_iommu_userspace_view_alloc(tbl);
-   if (!rc)
-   rc = iommu_take_ownership(tbl);
-
+   rc = iommu_take_ownership(tbl);
if (rc) {
for (j = 0; j < i; ++j)
iommu_release_ownership(
-- 
2.5.0.rc3



[PATCH kernel v6 2/7] powerpc/iommu: Stop using @current in mm_iommu_xxx

2016-11-23 Thread Alexey Kardashevskiy
This changes the mm_iommu_xxx helpers to take mm_struct as a parameter
instead of getting it from @current, which in some situations may
not have a valid reference to mm.

This changes helpers to receive @mm and moves all references to @current
to the caller, including checks for !current and !current->mm;
checks in mm_iommu_preregistered() are removed as there is no caller
yet.

This moves the mm_iommu_adjust_locked_vm() call to the caller as
it receives mm_iommu_table_group_mem_t but it needs mm.

This should cause no behavioral change.

Signed-off-by: Alexey Kardashevskiy 
Reviewed-by: David Gibson 
---
 arch/powerpc/include/asm/mmu_context.h | 16 ++--
 arch/powerpc/mm/mmu_context_iommu.c| 46 +-
 drivers/vfio/vfio_iommu_spapr_tce.c| 14 ---
 3 files changed, 36 insertions(+), 40 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index 424844b..b9e3f0a 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -19,16 +19,18 @@ extern void destroy_context(struct mm_struct *mm);
 struct mm_iommu_table_group_mem_t;
 
 extern int isolate_lru_page(struct page *page);/* from internal.h */
-extern bool mm_iommu_preregistered(void);
-extern long mm_iommu_get(unsigned long ua, unsigned long entries,
+extern bool mm_iommu_preregistered(struct mm_struct *mm);
+extern long mm_iommu_get(struct mm_struct *mm,
+   unsigned long ua, unsigned long entries,
struct mm_iommu_table_group_mem_t **pmem);
-extern long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem);
+extern long mm_iommu_put(struct mm_struct *mm,
+   struct mm_iommu_table_group_mem_t *mem);
 extern void mm_iommu_init(struct mm_struct *mm);
 extern void mm_iommu_cleanup(struct mm_struct *mm);
-extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
-   unsigned long size);
-extern struct mm_iommu_table_group_mem_t *mm_iommu_find(unsigned long ua,
-   unsigned long entries);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
+   unsigned long ua, unsigned long size);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
+   unsigned long ua, unsigned long entries);
 extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
unsigned long ua, unsigned long *hpa);
 extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
diff --git a/arch/powerpc/mm/mmu_context_iommu.c 
b/arch/powerpc/mm/mmu_context_iommu.c
index ad2e575..4c6db09 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -56,7 +56,7 @@ static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
}
 
pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n",
-   current->pid,
+   current ? current->pid : 0,
incr ? '+' : '-',
npages << PAGE_SHIFT,
mm->locked_vm << PAGE_SHIFT,
@@ -66,12 +66,9 @@ static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
return ret;
 }
 
-bool mm_iommu_preregistered(void)
+bool mm_iommu_preregistered(struct mm_struct *mm)
 {
-   if (!current || !current->mm)
-   return false;
-
-   return !list_empty(&current->mm->context.iommu_group_mem_list);
+   return !list_empty(&mm->context.iommu_group_mem_list);
 }
 EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
 
@@ -124,19 +121,16 @@ static int mm_iommu_move_page_from_cma(struct page *page)
return 0;
 }
 
-long mm_iommu_get(unsigned long ua, unsigned long entries,
+long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long 
entries,
struct mm_iommu_table_group_mem_t **pmem)
 {
struct mm_iommu_table_group_mem_t *mem;
long i, j, ret = 0, locked_entries = 0;
struct page *page = NULL;
 
-   if (!current || !current->mm)
-   return -ESRCH; /* process exited */
-
mutex_lock(&mem_list_mutex);
 
-   list_for_each_entry_rcu(mem, &current->mm->context.iommu_group_mem_list,
+   list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list,
next) {
if ((mem->ua == ua) && (mem->entries == entries)) {
++mem->used;
@@ -154,7 +148,7 @@ long mm_iommu_get(unsigned long ua, unsigned long entries,
 
}
 
-   ret = mm_iommu_adjust_locked_vm(current->mm, entries, true);
+   ret = mm_iommu_adjust_locked_vm(mm, entries, true);
if (ret)
goto unlock_exit;
 
@@ -215,11 +209,11 @@ long mm_iommu_get(unsigned long ua, unsigned long entries,
mem->entries = entries;
*pmem = mem;
 
-   list_add_rcu(&mem->next, &current->mm->context.iommu_group_mem_list);
+   list_add_rcu(&mem->next, 
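
The rest of the diff is truncated. The caller-side pattern this change
establishes, sketched for illustration (per the log, the !current and
!current->mm checks move out of the helpers into the callers):

	if (!current || !current->mm)
		return -ESRCH; /* process exited */

	ret = mm_iommu_get(current->mm, ua, entries, &mem);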

[PATCH kernel v6 1/7] powerpc/iommu: Pass mm_struct to init/cleanup helpers

2016-11-23 Thread Alexey Kardashevskiy
We are going to get rid of @current references in mmu_context_book3s64.c
and cache mm_struct in the VFIO container. Since mm_context_t does not
have reference counting, we will be using mm_struct which does have
a reference counter.

This changes mm_iommu_init/mm_iommu_cleanup to receive mm_struct rather
than mm_context_t (which is embedded into mm).

This should not cause any behavioral change.

Signed-off-by: Alexey Kardashevskiy 
Reviewed-by: David Gibson 
---
 arch/powerpc/include/asm/mmu_context.h | 4 ++--
 arch/powerpc/kernel/setup-common.c | 2 +-
 arch/powerpc/mm/mmu_context_book3s64.c | 4 ++--
 arch/powerpc/mm/mmu_context_iommu.c| 9 +
 4 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index 5c45114..424844b 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -23,8 +23,8 @@ extern bool mm_iommu_preregistered(void);
 extern long mm_iommu_get(unsigned long ua, unsigned long entries,
struct mm_iommu_table_group_mem_t **pmem);
 extern long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem);
-extern void mm_iommu_init(mm_context_t *ctx);
-extern void mm_iommu_cleanup(mm_context_t *ctx);
+extern void mm_iommu_init(struct mm_struct *mm);
+extern void mm_iommu_cleanup(struct mm_struct *mm);
 extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
unsigned long size);
 extern struct mm_iommu_table_group_mem_t *mm_iommu_find(unsigned long ua,
diff --git a/arch/powerpc/kernel/setup-common.c 
b/arch/powerpc/kernel/setup-common.c
index 270ee30..f516ac5 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -915,7 +915,7 @@ void __init setup_arch(char **cmdline_p)
init_mm.context.pte_frag = NULL;
 #endif
 #ifdef CONFIG_SPAPR_TCE_IOMMU
-   mm_iommu_init(&init_mm.context);
+   mm_iommu_init(&init_mm);
 #endif
irqstack_early_init();
exc_lvl_early_init();
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c 
b/arch/powerpc/mm/mmu_context_book3s64.c
index b114f8b..ad82735 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -115,7 +115,7 @@ int init_new_context(struct task_struct *tsk, struct 
mm_struct *mm)
mm->context.pte_frag = NULL;
 #endif
 #ifdef CONFIG_SPAPR_TCE_IOMMU
-   mm_iommu_init(&mm->context);
+   mm_iommu_init(mm);
 #endif
return 0;
 }
@@ -160,7 +160,7 @@ static inline void destroy_pagetable_page(struct mm_struct 
*mm)
 void destroy_context(struct mm_struct *mm)
 {
 #ifdef CONFIG_SPAPR_TCE_IOMMU
-   mm_iommu_cleanup(&mm->context);
+   mm_iommu_cleanup(mm);
 #endif
 
 #ifdef CONFIG_PPC_ICSWX
diff --git a/arch/powerpc/mm/mmu_context_iommu.c 
b/arch/powerpc/mm/mmu_context_iommu.c
index e0f1c33..ad2e575 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -373,16 +373,17 @@ void mm_iommu_mapped_dec(struct 
mm_iommu_table_group_mem_t *mem)
 }
 EXPORT_SYMBOL_GPL(mm_iommu_mapped_dec);
 
-void mm_iommu_init(mm_context_t *ctx)
+void mm_iommu_init(struct mm_struct *mm)
 {
-   INIT_LIST_HEAD_RCU(&ctx->iommu_group_mem_list);
+   INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list);
 }
 
-void mm_iommu_cleanup(mm_context_t *ctx)
+void mm_iommu_cleanup(struct mm_struct *mm)
 {
struct mm_iommu_table_group_mem_t *mem, *tmp;
 
-   list_for_each_entry_safe(mem, tmp, &ctx->iommu_group_mem_list, next) {
+   list_for_each_entry_safe(mem, tmp, &mm->context.iommu_group_mem_list,
+   next) {
list_del_rcu(&mem->next);
mm_iommu_do_free(mem);
}
-- 
2.5.0.rc3



[PATCH kernel v6 0/7] powerpc/spapr/vfio: Put pages on VFIO container shutdown

2016-11-23 Thread Alexey Kardashevskiy
These patches fix a bug where pages stay pinned for hours
after the QEMU which requested the pinning has exited.

The main change from v5 is that it is now 7 patches.

Please comment. Thanks.

Alexey Kardashevskiy (7):
  powerpc/iommu: Pass mm_struct to init/cleanup helpers
  powerpc/iommu: Stop using @current in mm_iommu_xxx
  vfio/spapr: Postpone allocation of userspace version of TCE table
  vfio/spapr: Add a helper to create default DMA window
  vfio/spapr: Postpone default window creation
  vfio/spapr: Reference mm in tce_container
  powerpc/mm/iommu, vfio/spapr: Put pages on VFIO container shutdown

 arch/powerpc/include/asm/mmu_context.h |  20 +-
 arch/powerpc/kernel/setup-common.c |   2 +-
 arch/powerpc/mm/mmu_context_book3s64.c |   6 +-
 arch/powerpc/mm/mmu_context_iommu.c|  60 ++
 drivers/vfio/vfio_iommu_spapr_tce.c| 324 ++---
 5 files changed, 245 insertions(+), 167 deletions(-)

-- 
2.5.0.rc3



Re: [RFC][PATCH] powerpc/64be: use ELFv2 ABI for big endian kernels

2016-11-23 Thread Oliver O'Halloran
On Thu, Nov 24, 2016 at 1:38 AM, Segher Boessenkool
 wrote:
> On Thu, Nov 24, 2016 at 12:08:40AM +1100, Nicholas Piggin wrote:
>> Question, are there any fundamental reasons we shouldn't use the ELFv2
>> ABI to build big endian kernels if the compiler supports it?
>
> No one uses ELFv2 for BE in production, and it isn't thoroughly tested
> at all, not even regularly tested.  "Not supported", as far as GCC is
> concerned (or any of the distros AFAIK).

Is this actually unsupported by gcc? The ppc64 musl libc port is ABI
v2 only, so they use it on BE too. Buildroot forces ABI v2 to be used
for all of userspace when musl is selected as the libc, so for this
reason it's not completely unused in the wild. It's still pretty niche
though...


Re: [v3,2/3] powerpc: get hugetlbpage handling more generic

2016-11-23 Thread Scott Wood
On Wed, Sep 21, 2016 at 10:11:54AM +0200, Christophe Leroy wrote:
> Today there are two implementations of hugetlbpages which are managed
> by exclusive #ifdefs:
> * FSL_BOOKE: several directory entries point to the same single hugepage
> * BOOK3S: one upper level directory entry points to a table of hugepages
> 
> In preparation of implementation of hugepage support on the 8xx, we
> need a mix of the two above solutions, because the 8xx needs both cases
> depending on the size of pages:
> * In 4k page size mode, each PGD entry covers a 4M bytes area. It means
> that 2 PGD entries will be necessary to cover an 8M hugepage while a
> single PGD entry will cover 8x 512k hugepages.
> * In 16k page size mode, each PGD entry covers a 64M bytes area. It means
> that 8x 8M hugepages will be covered by one PGD entry and 64x 512k
> hugepages will be covered by one PGD entry.
> 
> This patch:
> * removes #ifdefs in favor of if/else based on the range sizes
> * merges the two huge_pte_alloc() functions as they are pretty similar
> * merges the two hugetlbpage_init() functions as they are pretty similar
> 
> Signed-off-by: Christophe Leroy 
> Reviewed-by: Aneesh Kumar K.V 

With this patch on e6500, running the hugetlb testsuite results in the
system hanging in a storm of OOM killer invocations (I'll try to debug
more deeply later).  This patch also changes the default hugepage size on
FSL book3e from 4M to 16M.

-Scott


Re: [RFC][PATCH] Update ppc disassembly in xmon

2016-11-23 Thread Andrew Donnellan

On 24/11/16 13:05, Balbir Singh wrote:

9. The license for these files is now GPL v3 or later


As much as I love the GPLv3, isn't this an instant NAK?

--
Andrew Donnellan  OzLabs, ADL Canberra
andrew.donnel...@au1.ibm.com  IBM Australia Limited



Re: [PATCH kernel v5 4/6] vfio/spapr: Postpone default window creation

2016-11-23 Thread David Gibson
On Wed, Nov 23, 2016 at 04:06:30PM +1100, Alexey Kardashevskiy wrote:
> On 23/11/16 12:35, David Gibson wrote:
> > On Tue, Nov 22, 2016 at 06:29:39PM +1100, Alexey Kardashevskiy wrote:
> >> On 22/11/16 13:50, David Gibson wrote:
> >>> On Fri, Nov 11, 2016 at 11:32:15PM +1100, Alexey Kardashevskiy wrote:
>  As mentioned in the previous patch, we are going to allow the userspace
>  to configure a container in one memory context and pass the container fd to
>  another so we are postponing memory allocations accounted against
>  the locked memory limit. The previous patch took care of it_userspace.
> 
>  At the moment we create the default DMA window when the first group is
>  attached to a container; this is done for the userspace which is not
>  DDW-aware but familiar with the SPAPR TCE IOMMU v2 in the part of memory
>  pre-registration - such a client expects the default DMA window to exist.
> 
>  This postpones the default DMA window allocation till first map/unmap
>  request happens.
> 
>  Signed-off-by: Alexey Kardashevskiy 
>  ---
>   drivers/vfio/vfio_iommu_spapr_tce.c | 98 
>  ++---
>   1 file changed, 47 insertions(+), 51 deletions(-)
> 
>  diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
>  b/drivers/vfio/vfio_iommu_spapr_tce.c
>  index 442baac..1c02498 100644
>  --- a/drivers/vfio/vfio_iommu_spapr_tce.c
>  +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>  @@ -97,6 +97,7 @@ struct tce_container {
>   struct mutex lock;
>   bool enabled;
>   bool v2;
>  +bool def_window_pending;
>   unsigned long locked_pages;
>   struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
>   struct list_head group_list;
>  @@ -594,15 +595,6 @@ static long tce_iommu_create_table(struct 
>  tce_container *container,
>   WARN_ON(!ret && !(*ptbl)->it_ops->free);
>   WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size));
>   
>  -if (!ret && container->v2) {
>  -ret = tce_iommu_userspace_view_alloc(*ptbl);
>  -if (ret)
>  -(*ptbl)->it_ops->free(*ptbl);
>  -}
> >>>
> >>> Does this stuff for the user view belong in the previous patch?
> >>
> >> Yes it does, my mistake, will fix.
> >>
> >>
> >>>
>  -
>  -if (ret)
>  -decrement_locked_vm(table_size >> PAGE_SHIFT);
>  -
>   return ret;
>   }
>   
>  @@ -719,6 +711,29 @@ static long tce_iommu_remove_window(struct 
>  tce_container *container,
>   return 0;
>   }
>   
>  +static long tce_iommu_create_default_window(struct tce_container 
>  *container)
>  +{
>  +long ret;
>  +__u64 start_addr = 0;
>  +struct tce_iommu_group *tcegrp;
>  +struct iommu_table_group *table_group;
>  +
>  +if (!tce_groups_attached(container))
>  +return -ENODEV;
>  +
> +tcegrp = list_first_entry(&container->group_list,
>  +struct tce_iommu_group, next);
>  +table_group = iommu_group_get_iommudata(tcegrp->grp);
>  +if (!table_group)
>  +return -ENODEV;
>  +
>  +ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
> +table_group->tce32_size, 1, &start_addr);
>  +WARN_ON_ONCE(!ret && start_addr);
>  +
>  +return ret;
>  +}
>  +
>   static long tce_iommu_ioctl(void *iommu_data,
>    unsigned int cmd, unsigned long arg)
>   {
>  @@ -809,6 +824,13 @@ static long tce_iommu_ioctl(void *iommu_data,
>   VFIO_DMA_MAP_FLAG_WRITE))
>   return -EINVAL;
>   
>  +if (container->def_window_pending) {
>  +ret = 
>  tce_iommu_create_default_window(container);
>  +if (ret)
>  +return ret;
>  +container->def_window_pending = false;
> >>>
> >>> Would it make sense to clear (and maybe test) def_window_pending
> >>> within create_default_window()?
> >>
> >> Dunno, matter of taste I suppose. I'll move it there.
> >>
> >>
> >>>
>  +}
>  +
>   num = tce_iommu_find_table(container, param.iova, &tbl);
>   if (num < 0)
>   return -ENXIO;
>  @@ -872,6 +894,13 @@ static long tce_iommu_ioctl(void *iommu_data,
>   if (param.flags)
>   return -EINVAL;
>   
>  +if (container->def_window_pending) {
>  +ret = 

linux-next: build failure after merge of the akpm-current tree

2016-11-23 Thread Stephen Rothwell
Hi Andrew,

After merging the akpm-current tree, today's linux-next build (powerpc
ppc64_defconfig) failed like this:

powerpc-linux-ld: unrecognized option '--no-dynamic-linker'

Caused by patch

  "powerpc: add purgatory for kexec_file_load implementation"

I have disabled KEXEC_FILE for now:

From: Stephen Rothwell 
Date: Thu, 24 Nov 2016 15:52:55 +1100
Subject: [PATCH] disable KEXEC_FILE on powerpc for now

Signed-off-by: Stephen Rothwell 
---
 arch/powerpc/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 2d86643f280d..b72c1c7afcf0 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -475,6 +475,7 @@ config KEXEC_FILE
depends on PPC64
depends on CRYPTO=y
depends on CRYPTO_SHA256=y
+   depends on BROKEN
help
  This is a new version of the kexec system call. This call is
  file based and takes in file descriptors as system call arguments
-- 
2.10.2

-- 
Cheers,
Stephen Rothwell


Re: [RFC][PATCH] powerpc/64be: use ELFv2 ABI for big endian kernels

2016-11-23 Thread Nicholas Piggin
On Wed, 23 Nov 2016 08:38:34 -0600
Segher Boessenkool  wrote:

> On Thu, Nov 24, 2016 at 12:08:40AM +1100, Nicholas Piggin wrote:
> > Question, are there any fundamental reasons we shouldn't use the ELFv2
> > ABI to build big endian kernels if the compiler supports it?  
> 
> No one uses ELFv2 for BE in production, and it isn't thoroughly tested
> at all, not even regularly tested.  "Not supported", as far as GCC is
> concerned (or any of the distros AFAIK).
> 
> There are no fundamental reasons of course, ABIs are largely just
> conventions, not laws of nature.

That's a very good reason! I didn't think of that, I'll drop the idea.

Thanks,
Nick


Re: [V2,10/68] powerpc/mm: Update _PAGE_KERNEL_RO

2016-11-23 Thread Geoff Levand

Hi Aneesh,

On 11/23/2016 02:41 AM, Aneesh Kumar K.V wrote:

Can you try this patch?

commit 43e05fa840330f0f2deae1e8cc2effd5df68079f
Author: Aneesh Kumar K.V 
Date:   Wed Nov 23 15:23:05 2016 +0530

powerpc/mm: Kernel RO fixup for cell


I tested your patch with v4.7, v4.8 and v4.9-rc6, and all work OK.

-Geoff


Re: [PATCH 0/2] Preliminary cleanups for HPT resizing

2016-11-23 Thread Paul Mackerras
On Wed, Nov 23, 2016 at 04:14:05PM +1100, David Gibson wrote:
> Hi Paul,
> 
> I'm still chasing this confusion about the CAS bit to send the real
> HPT resizing patches.  However, in the meantime, here are some
> preliminary cleanups.
> 
> These cleanups stand on their own, although I wrote them in the
> context of writing the HPT resizing code, and are prerequisites for
> those patches.
> 
> David Gibson (2):
>   kvm: Move KVM_PPC_PVINFO_FLAGS_EV_IDLE definition next to its
> structure
>   powerpc/kvm: Correctly report KVM_CAP_PPC_ALLOC_HTAB
> 
>  arch/powerpc/kvm/powerpc.c | 5 -
>  include/uapi/linux/kvm.h   | 5 +++--
>  2 files changed, 7 insertions(+), 3 deletions(-)

Thanks, series applied to my kvm-ppc-next branch.

Paul.


Re: [PATCH] cxl: drop duplicate header sched.h

2016-11-23 Thread Ian Munsie
Acked-by: Ian Munsie 



Re: Locking API testsuite output mangled

2016-11-23 Thread Christian Kujau
On Wed, 23 Nov 2016, Michael Ellerman wrote:
> That's nothing powerpc specific AFAICS, does this fix it?

Hm, so s/printk/pr_cont/ - but not in all places? But yeah, this fixes it 
for me, at least on x86.

 Tested-by: Christian Kujau 

Thank you!
Christian.

> 
> cheers
> 
> diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
> index 872a15a2a637..f3a217ea0388 100644
> --- a/lib/locking-selftest.c
> +++ b/lib/locking-selftest.c
> @@ -980,23 +980,23 @@ static void dotest(void (*testcase_fn)(void), int 
> expected, int lockclass_mask)
>  #ifndef CONFIG_PROVE_LOCKING
>   if (expected == FAILURE && debug_locks) {
>   expected_testcase_failures++;
> - printk("failed|");
> + pr_cont("failed|");
>   }
>   else
>  #endif
>   if (debug_locks != expected) {
>   unexpected_testcase_failures++;
> - printk("FAILED|");
> + pr_cont("FAILED|");
>  
>   dump_stack();
>   } else {
>   testcase_successes++;
> - printk("  ok  |");
> + pr_cont("  ok  |");
>   }
>   testcase_total++;
>  
>   if (debug_locks_verbose)
> - printk(" lockclass mask: %x, debug_locks: %d, expected: %d\n",
> + pr_cont(" lockclass mask: %x, debug_locks: %d, expected: %d\n",
>   lockclass_mask, debug_locks, expected);
>   /*
>* Some tests (e.g. double-unlock) might corrupt the preemption
> @@ -1021,26 +1021,26 @@ static inline void print_testname(const char 
> *testname)
>  #define DO_TESTCASE_1(desc, name, nr)\
>   print_testname(desc"/"#nr); \
>   dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK);  \
> - printk("\n");
> + pr_cont("\n");
>  
>  #define DO_TESTCASE_1B(desc, name, nr)   \
>   print_testname(desc"/"#nr); \
>   dotest(name##_##nr, FAILURE, LOCKTYPE_RWLOCK);  \
> - printk("\n");
> + pr_cont("\n");
>  
>  #define DO_TESTCASE_3(desc, name, nr)\
>   print_testname(desc"/"#nr); \
>   dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN);   \
>   dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK);\
>   dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK);\
> - printk("\n");
> + pr_cont("\n");
>  
>  #define DO_TESTCASE_3RW(desc, name, nr)  \
>   print_testname(desc"/"#nr); \
>   dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN|LOCKTYPE_RWLOCK);\
>   dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK);\
>   dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK);\
> - printk("\n");
> + pr_cont("\n");
>  
>  #define DO_TESTCASE_6(desc, name)\
>   print_testname(desc);   \
> @@ -1050,7 +1050,7 @@ static inline void print_testname(const char *testname)
>   dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX);  \
>   dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM);   \
>   dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM);   \
> - printk("\n");
> + pr_cont("\n");
>  
>  #define DO_TESTCASE_6_SUCCESS(desc, name)\
>   print_testname(desc);   \
> @@ -1060,7 +1060,7 @@ static inline void print_testname(const char *testname)
>   dotest(name##_mutex, SUCCESS, LOCKTYPE_MUTEX);  \
>   dotest(name##_wsem, SUCCESS, LOCKTYPE_RWSEM);   \
>   dotest(name##_rsem, SUCCESS, LOCKTYPE_RWSEM);   \
> - printk("\n");
> + pr_cont("\n");
>  
>  /*
>   * 'read' variant: rlocks must not trigger.
> @@ -1073,7 +1073,7 @@ static inline void print_testname(const char *testname)
>   dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX);  \
>   dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM);   \
>   dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM);   \
> - printk("\n");
> + pr_cont("\n");
>  
>  #define DO_TESTCASE_2I(desc, name, nr)   \
>   DO_TESTCASE_1("hard-"desc, name##_hard, nr);\
> @@ -1726,25 +1726,25 @@ static void ww_tests(void)
>   dotest(ww_test_fail_acquire, SUCCESS, LOCKTYPE_WW);
>   dotest(ww_test_normal, SUCCESS, LOCKTYPE_WW);
>   dotest(ww_test_unneeded_slow, FAILURE, LOCKTYPE_WW);
> - printk("\n");
> + pr_cont("\n");
>  
>   print_testname("ww contexts mixing");
>   dotest(ww_test_two_contexts, FAILURE, LOCKTYPE_WW);
>   dotest(ww_test_diff_class, FAILURE, LOCKTYPE_WW);
> - printk("\n");
> + pr_cont("\n");
>  
>   print_testname("finishing ww context");
>   dotest(ww_test_context_done_twice, FAILURE, LOCKTYPE_WW);
>   dotest(ww_test_context_unlock_twice, FAILURE, LOCKTYPE_WW);
>   
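
The rest of the patch continues the same printk -> pr_cont conversion. For
background: after the recent printk() changes, a printk() call without
KERN_CONT starts a new message, so the testsuite's incremental "ok|FAILED|"
output gets split across lines; pr_cont() expands to printk(KERN_CONT ...).
A minimal illustration (not from the patch):

	pr_info("test:");	/* starts a new line */
	pr_cont(" ok  |");	/* appends to the same line */
	pr_cont("\n");		/* terminates it */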

linux-next: manual merge of the kvm-ppc-paulus tree with the powerpc tree

2016-11-23 Thread Stephen Rothwell
Hi Paul,

Today's linux-next merge of the kvm-ppc-paulus tree got a conflict in:

  arch/powerpc/mm/pgtable-radix.c

between commit:

  555c16328ae6 ("powerpc/mm: Correct process and partition table max size")

from the powerpc tree and commit:

  9d66195807ac ("powerpc/64: Provide functions for accessing POWER9 partition 
table")

from the kvm-ppc-paulus tree.

I fixed it up (the latter removed the code modified by the former) and
can carry the fix as necessary. This is now fixed as far as linux-next
is concerned, but any non trivial conflicts should be mentioned to your
upstream maintainer when your tree is submitted for merging.  You may
also want to consider cooperating with the maintainer of the conflicting
tree to minimise any particularly complex conflicts.

-- 
Cheers,
Stephen Rothwell


linux-next: manual merge of the kvm-ppc-paulus tree with the powerpc tree

2016-11-23 Thread Stephen Rothwell
Hi Paul,

Today's linux-next merge of the kvm-ppc-paulus tree got a conflict in:

  arch/powerpc/include/asm/reg.h

between commit:

  29a969b76481 ("powerpc: Revert Load Monitor Register Support")

from the powerpc tree and commits:

  7fd317f8c330 ("powerpc/64: Add some more SPRs and SPR bits for POWER9")
  02ed21aeda0e ("powerpc/powernv: Define and set POWER9 HFSCR doorbell bit")

from the kvm-ppc-paulus tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc arch/powerpc/include/asm/reg.h
index 332e6b4b306a,04aa1ee8cdb6..
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@@ -292,6 -295,9 +295,7 @@@
  #define SPRN_HRMOR	0x139	/* Real mode offset register */
  #define SPRN_HSRR0	0x13A	/* Hypervisor Save/Restore 0 */
  #define SPRN_HSRR1	0x13B	/* Hypervisor Save/Restore 1 */
 -#define SPRN_LMRR	0x32D	/* Load Monitor Region Register */
 -#define SPRN_LMSER	0x32E	/* Load Monitor Section Enable Register */
+ #define SPRN_ASDR	0x330	/* Access segment descriptor register */
  #define SPRN_IC	0x350	/* Virtual Instruction Count */
  #define SPRN_VTB	0x351	/* Virtual Time Base */
  #define SPRN_LDBAR	0x352	/* LD Base Address Register */
@@@ -302,6 -308,8 +306,7 @@@
  #define SPRN_PMCR	0x374	/* Power Management Control Register */
  
  /* HFSCR and FSCR bit numbers are the same */
 -#define FSCR_LM_LG	11	/* Enable Load Monitor Registers */
+ #define FSCR_MSGP_LG	10	/* Enable MSGP */
  #define FSCR_TAR_LG	8	/* Enable Target Address Register */
  #define FSCR_EBB_LG	7	/* Enable Event Based Branching */
  #define FSCR_TM_LG	5	/* Enable Transactional Memory */
@@@ -315,6 -324,8 +320,7 @@@
  #define   FSCR_EBB	__MASK(FSCR_EBB_LG)
  #define   FSCR_DSCR	__MASK(FSCR_DSCR_LG)
  #define SPRN_HFSCR	0xbe	/* HV=1 Facility Status & Control Register */
 -#define   HFSCR_LM	__MASK(FSCR_LM_LG)
+ #define   HFSCR_MSGP	__MASK(FSCR_MSGP_LG)
  #define   HFSCR_TAR	__MASK(FSCR_TAR_LG)
  #define   HFSCR_EBB	__MASK(FSCR_EBB_LG)
  #define   HFSCR_TM	__MASK(FSCR_TM_LG)


linux-next: manual merge of the kvm-ppc-paulus tree with the powerpc-fixes tree

2016-11-23 Thread Stephen Rothwell
Hi Paul,

Today's linux-next merge of the kvm-ppc-paulus tree got a conflict in:

  arch/powerpc/include/asm/asm-prototypes.h

between commit:

  82de5797a260 ("powerpc: Remove extraneous header from asm-prototypes.h")

from the powerpc-fixes tree and commit:

  ebe4535fbe7a ("KVM: PPC: Book3S HV: sparse: prototypes for functions called 
from assembler")

from the kvm-ppc-paulus tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc arch/powerpc/include/asm/asm-prototypes.h
index 81592562e0f8,6c853bcd11fa..
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@@ -13,10 -13,10 +13,13 @@@
   */
  
  #include 
 -#include 
 +#include 
 +#include 
 +#include 
 +#include 
+ #ifdef CONFIG_KVM
+ #include 
+ #endif
  
  #include 
  
@@@ -112,12 -112,45 +115,53 @@@ void early_setup_secondary(void)
  /* time */
  void accumulate_stolen_time(void);
  
 +/* misc runtime */
 +extern u64 __bswapdi2(u64);
 +extern s64 __lshrdi3(s64, int);
 +extern s64 __ashldi3(s64, int);
 +extern s64 __ashrdi3(s64, int);
 +extern int __cmpdi2(s64, s64);
 +extern int __ucmpdi2(u64, u64);
 +
+ /* kvm */
+ #ifdef CONFIG_KVM
+ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+unsigned long ioba, unsigned long tce);
+ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+ unsigned long liobn, unsigned long ioba,
+ unsigned long tce_list, unsigned long npages);
+ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
+  unsigned long liobn, unsigned long ioba,
+  unsigned long tce_value, unsigned long npages);
+ long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
+ unsigned int yield_count);
+ long kvmppc_h_random(struct kvm_vcpu *vcpu);
+ void kvmhv_commence_exit(int trap);
+ long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu);
+ void kvmppc_subcore_enter_guest(void);
+ void kvmppc_subcore_exit_guest(void);
+ long kvmppc_realmode_hmi_handler(void);
+ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
+ long pte_index, unsigned long pteh, unsigned long ptel);
+ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
+  unsigned long pte_index, unsigned long avpn);
+ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu);
+ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
+   unsigned long pte_index, unsigned long avpn,
+   unsigned long va);
+ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
+unsigned long pte_index);
+ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
+ unsigned long pte_index);
+ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
+ unsigned long pte_index);
+ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
+   unsigned long slb_v, unsigned int status, bool 
data);
+ unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu);
+ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+ unsigned long mfrr);
+ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr);
+ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr);
+ #endif
+ 
  #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */


Re: linux-next: manual merge of the powerpc tree with the powerpc-fixes tree

2016-11-23 Thread Michael Ellerman
Stephen Rothwell  writes:

> Hi all,
>
> Today's linux-next merge of the powerpc tree got a conflict in:
>
>   arch/powerpc/include/asm/asm-prototypes.h
>
> between commit:
>
>   9e5f68842276 ("powerpc: Fix missing CRCs, add more asm-prototypes.h 
> declarations")
>
> from the powerpc-fixes tree and commit:
>
>   82de5797a260 ("powerpc: Remove extraneous header from asm-prototypes.h")
>
> from the powerpc tree.
>
> I fixed it up (see below) and can carry the fix as necessary. This
> is now fixed as far as linux-next is concerned, but any non trivial
> conflicts should be mentioned to your upstream maintainer when your tree
> is submitted for merging.  You may also want to consider cooperating
> with the maintainer of the conflicting tree to minimise any particularly
> complex conflicts.

Thanks. I'm planning to merge fixes into next RSN ... as soon as people
stop finding bugs.

cheers


Re: [PATCH v2 2/2] powerpc: kprobes: invoke handlers directly

2016-11-23 Thread Masami Hiramatsu
On Tue, 22 Nov 2016 16:23:13 +0530
"Naveen N. Rao"  wrote:

> On 2016/11/22 09:43PM, Michael Ellerman wrote:
> > "Naveen N. Rao"  writes:
> > > On 2016/11/22 02:25PM, Masami Hiramatsu wrote:
> > >> On Mon, 21 Nov 2016 22:36:41 +0530
> > >> "Naveen N. Rao"  wrote:
> > >> > diff --git a/arch/powerpc/include/asm/kprobes.h 
> > >> > b/arch/powerpc/include/asm/kprobes.h
> > >> > index 2c9759bd..da30dc3 100644
> > >> > --- a/arch/powerpc/include/asm/kprobes.h
> > >> > +++ b/arch/powerpc/include/asm/kprobes.h
> > >> > @@ -32,6 +32,7 @@
> > >> >  #include 
> > >> >  #include 
> > >> >  
> > >> > +#ifdef CONFIG_KPROBES
> > >> >  #define  __ARCH_WANT_KPROBES_INSN_SLOT
> > >> >  
> > >> >  struct pt_regs;
> > >> > @@ -127,5 +128,11 @@ struct kprobe_ctlblk {
> > >> >  extern int kprobe_exceptions_notify(struct notifier_block *self,
> > >> >unsigned long val, void *data);
> > >> >  extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
> > >> > +extern int kprobe_handler(struct pt_regs *regs);
> > >> > +extern int kprobe_post_handler(struct pt_regs *regs);
> > >> > +#else
> > >> > +static int kprobe_handler(struct pt_regs *regs) { return 0; }
> > >> > +static int kprobe_post_handler(struct pt_regs *regs) { return 0; }
> > >> 
> > >> These should be "static inline int kprobe_...", you lost 'inline' here.
> > >> Others are OK for me.
> > >
> > > Ah, indeed. Good catch. Thanks.
> > >  
> > > Michael,
> > > Would you be ok to make this change when applying this, if you're ok 
> > > with the rest of the patch?
> > 
> > Yep done.
> > 
> > Why do we still need kprobe_exceptions_notify() now that it's empty?
> > Just to keep the generic code happy?
> 
> Yup. I took a look to see if we can get rid of it, but there are other 
> architectures that need it.

FYI, x86 uses it not only for hooking traps but also for hooking page
protection faults in kprobe handlers. Anyway, I'd better add a weak
function for that.

Thanks!

> 
> Thanks!
> - Naveen
> 


-- 
Masami Hiramatsu 
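
For reference, the !CONFIG_KPROBES stubs with the missing 'inline' added, as
agreed above:

static inline int kprobe_handler(struct pt_regs *regs) { return 0; }
static inline int kprobe_post_handler(struct pt_regs *regs) { return 0; }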


[PATCH v8] QE: remove PPCisms for QE

2016-11-23 Thread Zhao Qiang
QE was supported on PowerPC and dependent on PPC. Now it is supported
on other platforms, so remove the PPCisms.

Signed-off-by: Zhao Qiang 
---
Changes for v2:
- na
Changes for v3:
- add NO_IRQ
Changes for v4:
- modify spin_event_timeout to opencoded timeout loop
- remove NO_IRQ
- modify virq_to_hw to opencoded code
Changes for v5:
- modify commit msg
- modify depends of QUICC_ENGINE
- add kerneldoc header for qe_issue_cmd
Changes for v6:
- add dependency on FSL_SOC and PPC32 for drivers
  depending on QUICC_ENGINE but not available on ARM
Changes for v7:
- split qeic part to another patch
- rebase
Changes for v8:
- include  in ucc_uart

 drivers/net/ethernet/freescale/Kconfig | 10 ++---
 drivers/soc/fsl/qe/Kconfig |  2 +-
 drivers/soc/fsl/qe/qe.c| 80 --
 drivers/soc/fsl/qe/qe_io.c | 42 --
 drivers/soc/fsl/qe/qe_tdm.c|  8 ++--
 drivers/soc/fsl/qe/ucc.c   | 10 ++---
 drivers/soc/fsl/qe/ucc_fast.c  | 68 ++---
 drivers/tty/serial/Kconfig |  2 +-
 drivers/tty/serial/ucc_uart.c  |  1 +
 drivers/usb/gadget/udc/Kconfig |  2 +-
 drivers/usb/host/Kconfig   |  2 +-
 include/soc/fsl/qe/qe.h|  1 -
 12 files changed, 119 insertions(+), 109 deletions(-)

diff --git a/drivers/net/ethernet/freescale/Kconfig 
b/drivers/net/ethernet/freescale/Kconfig
index d1ca45f..6677aff 100644
--- a/drivers/net/ethernet/freescale/Kconfig
+++ b/drivers/net/ethernet/freescale/Kconfig
@@ -5,10 +5,10 @@
 config NET_VENDOR_FREESCALE
bool "Freescale devices"
default y
-   depends on FSL_SOC || QUICC_ENGINE || CPM1 || CPM2 || PPC_MPC512x || \
-  M523x || M527x || M5272 || M528x || M520x || M532x || \
-  ARCH_MXC || ARCH_MXS || (PPC_MPC52xx && PPC_BESTCOMM) || \
-  ARCH_LAYERSCAPE
+   depends on FSL_SOC || (QUICC_ENGINE && PPC32) || CPM1 || CPM2 || \
+  PPC_MPC512x || M523x || M527x || M5272 || M528x || M520x || \
+  M532x || ARCH_MXC || ARCH_MXS || \
+  (PPC_MPC52xx && PPC_BESTCOMM) || ARCH_LAYERSCAPE
---help---
  If you have a network (Ethernet) card belonging to this class, say Y.
 
@@ -72,7 +72,7 @@ config FSL_XGMAC_MDIO
 
 config UCC_GETH
tristate "Freescale QE Gigabit Ethernet"
-   depends on QUICC_ENGINE
+   depends on QUICC_ENGINE && FSL_SOC && PPC32
select FSL_PQ_MDIO
select PHYLIB
---help---
diff --git a/drivers/soc/fsl/qe/Kconfig b/drivers/soc/fsl/qe/Kconfig
index 73a2e08..b26b643 100644
--- a/drivers/soc/fsl/qe/Kconfig
+++ b/drivers/soc/fsl/qe/Kconfig
@@ -4,7 +4,7 @@
 
 config QUICC_ENGINE
bool "Freescale QUICC Engine (QE) Support"
-   depends on FSL_SOC && PPC32
+   depends on OF && HAS_IOMEM
select GENERIC_ALLOCATOR
select CRC32
help
diff --git a/drivers/soc/fsl/qe/qe.c b/drivers/soc/fsl/qe/qe.c
index 2707a82..2b53e85 100644
--- a/drivers/soc/fsl/qe/qe.c
+++ b/drivers/soc/fsl/qe/qe.c
@@ -33,8 +33,6 @@
 #include 
 #include 
 #include 
-#include 
-#include 
 
 static void qe_snums_init(void);
 static int qe_sdma_init(void);
@@ -109,15 +107,27 @@ void qe_reset(void)
panic("sdma init failed!");
 }
 
+/* issue a command to the QE; returns 0 on success or -EIO on error
+ *
+ * @cmd: the command code, should be QE_INIT_TX_RX, QE_STOP_TX and so on
+ * @device: which sub-block will run the command: QE_CR_SUBBLOCK_UCCFAST1 - 8,
+ * QE_CR_SUBBLOCK_UCCSLOW1 - 8, QE_CR_SUBBLOCK_MCC1 - 3,
+ * QE_CR_SUBBLOCK_IDMA1 - 4 and so on.
+ * @mcn_protocol: specifies the mode for the command for non-MCC, should be
+ * QE_CR_PROTOCOL_HDLC_TRANSPARENT, QE_CR_PROTOCOL_QMC, QE_CR_PROTOCOL_UART
+ * and so on.
+ * @cmd_input: command-related data.
+ */
 int qe_issue_cmd(u32 cmd, u32 device, u8 mcn_protocol, u32 cmd_input)
 {
unsigned long flags;
u8 mcn_shift = 0, dev_shift = 0;
-   u32 ret;
+   int ret;
+   int i;
 
spin_lock_irqsave(_lock, flags);
if (cmd == QE_RESET) {
-   out_be32(_immr->cp.cecr, (u32) (cmd | QE_CR_FLG));
+   iowrite32be((cmd | QE_CR_FLG), _immr->cp.cecr);
} else {
if (cmd == QE_ASSIGN_PAGE) {
/* Here device is the SNUM, not sub-block */
@@ -134,20 +144,26 @@ int qe_issue_cmd(u32 cmd, u32 device, u8 mcn_protocol, 
u32 cmd_input)
mcn_shift = QE_CR_MCN_NORMAL_SHIFT;
}
 
-   out_be32(_immr->cp.cecdr, cmd_input);
-   out_be32(_immr->cp.cecr,
-(cmd | QE_CR_FLG | ((u32) device << dev_shift) | (u32)
- mcn_protocol << mcn_shift));
+   iowrite32be(cmd_input, 

Re: [PATCH 00/11] KVM: PPC: Book3S HV: Support KVM guests on POWER9

2016-11-23 Thread Paul Mackerras
On Wed, Nov 23, 2016 at 11:31:54AM +1100, Paul Mackerras wrote:
> This series of patches adds support to HV KVM for running KVM guests
> on POWER9 systems.  This allows us to run KVM guests that use HPT
> (hashed page table) address translation and know about the POWER9
> processor.  With suitable changes to the user-mode driver, this can
> also run guests on POWER9 in POWER8 or POWER7 compatibility mode.
> 
> For now we require the host to be in HPT mode (not radix).
> 
> This series of patches is based on the ppc-kvm topic branch from the
> powerpc tree merged with my current kvm-ppc-next tree.

I have pushed the merge plus this series to my kvm-ppc-next branch.

Paul.


Re: [PATCH 0/3] minor build fixes

2016-11-23 Thread Nicholas Piggin
On Thu, 24 Nov 2016 09:33:20 +1030
Alan Modra  wrote:

> On Thu, Nov 24, 2016 at 12:02:06AM +1100, Nicholas Piggin wrote:
> > I was building BookE and big endian with a little endian cross
> > compiler and it stopped working. My BookS BE tests must have been
> > building using the ELFv2 ABI. After this, the build sometimes still
> > strangely fails with dot symbols in syscall table unable to be found,
> > but that's looking like it may be a linker bug (Alan is going to take
> > a look).  
> 
> Yes it is a bug.  In compatibility code that was supposed to handle
> mixing old object files that use dot-symbols on function entry with
> newer object files that don't.  Here, "old" means mid 2004 or
> earlier.
> 
> As you can imagine, I'm not hugely concerned about the ld bug..
> 
> Since every binutils back to at least 2.17 has the bug, what changed
> in the kernel to expose it?  Are you building without -mcall-aixdesc?
> 

Yeah, it's my attempt to get the powerpc64le compiler to build big endian
with -mabi=elfv2, so I'd have missed -mcall-aixdesc somewhere.

Thanks,
Nick


Re: [PATCH] powerpc/eeh/of: use builtin_platform_driver

2016-11-23 Thread Russell Currey
On Wed, 2016-11-23 at 22:58 +0800, Geliang Tang wrote:
> Use builtin_platform_driver() helper to simplify the code.
> 
> Signed-off-by: Geliang Tang 
> ---

Acked-by: Russell Currey 


[PATCH] scsi/ipr: Fix runaway IRQs when falling back from MSI to LSI

2016-11-23 Thread Benjamin Herrenschmidt
LSIs must be ack'ed with an MMIO otherwise they remain asserted
forever. This is controlled by the "clear_isr" flag.

While we set that flag properly when deciding initially whether
to use LSIs or MSIs, we fail to set it if we first choose MSIs,
the test fails, and we then fall back to LSIs.

Signed-off-by: Benjamin Herrenschmidt 
---
 drivers/scsi/ipr.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index 5324741..5dd3194 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -10213,6 +10213,7 @@ static int ipr_probe_ioa(struct pci_dev *pdev,
}
 
ioa_cfg->intr_flag = IPR_USE_LSI;
+   ioa_cfg->clear_isr = 1;
ioa_cfg->nvectors = 1;
}
else if (rc)



Re: [PATCH] cxl: drop duplicate header sched.h

2016-11-23 Thread Andrew Donnellan

On 24/11/16 02:27, Geliang Tang wrote:

Drop duplicate header sched.h from native.c.

Signed-off-by: Geliang Tang 


Good catch!

Reviewed-by: Andrew Donnellan 

--
Andrew Donnellan  OzLabs, ADL Canberra
andrew.donnel...@au1.ibm.com  IBM Australia Limited



Re: [PATCH v3 0/2] Disable VF's memory space on updating IOV BARs

2016-11-23 Thread Bjorn Helgaas
On Wed, Oct 26, 2016 at 12:15:34PM +1100, Gavin Shan wrote:
> This moves pcibios_sriov_enable() to the point before VF and VF BARs
> are enabled on PowerNV platform. Also, pci_update_resource() is used
> to update IOV BARs on PowerNV platform, the PF might have been functional
> when the function is called. We shouldn't disable PF's memory decoding
> at that point. Instead, the VF's memory space should be disabled.
> 
> Changelog
> =
> v3:
>   * Disable VF's memory space when IOV BARs are updated in
> pcibios_sriov_enable().
> v2:
>   * Added one patch calling pcibios_sriov_enable() before the VF
> and VF BARs are enabled.
> 
> Gavin Shan (2):
>   PCI: Call pcibios_sriov_enable() before IOV BARs are enabled
>   PCI: Disable VF's memory space on updating IOV BAR in
> pci_update_resource()
> 
>  drivers/pci/iov.c   | 14 +++---
>  drivers/pci/setup-res.c | 28 
>  2 files changed, 27 insertions(+), 15 deletions(-)

I applied these to pci/virtualization for v4.10.  Thanks for your
patience, Gavin.


linux-next: manual merge of the powerpc tree with the powerpc-fixes tree

2016-11-23 Thread Stephen Rothwell
Hi all,

Today's linux-next merge of the powerpc tree got a conflict in:

  arch/powerpc/include/asm/asm-prototypes.h

between commit:

  9e5f68842276 ("powerpc: Fix missing CRCs, add more asm-prototypes.h 
declarations")

from the powerpc-fixes tree and commit:

  82de5797a260 ("powerpc: Remove extraneous header from asm-prototypes.h")

from the powerpc tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc arch/powerpc/include/asm/asm-prototypes.h
index e0baba1535e6,dfef1174663e..
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@@ -13,12 -13,6 +13,11 @@@
   */
  
  #include 
- #include 
 +#include 
 +#include 
 +#include 
 +#include 
 +
  #include 
  
  /* SMP */


Re: [PATCH 0/3] minor build fixes

2016-11-23 Thread Alan Modra
On Thu, Nov 24, 2016 at 12:02:06AM +1100, Nicholas Piggin wrote:
> I was building BookE and big endian with a little endian cross
> compiler and it stopped working. My BookS BE tests must have been
> building using the ELFv2 ABI. After this, the build sometimes still
> strangely fails with dot symbols in syscall table unable to be found,
> but that's looking like it may be a linker bug (Alan is going to take
> a look).

Yes it is a bug.  In compatibility code that was supposed to handle
mixing old object files that use dot-symbols on function entry with
newer object files that don't.  Here, "old" means mid 2004 or
earlier.

As you can imagine, I'm not hugely concerned about the ld bug..

Since every binutils back to at least 2.17 has the bug, what changed
in the kernel to expose it?  Are you building without -mcall-aixdesc?

-- 
Alan Modra
Australia Development Lab, IBM


Re: fsl_pamu: erratum a007907 should be applied on all versions of E6500 chips.

2016-11-23 Thread Scott Wood
On Tue, 2016-11-22 at 08:43 +, Jun Yang wrote:
> Hello Scott,
> Do you know the IOMMU maintainer's mail address?

From MAINTAINERS:

IOMMU DRIVERS
M:  Joerg Roedel 
L:  io...@lists.linux-foundation.org
T:  git git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git
S:  Maintained
F:  Documentation/devicetree/bindings/iommu/
F:  drivers/iommu/

Also CC Varun Sethi 

-Scott



[powerpc:next 81/84] arch/powerpc/include/asm/cmpxchg.h:484:37: warning: passing argument 1 of '__cmpxchg' discards 'volatile' qualifier from pointer target type

2016-11-23 Thread kbuild test robot
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
head:   3382a6220ff3bac886d9d90766f3fe18cf25b468
commit: d0563a1297e234ed37f6b51c2e9321accebd1839 [81/84] powerpc: Implement 
{cmp}xchg for u8 and u16
config: powerpc-allyesconfig (attached as .config)
compiler: powerpc64-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
wget 
https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross
 -O ~/bin/make.cross
chmod +x ~/bin/make.cross
git checkout d0563a1297e234ed37f6b51c2e9321accebd1839
# save the attached .config to linux build tree
make.cross ARCH=powerpc 

All warnings (new ones prefixed by >>):

   In file included from arch/powerpc/include/asm/pgtable-be-types.h:4:0,
from arch/powerpc/include/asm/page.h:292,
from arch/powerpc/include/asm/book3s/64/mmu-hash.h:16,
from arch/powerpc/include/asm/book3s/64/mmu.h:29,
from arch/powerpc/include/asm/mmu.h:282,
from arch/powerpc/include/asm/lppaca.h:36,
from arch/powerpc/include/asm/paca.h:21,
from arch/powerpc/include/asm/current.h:16,
from include/linux/mutex.h:13,
from include/linux/kernfs.h:13,
from include/linux/sysfs.h:15,
from include/linux/kobject.h:21,
from include/linux/cdev.h:4,
from include/drm/drmP.h:36,
from drivers/gpu/drm/drm_lock.c:37:
   drivers/gpu/drm/drm_lock.c: In function 'drm_lock_take':
>> arch/powerpc/include/asm/cmpxchg.h:484:37: warning: passing argument 1 of 
>> '__cmpxchg' discards 'volatile' qualifier from pointer target type 
>> [-Wdiscarded-qualifiers]
 (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_,   \
^
>> drivers/gpu/drm/drm_lock.c:69:10: note: in expansion of macro 'cmpxchg'
  prev = cmpxchg(lock, old, new);
 ^~~
   arch/powerpc/include/asm/cmpxchg.h:402:1: note: expected 'void *' but 
argument is of type 'volatile unsigned int *'
__cmpxchg(void *ptr, unsigned long old, unsigned long new,
^
   drivers/gpu/drm/drm_lock.c: In function 'drm_lock_transfer':
>> arch/powerpc/include/asm/cmpxchg.h:484:37: warning: passing argument 1 of 
>> '__cmpxchg' discards 'volatile' qualifier from pointer target type 
>> [-Wdiscarded-qualifiers]
 (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_,   \
^
   drivers/gpu/drm/drm_lock.c:112:10: note: in expansion of macro 'cmpxchg'
  prev = cmpxchg(lock, old, new);
 ^~~
   arch/powerpc/include/asm/cmpxchg.h:402:1: note: expected 'void *' but 
argument is of type 'volatile unsigned int *'
__cmpxchg(void *ptr, unsigned long old, unsigned long new,
^
   drivers/gpu/drm/drm_lock.c: In function 'drm_legacy_lock_free':
>> arch/powerpc/include/asm/cmpxchg.h:484:37: warning: passing argument 1 of 
>> '__cmpxchg' discards 'volatile' qualifier from pointer target type 
>> [-Wdiscarded-qualifiers]
 (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_,   \
^
   drivers/gpu/drm/drm_lock.c:135:10: note: in expansion of macro 'cmpxchg'
  prev = cmpxchg(lock, old, new);
 ^~~
   arch/powerpc/include/asm/cmpxchg.h:402:1: note: expected 'void *' but 
argument is of type 'volatile unsigned int *'
__cmpxchg(void *ptr, unsigned long old, unsigned long new,
^
   drivers/gpu/drm/drm_lock.c: In function 'drm_legacy_idlelock_release':
>> arch/powerpc/include/asm/cmpxchg.h:484:37: warning: passing argument 1 of 
>> '__cmpxchg' discards 'volatile' qualifier from pointer target type 
>> [-Wdiscarded-qualifiers]
 (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_,   \
^
   drivers/gpu/drm/drm_lock.c:313:12: note: in expansion of macro 'cmpxchg'
prev = cmpxchg(lock, old, DRM_KERNEL_CONTEXT);
   ^~~
   arch/powerpc/include/asm/cmpxchg.h:402:1: note: expected 'void *' but 
argument is of type 'volatile unsigned int *'
__cmpxchg(void *ptr, unsigned long old, unsigned long new,
^

vim +484 arch/powerpc/include/asm/cmpxchg.h

d0563a12 Pan Xinhui2016-04-27  468  	case 2:
d0563a12 Pan Xinhui2016-04-27  469  		return __cmpxchg_u16_acquire(ptr, old, new);
56c08e6d Boqun Feng2015-12-15  470  	case 4:
56c08e6d Boqun Feng2015-12-15  471  		return __cmpxchg_u32_acquire(ptr, old, new);
56c08e6d Boqun Feng2015-12-15  472  #ifdef CONFIG_PPC64
56c08e6d Boqun Feng2015-12-15  473  	case 8:
56c08e6d Boqun Feng2015-12-15  474  		return __cmpxchg_u64_acquire(ptr, old, new);
56c08e6d Boqun Feng2015-12-15  475  #endif
[PATCH] cxl: drop duplicate header sched.h

2016-11-23 Thread Geliang Tang
Drop duplicate header sched.h from native.c.

Signed-off-by: Geliang Tang 
---
 drivers/misc/cxl/native.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index c336350..aeefa53 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -10,7 +10,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
-- 
2.9.3



Re: [PATCH v5 1/7] powerpc/mm: update ptep_set_access_flag to not do full mm tlb flush

2016-11-23 Thread Balbir Singh


On 24/11/16 01:36, Aneesh Kumar K.V wrote:
> Balbir Singh  writes:
> 
>> On 23/11/16 22:53, Aneesh Kumar K.V wrote:
>>> Balbir Singh  writes:
>>>
>>>> On 23/11/16 22:09, Aneesh Kumar K.V wrote:
>>>>> When we are updating pte, we just need to flush the tlb mapping for
>>>>> that pte. Right now we do a full mm flush because we don't track page
>>>>> size. Update the interface to track the page size and use that to
>>>>> do the right tlb flush.
>>>>>
>>>>
>>>> Could you also clarify the scope -- this seems to be _radix_ only.
>>>> The problem statement is not very clear and why doesn't the flush_tlb_page()
>>>> following ptep_set_access_flags() work? What else do we need to do?
>>>
>>> Yes, it modifies only the radix part. I don't understand the
>>> flush_tlb_page() part of the comment above. We are modifying the tlb
>>> flush that we need to do in the pte update sequence for DD1, i.e. we
>>> need to do the flush before we can set the pte with the new value.
>>>
>>> Also in this specific case, we can ideally drop that flush_tlb_page,
>>> because relaxing an access really doesn't need a tlb flush from a
>>> generic architecture point of view. I left it there to make sure we
>>> measure and get the invalidate path correct before going ahead with
>>> that optimization.
>>>
>>
>> OK.. here is my untested solution. I've only compiled it.
>> It breaks the 64/hash/radix abstractions, but it makes the
>> changes much simpler
>>
>> Signed-off-by: Balbir Singh 
> 
> I find the below one more confusing and complicated, spreading the
> details of DD1 around the code. I am not sure what more I could have
> done to simplify the code. We have done the arch pte updates such that
> most of the updates use the pte_update() interface, and the ones which
> relax the access bits go through ptep_set_access_flags. All pte update
> rules are contained there. What you did below is that you moved the DD1
> sequence out to a place where the page size is available. What I did in
> my patch is to pass the page size around. IMHO it is a matter of style.
> I also want to pass the page size around so that we keep huge_pte_update,
> pte_update, ptep_set_access_flags all similar.
> 

Agreed, and the reason I did it that way is that after a while we know
the _dd1_ variants need not be supported/maintained at all. It is a
matter of style, and I was wondering if we need to change the API
to pass address and page_size as a permanent solution.

Balbir Singh.


[PATCH] soc/fsl/qe: use builtin_platform_driver

2016-11-23 Thread Geliang Tang
Use builtin_platform_driver() helper to simplify the code.

Signed-off-by: Geliang Tang 
---
 drivers/soc/fsl/qe/qe.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/soc/fsl/qe/qe.c b/drivers/soc/fsl/qe/qe.c
index 2707a82..ade168f 100644
--- a/drivers/soc/fsl/qe/qe.c
+++ b/drivers/soc/fsl/qe/qe.c
@@ -717,9 +717,5 @@ static struct platform_driver qe_driver = {
.resume = qe_resume,
 };
 
-static int __init qe_drv_init(void)
-{
-   return platform_driver_register(_driver);
-}
-device_initcall(qe_drv_init);
+builtin_platform_driver(qe_driver);
 #endif /* defined(CONFIG_SUSPEND) && defined(CONFIG_PPC_85xx) */
-- 
2.9.3
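
For reference, builtin_platform_driver(qe_driver) expands (via
builtin_driver() in <linux/device.h>) to roughly the code being removed,
so there should be no functional change:

/* Approximate expansion of builtin_platform_driver(qe_driver): */
static int __init qe_driver_init(void)
{
        return platform_driver_register(&qe_driver);
}
device_initcall(qe_driver_init);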



[PATCH] powerpc: sysdev: use builtin_platform_driver

2016-11-23 Thread Geliang Tang
Use builtin_platform_driver() helper to simplify the code.

Signed-off-by: Geliang Tang 
---
 arch/powerpc/sysdev/fsl_pmc.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/arch/powerpc/sysdev/fsl_pmc.c b/arch/powerpc/sysdev/fsl_pmc.c
index 1d6fd7c..232225e 100644
--- a/arch/powerpc/sysdev/fsl_pmc.c
+++ b/arch/powerpc/sysdev/fsl_pmc.c
@@ -85,8 +85,4 @@ static struct platform_driver pmc_driver = {
.probe = pmc_probe,
 };
 
-static int __init pmc_init(void)
-{
-   return platform_driver_register(_driver);
-}
-device_initcall(pmc_init);
+builtin_platform_driver(pmc_driver);
-- 
2.9.3



[PATCH] powerpc: platforms: 83xx: use builtin_platform_driver

2016-11-23 Thread Geliang Tang
Use builtin_platform_driver() helper to simplify the code.

Signed-off-by: Geliang Tang 
---
 arch/powerpc/platforms/83xx/suspend.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/83xx/suspend.c 
b/arch/powerpc/platforms/83xx/suspend.c
index 24717d0..08f92f6 100644
--- a/arch/powerpc/platforms/83xx/suspend.c
+++ b/arch/powerpc/platforms/83xx/suspend.c
@@ -441,8 +441,4 @@ static struct platform_driver pmc_driver = {
.remove = pmc_remove
 };
 
-static int pmc_init(void)
-{
-   return platform_driver_register(_driver);
-}
-device_initcall(pmc_init);
+builtin_platform_driver(pmc_driver);
-- 
2.9.3



[PATCH] powerpc/eeh/of: use builtin_platform_driver

2016-11-23 Thread Geliang Tang
Use builtin_platform_driver() helper to simplify the code.

Signed-off-by: Geliang Tang 
---
 arch/powerpc/kernel/of_platform.c | 7 +--
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/of_platform.c 
b/arch/powerpc/kernel/of_platform.c
index b60a67d..34aeac5 100644
--- a/arch/powerpc/kernel/of_platform.c
+++ b/arch/powerpc/kernel/of_platform.c
@@ -114,11 +114,6 @@ static struct platform_driver of_pci_phb_driver = {
},
 };
 
-static __init int of_pci_phb_init(void)
-{
-   return platform_driver_register(_pci_phb_driver);
-}
-
-device_initcall(of_pci_phb_init);
+builtin_platform_driver(of_pci_phb_driver);
 
 #endif /* CONFIG_PPC_OF_PLATFORM_PCI */
-- 
2.9.3



Re: [RFC][PATCH] powerpc/64be: use ELFv2 ABI for big endian kernels

2016-11-23 Thread Segher Boessenkool
On Thu, Nov 24, 2016 at 12:08:40AM +1100, Nicholas Piggin wrote:
> Question, are there any fundamental reasons we shouldn't use the ELFv2
> ABI to build big endian kernels if the compiler supports it?

No one uses ELFv2 for BE in production, and it isn't thoroughly tested
at all, not even regularly tested.  "Not supported", as far as GCC is
concerned (or any of the distros AFAIK).

There are no fundamental reasons of course, ABIs are largely just
conventions, not laws of nature.


Segher


Re: [PATCH v5 1/7] powerpc/mm: update ptep_set_access_flag to not do full mm tlb flush

2016-11-23 Thread Aneesh Kumar K.V
Balbir Singh  writes:

> On 23/11/16 22:53, Aneesh Kumar K.V wrote:
>> Balbir Singh  writes:
>> 
>>> On 23/11/16 22:09, Aneesh Kumar K.V wrote:
>>>> When we are updating pte, we just need to flush the tlb mapping for
>>>> that pte. Right now we do a full mm flush because we don't track page
>>>> size. Update the interface to track the page size and use that to
>>>> do the right tlb flush.
>>>>
>>>
>>> Could you also clarify the scope -- this seems to be _radix_ only.
>>> The problem statement is not very clear and why doesn't the flush_tlb_page()
>>> following ptep_set_access_flags() work? What else do we need to do?
>> 
>> Yes, it modifies only the radix part. I don't understand the
>> flush_tlb_page() part of the comment above. We are modifying the tlb
>> flush that we need to do in the pte update sequence for DD1, i.e. we
>> need to do the flush before we can set the pte with the new value.
>> 
>> Also in this specific case, we can ideally drop that flush_tlb_page,
>> because relaxing an access really doesn't need a tlb flush from a
>> generic architecture point of view. I left it there to make sure we
>> measure and get the invalidate path correct before going ahead with
>> that optimization.
>> 
>
> OK.. here is my untested solution. I've only compiled it.
> It breaks the 64/hash/radix abstractions, but it makes the
> changes much simpler
>
> Signed-off-by: Balbir Singh 

I find the below one more confusing and complicated, spreading the
details of DD1 around the code. I am not sure what more I could have
done to simplify the code. We have done the arch pte updates such that
most of the updates use the pte_update() interface, and the ones which
relax the access bits go through ptep_set_access_flags. All pte update
rules are contained there. What you did below is that you moved the DD1
sequence out to a place where the page size is available. What I did in
my patch is to pass the page size around. IMHO it is a matter of style.
I also want to pass the page size around so that we keep huge_pte_update,
pte_update, ptep_set_access_flags all similar.


>
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
> b/arch/powerpc/include/asm/book3s/64/radix.h
> index 2a46dea..2454217 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -162,6 +162,30 @@ static inline unsigned long radix__pte_update(struct 
> mm_struct *mm,
>   return old_pte;
>  }
>
> +static inline void radix__ptep_dd1_set_access_flags(struct mm_struct *mm,
> + unsigned long addr,
> + pte_t *ptep, pte_t entry,
> + unsigned long page_size)
> +{
> +
> + unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
> +   _PAGE_RW | _PAGE_EXEC);
> +
> + unsigned long old_pte, new_pte;
> +
> + old_pte = __radix_pte_update(ptep, ~0, 0);
> + asm volatile("ptesync" : : : "memory");
> + /*
> +  * new value of pte
> +  */
> + new_pte = old_pte | set;
> +
> + radix__flush_tlb_page_psize(mm, addr, page_size);
> + __radix_pte_update(ptep, 0, new_pte);
> +
> + asm volatile("ptesync" : : : "memory");
> +}
> +
>  /*
>   * Set the dirty and/or accessed bits atomically in a linux PTE, this
>   * function doesn't need to invalidate tlb.
> @@ -173,26 +197,7 @@ static inline void radix__ptep_set_access_flags(struct 
> mm_struct *mm,
>   unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
> _PAGE_RW | _PAGE_EXEC);
>
> - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
> -
> - unsigned long old_pte, new_pte;
> -
> - old_pte = __radix_pte_update(ptep, ~0, 0);
> - asm volatile("ptesync" : : : "memory");
> - /*
> -  * new value of pte
> -  */
> - new_pte = old_pte | set;
> -
> - /*
> -  * For now let's do heavy pid flush
> -  * radix__flush_tlb_page_psize(mm, addr, mmu_virtual_psize);
> -  */
> - radix__flush_tlb_mm(mm);
> -
> - __radix_pte_update(ptep, 0, new_pte);
> - } else
> - __radix_pte_update(ptep, 0, set);
> + __radix_pte_update(ptep, 0, set);
>   asm volatile("ptesync" : : : "memory");
>  }
>
> diff --git a/arch/powerpc/mm/pgtable-book3s64.c 
> b/arch/powerpc/mm/pgtable-book3s64.c
> index f4f437c..0c7ee0e 100644
> --- a/arch/powerpc/mm/pgtable-book3s64.c
> +++ b/arch/powerpc/mm/pgtable-book3s64.c
> @@ -7,12 +7,14 @@
>   * 2 of the License, or (at your option) any later version.
>   */
>
> +#include 
>  #include 
>  #include 
>  #include 
>
>  #include "mmu_decl.h"
>  #include 
> +#include 
>
>  int (*register_process_table)(unsigned long base, unsigned long page_size,
> 

Re: [PATCH v5 3/7] powerpc/mm/hugetlb: Handle hugepage size supported by hash config

2016-11-23 Thread Aneesh Kumar K.V
Balbir Singh  writes:

> On 23/11/16 22:09, Aneesh Kumar K.V wrote:
>> W.r.t hash page table config, we support 16MB and 16GB as the hugepage
>> size. Update the hstate_get_psize to handle 16M and 16G.
>> 
>> Signed-off-by: Aneesh Kumar K.V 
>> ---
>>  arch/powerpc/include/asm/book3s/64/hugetlb.h | 4 
>>  1 file changed, 4 insertions(+)
>> 
>> diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h 
>> b/arch/powerpc/include/asm/book3s/64/hugetlb.h
>> index 499268045306..d9c283f95e05 100644
>> --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h
>> +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h
>> @@ -21,6 +21,10 @@ static inline int hstate_get_psize(struct hstate *hstate)
>>  return MMU_PAGE_2M;
>>  else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift)
>>  return MMU_PAGE_1G;
>> +else if (shift == mmu_psize_defs[MMU_PAGE_16M].shift)
>> +return MMU_PAGE_16M;
>> +else if (shift == mmu_psize_defs[MMU_PAGE_16G].shift)
>> +return MMU_PAGE_16G;
>>  else {
>>  WARN(1, "Wrong huge page shift\n");
>>  return mmu_virtual_psize;
>> 
>
> Is this related to this patch series? Radix can't do these sizes
>

The code returns the psize (the index value of the page size) from the
hstate. It doesn't do any verification. I added the hash details here
because this header is now supposed to contain generic functions, not
radix-specific ones.

-aneesh



Re: [RFC][PATCH] powerpc/64be: use ELFv2 ABI for big endian kernels

2016-11-23 Thread Balbir Singh
On Thu, Nov 24, 2016 at 12:08 AM, Nicholas Piggin  wrote:
> Question, are there any fundamental reasons we shouldn't use the ELFv2
> ABI to build big endian kernels if the compiler supports it?
>

Does this have implications w.r.t interfaces to

1. openfirmware/skiboot
2. glibc/vdso

Keen to find out as well

Balbir Singh.


Re: [PATCH v5 3/7] powerpc/mm/hugetlb: Handle hugepage size supported by hash config

2016-11-23 Thread Balbir Singh


On 23/11/16 22:09, Aneesh Kumar K.V wrote:
> W.r.t hash page table config, we support 16MB and 16GB as the hugepage
> size. Update the hstate_get_psize to handle 16M and 16G.
> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  arch/powerpc/include/asm/book3s/64/hugetlb.h | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h 
> b/arch/powerpc/include/asm/book3s/64/hugetlb.h
> index 499268045306..d9c283f95e05 100644
> --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h
> +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h
> @@ -21,6 +21,10 @@ static inline int hstate_get_psize(struct hstate *hstate)
>   return MMU_PAGE_2M;
>   else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift)
>   return MMU_PAGE_1G;
> + else if (shift == mmu_psize_defs[MMU_PAGE_16M].shift)
> + return MMU_PAGE_16M;
> + else if (shift == mmu_psize_defs[MMU_PAGE_16G].shift)
> + return MMU_PAGE_16G;
>   else {
>   WARN(1, "Wrong huge page shift\n");
>   return mmu_virtual_psize;
> 

Is this related to this patch series? Radix can't do these sizes

Balbir


Re: [PATCH v5 1/7] powerpc/mm: update ptep_set_access_flag to not do full mm tlb flush

2016-11-23 Thread Balbir Singh


On 23/11/16 22:53, Aneesh Kumar K.V wrote:
> Balbir Singh  writes:
> 
>> On 23/11/16 22:09, Aneesh Kumar K.V wrote:
>>> When we are updating pte, we just need to flush the tlb mapping for
>>> that pte. Right now we do a full mm flush because we don't track page
>>> size. Update the interface to track the page size and use that to
>>> do the right tlb flush.
>>>
>>
>> Could you also clarify the scope -- this seems to be _radix_ only.
>> The problem statement is not very clear and why doesn't the flush_tlb_page()
>> following ptep_set_access_flags() work? What else do we need to do?
> 
> Yes, it modifies only the radix part. I don't understand the
> flush_tlb_page() part of the comment above. We are modifying the tlb
> flush that we need to do in the pte update sequence for DD1, i.e. we
> need to do the flush before we can set the pte with the new value.
> 
> Also in this specific case, we can ideally drop that flush_tlb_page,
> because relaxing an access really doesn't need a tlb flush from a
> generic architecture point of view. I left it there to make sure we
> measure and get the invalidate path correct before going ahead with
> that optimization.
> 

OK.. here is my untested solution. I've only compiled it.
It breaks the 64/hash/radix abstractions, but it makes the
changes much simpler

Signed-off-by: Balbir Singh 

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
b/arch/powerpc/include/asm/book3s/64/radix.h
index 2a46dea..2454217 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -162,6 +162,30 @@ static inline unsigned long radix__pte_update(struct 
mm_struct *mm,
return old_pte;
 }
 
+static inline void radix__ptep_dd1_set_access_flags(struct mm_struct *mm,
+   unsigned long addr,
+   pte_t *ptep, pte_t entry,
+   unsigned long page_size)
+{
+
+   unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
+ _PAGE_RW | _PAGE_EXEC);
+
+   unsigned long old_pte, new_pte;
+
+   old_pte = __radix_pte_update(ptep, ~0, 0);
+   asm volatile("ptesync" : : : "memory");
+   /*
+* new value of pte
+*/
+   new_pte = old_pte | set;
+
+   radix__flush_tlb_page_psize(mm, addr, page_size);
+   __radix_pte_update(ptep, 0, new_pte);
+
+   asm volatile("ptesync" : : : "memory");
+}
+
 /*
  * Set the dirty and/or accessed bits atomically in a linux PTE, this
  * function doesn't need to invalidate tlb.
@@ -173,26 +197,7 @@ static inline void radix__ptep_set_access_flags(struct 
mm_struct *mm,
unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
  _PAGE_RW | _PAGE_EXEC);
 
-   if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
-
-   unsigned long old_pte, new_pte;
-
-   old_pte = __radix_pte_update(ptep, ~0, 0);
-   asm volatile("ptesync" : : : "memory");
-   /*
-* new value of pte
-*/
-   new_pte = old_pte | set;
-
-   /*
-* For now let's do heavy pid flush
-* radix__flush_tlb_page_psize(mm, addr, mmu_virtual_psize);
-*/
-   radix__flush_tlb_mm(mm);
-
-   __radix_pte_update(ptep, 0, new_pte);
-   } else
-   __radix_pte_update(ptep, 0, set);
+   __radix_pte_update(ptep, 0, set);
asm volatile("ptesync" : : : "memory");
 }
 
diff --git a/arch/powerpc/mm/pgtable-book3s64.c 
b/arch/powerpc/mm/pgtable-book3s64.c
index f4f437c..0c7ee0e 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -7,12 +7,14 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include 
 #include 
 #include 
 #include 
 
 #include "mmu_decl.h"
 #include 
+#include 
 
 int (*register_process_table)(unsigned long base, unsigned long page_size,
  unsigned long tbl_size);
@@ -35,7 +37,15 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, 
unsigned long address,
 #endif
changed = !pmd_same(*(pmdp), entry);
if (changed) {
-   __ptep_set_access_flags(vma->vm_mm, pmdp_ptep(pmdp), 
pmd_pte(entry));
+   if (radix_enabled() && cpu_has_feature(CPU_FTR_POWER9_DD1)) {
+   unsigned long page_size;
+   page_size = is_vm_hugetlb_page(vma) ?
+   huge_page_size(hstate_vma(vma)) : PAGE_SIZE;
+
+   radix__ptep_dd1_set_access_flags(vma->vm_mm, address,
+   pmdp_ptep(pmdp), pmd_pte(entry), 
page_size);
+   } else
+   __ptep_set_access_flags(vma->vm_mm, pmdp_ptep(pmdp), 

Re: [mm v2 0/3] Support memory cgroup hotplug

2016-11-23 Thread Michal Hocko
On Thu 24-11-16 00:05:12, Balbir Singh wrote:
> 
> 
> On 23/11/16 20:28, Michal Hocko wrote:
[...]
> > I am more worried about synchronization with the hotplug, which tends to
> > be a PITA in places where we were simply safe by definition until now. We
> > do not have all that many users of memcg->nodeinfo[nid] from what I can
> > see, but are all of them safe to never race with the hotplug? A lack of a
> > high-level design description is less than encouraging.
> 
> As in explanation? The design is dictated by the notifier and the actions
> to take when the node comes online/offline.

Sure, but how are all the users of lruvec (for example), which is stored
in the nodeinfo AFAIR, supposed to synchronize with the notifier?
Really, if you are doing something dynamic, then the first thing to
explain is the synchronization. There might be really good reasons why we
do not have to care about explicit synchronization for most code paths,
but my past experience with many subtle hotplug-related bugs just makes
me a bit suspicious. So in other words, please make sure to document as
much as possible. This will make the review so much easier.

> > So please try to
> > spend some time describing how we use nodeinfo currently, how the
> > synchronization with the hotplug is supposed to work, and what
> > guarantees that no stale nodeinfos can ever be used. This is just too
> > easy to get wrong...
> > 
> 
> OK.. I'll add that in the next cover letter

Thanks!

-- 
Michal Hocko
SUSE Labs


[RFC][PATCH] powerpc/64be: use ELFv2 ABI for big endian kernels

2016-11-23 Thread Nicholas Piggin
Question, are there any fundamental reasons we shouldn't use the ELFv2
ABI to build big endian kernels if the compiler supports it?

Thanks,
Nick

---
 arch/powerpc/Makefile  | 15 +--
 arch/powerpc/boot/Makefile |  2 +-
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 902da6e..b4867fc 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -72,18 +72,19 @@ GNUTARGET   := powerpc
 MULTIPLEWORD   := -mmultiple
 endif
 
+ifeq ($(CONFIG_PPC64),y)
+cflags-y   += $(call cc-option,-mabi=elfv2)
+aflags-y   += $(call cc-option,-mabi=elfv2)
+endif
+
 cflags-$(CONFIG_CPU_BIG_ENDIAN)+= $(call 
cc-option,-mbig-endian)
-cflags-$(CONFIG_CPU_BIG_ENDIAN)+= $(call cc-option,-mabi=elfv1)
 cflags-$(CONFIG_CPU_LITTLE_ENDIAN) += -mlittle-endian
-cflags-$(CONFIG_CPU_LITTLE_ENDIAN) += -mabi=elfv2
 ifneq ($(cc-name),clang)
   cflags-$(CONFIG_CPU_LITTLE_ENDIAN)   += -mno-strict-align
 endif
 
 aflags-$(CONFIG_CPU_BIG_ENDIAN)+= $(call 
cc-option,-mbig-endian)
-aflags-$(CONFIG_CPU_BIG_ENDIAN)+= $(call cc-option,-mabi=elfv1)
 aflags-$(CONFIG_CPU_LITTLE_ENDIAN) += -mlittle-endian
-aflags-$(CONFIG_CPU_LITTLE_ENDIAN) += -mabi=elfv2
 
 ifeq ($(HAS_BIARCH),y)
 override AS+= -a$(BITS)
@@ -113,14 +114,8 @@ endif
 endif
 
 CFLAGS-$(CONFIG_PPC64) := $(call cc-option,-mtraceback=no)
-ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
 CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mabi=elfv2,$(call 
cc-option,-mcall-aixdesc))
 AFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mabi=elfv2)
-else
-CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mcall-aixdesc)
-CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mabi=elfv1)
-AFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mabi=elfv1)
-endif
 CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mcmodel=medium,$(call 
cc-option,-mminimal-toc))
 CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mno-pointers-to-nested-functions)
 CFLAGS-$(CONFIG_PPC32) := -ffixed-r2 $(MULTIPLEWORD)
diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 1d50d41..ef2499b 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -33,7 +33,7 @@ BOOTCFLAGS+= -m64
 endif
 ifdef CONFIG_CPU_BIG_ENDIAN
 BOOTCFLAGS += -mbig-endian
-BOOTCFLAGS += $(call cc-option,-mabi=elfv1)
+BOOTCFLAGS += $(call cc-option,-mabi=elfv2)
 else
 BOOTCFLAGS += -mlittle-endian
 BOOTCFLAGS += $(call cc-option,-mabi=elfv2)
-- 
2.10.2



Re: [mm v2 0/3] Support memory cgroup hotplug

2016-11-23 Thread Balbir Singh


On 23/11/16 20:28, Michal Hocko wrote:
> On Wed 23-11-16 19:37:16, Balbir Singh wrote:
>>
>>
>> On 23/11/16 19:07, Michal Hocko wrote:
>>> On Wed 23-11-16 18:50:42, Balbir Singh wrote:


>>>> On 23/11/16 18:25, Michal Hocko wrote:
>>>>> On Wed 23-11-16 15:36:51, Balbir Singh wrote:
>>>>>> In the absence of hotplug we use extra memory proportional to
>>>>>> (possible_nodes - online_nodes) * number_of_cgroups. PPC64 has a patch
>>>>>> to disable large consumption with large number of cgroups. This patch
>>>>>> adds hotplug support to memory cgroups and reverts the commit that
>>>>>> limited possible nodes to online nodes.
>>>>>
>>>>> Balbir,
>>>>> I have asked this in the previous version but there still seems to be a
>>>>> lack of information of _why_ do we want this, _how_ much do we save on
>>>>> the memory overhead on most systems and _why_ the additional complexity
>>>>> is really worth it. Please make sure to add all this in the cover
>>>>> letter.
>>>>>
>>>>
>>>> The data is in the patch referred to in patch 3. The order of waste was
>>>> 200MB for 400 cgroup directories, enough for us to restrict possible_map
>>>> to online_map. These patches allow us to have a larger possible map and
>>>> allow onlining nodes not in the online_map, which is currently a
>>>> restriction on ppc64.
>>>
>>> How common is it to have possible_map >> online_map? If this is ppc64,
>>> then what is the downside of keeping the current restriction instead?
>>>
>>
>> On my system CONFIG_NODES_SHIFT is 8 (256 nodes) and possible_nodes is 2.
>> The downside is losing the ability to hotplug and online an offline node.
>> Please see http://www.spinics.net/lists/linux-mm/msg116724.html
> 
> OK, so we are slowly getting to what I've asked originally ;) So who
> cares? Depending on CONFIG_NODES_SHIFT (which tends to be quite large in
> distribution or other general purpose kernels) the overhead is 424B (as
> per pahole on the current kernel) for one numa node. Most machines are
> expected to have 1-4 numa nodes, so the overhead might be somewhere
> around 100K per memcg with 256 possible nodes (424B x 256 ~= 106K). Not
> a trivial amount for sure, but I would rather encourage people to lower
> the possible node count for their hardware if it is artificially large.
> 

On my desktop NODES_SHIFT is 6; many distro kernels have it at 9. I've known
of solutions that use fake NUMA for partitioning and need as many nodes as
possible.

>>>> A typical system that I use has about 100-150 directories, depending on
>>>> the number of users/docker instances/configuration/virtual machines.
>>>> These numbers will only grow as we pack more of these instances on them.
>>>>
>>>> From a complexity viewpoint, the patches are quite straightforward.
>>>
>>> Well, I would like to hear more about that. {get,put}_online_memory
>>> at random places doesn't sound all that straightforward to me.
>>>
>>
>> I thought those places were not random :) I tried to think them out as
>> discussed with Vladimir. I don't claim the code is bug free, we can fix
>> any bugs as we test this more.
> 
> I am more worried about synchronization with the hotplug, which tends to
> be a PITA in places where we were simply safe by definition until now. We
> do not have all that many users of memcg->nodeinfo[nid] from what I can
> see, but are all of them safe to never race with the hotplug? A lack of a
> high-level design description is less than encouraging.

As in explanation? The design is dictated by the notifier and the actions
to take when the node comes online/offline.

> So please try to
> spend some time describing how we use nodeinfo currently, how the
> synchronization with the hotplug is supposed to work, and what
> guarantees that no stale nodeinfos can ever be used. This is just too
> easy to get wrong...
> 

OK.. I'll add that in the next cover letter

Balbir
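
For context, the read-side pattern under discussion looks roughly like
this (a sketch only -- it assumes the series pairs nodeinfo readers with
the existing get_online_mems()/put_online_mems() hotplug lock):

struct mem_cgroup_per_node *pn;

get_online_mems();              /* block memory hot-(un)plug */
pn = memcg->nodeinfo[nid];      /* nodeinfo cannot be torn down here */
/* ... use pn (lruvec and friends) ... */
put_online_mems();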


[PATCH 3/3] powerpc/64e: don't branch to dot symbols

2016-11-23 Thread Nicholas Piggin
This converts one that was missed by b1576fec7f4d ("powerpc: No need
to use dot symbols when branching to a function").

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64e.S | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64e.S 
b/arch/powerpc/kernel/exceptions-64e.S
index 38a1f96..45b453e 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -923,10 +923,10 @@ kernel_dbg_exc:
PROLOG_ADDITION_NONE)
EXCEPTION_COMMON(0x340)
addir3,r1,STACK_FRAME_OVERHEAD
-   bl  .save_nvgprs
+   bl  save_nvgprs
INTS_RESTORE_HARD
-   bl  .unknown_exception
-   b   .ret_from_except
+   bl  unknown_exception
+   b   ret_from_except
 
 /*
  * An interrupt came in while soft-disabled; We mark paca->irq_happened
-- 
2.10.2



[PATCH 2/3] powerpc: allow compilation on cross-endian toolchain

2016-11-23 Thread Nicholas Piggin
GCC can compile with either endian, but the ABI version always
defaults to the default endian. Alan Modra says:

  you need both -mbig and -mabi=elfv1 to make a powerpc64le gcc
  generate powerpc64 code

The opposite is true for powerpc64: when generating -mlittle it
requires -mabi=elfv2 to generate the v2 ABI. This change adds the ABI
annotations together with the endianness flags.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/Makefile  | 6 ++
 arch/powerpc/boot/Makefile | 1 +
 2 files changed, 7 insertions(+)

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 617dece..902da6e 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -73,13 +73,17 @@ MULTIPLEWORD:= -mmultiple
 endif
 
 cflags-$(CONFIG_CPU_BIG_ENDIAN)+= $(call 
cc-option,-mbig-endian)
+cflags-$(CONFIG_CPU_BIG_ENDIAN)+= $(call cc-option,-mabi=elfv1)
 cflags-$(CONFIG_CPU_LITTLE_ENDIAN) += -mlittle-endian
+cflags-$(CONFIG_CPU_LITTLE_ENDIAN) += -mabi=elfv2
 ifneq ($(cc-name),clang)
   cflags-$(CONFIG_CPU_LITTLE_ENDIAN)   += -mno-strict-align
 endif
 
 aflags-$(CONFIG_CPU_BIG_ENDIAN)+= $(call 
cc-option,-mbig-endian)
+aflags-$(CONFIG_CPU_BIG_ENDIAN)+= $(call cc-option,-mabi=elfv1)
 aflags-$(CONFIG_CPU_LITTLE_ENDIAN) += -mlittle-endian
+aflags-$(CONFIG_CPU_LITTLE_ENDIAN) += -mabi=elfv2
 
 ifeq ($(HAS_BIARCH),y)
 override AS+= -a$(BITS)
@@ -114,6 +118,8 @@ CFLAGS-$(CONFIG_PPC64)  += $(call 
cc-option,-mabi=elfv2,$(call cc-option,-mcall-a
 AFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mabi=elfv2)
 else
 CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mcall-aixdesc)
+CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mabi=elfv1)
+AFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mabi=elfv1)
 endif
 CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mcmodel=medium,$(call 
cc-option,-mminimal-toc))
 CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mno-pointers-to-nested-functions)
diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index eae2dc8..1d50d41 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -33,6 +33,7 @@ BOOTCFLAGS+= -m64
 endif
 ifdef CONFIG_CPU_BIG_ENDIAN
 BOOTCFLAGS += -mbig-endian
+BOOTCFLAGS += $(call cc-option,-mabi=elfv1)
 else
 BOOTCFLAGS += -mlittle-endian
 BOOTCFLAGS += $(call cc-option,-mabi=elfv2)
-- 
2.10.2



[PATCH 1/3] powerpc/64e: convert cmpi to cmpwi in head_64.S

2016-11-23 Thread Nicholas Piggin
From 80f23935cadb ("powerpc: Convert cmp to cmpd in idle enter sequence"):

PowerPC's "cmp" instruction has four operands. Normally people write
"cmpw" or "cmpd" for the second cmp operand 0 or 1. But, frequently
people forget, and write "cmp" with just three operands.

With older binutils this is silently accepted as if this was "cmpw",
while often "cmpd" is wanted. With newer binutils GAS will complain
about this for 64-bit code. For 32-bit code it still silently assumes
"cmpw" is what is meant.

In this instance the code comes directly from ISA v2.07, including the
cmp, but cmpd is correct. Backport to stable so that new toolchains can
build old kernels.

In this case, cmpwi is called for, so this is just a build fix for
new toolchains.

Stable: v3.0
Cc: Segher Boessenkool 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/head_64.S | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 04c546e..1f7f908 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -214,9 +214,9 @@ booting_thread_hwid:
  */
 _GLOBAL(book3e_start_thread)
LOAD_REG_IMMEDIATE(r5, MSR_KERNEL)
-   cmpi0, r3, 0
+   cmpwi   r3, 0
beq 10f
-   cmpi0, r3, 1
+   cmpwi   r3, 1
beq 11f
/* If the thread id is invalid, just exit. */
b   13f
@@ -241,9 +241,9 @@ _GLOBAL(book3e_start_thread)
  * r3 = the thread physical id
  */
 _GLOBAL(book3e_stop_thread)
-   cmpi0, r3, 0
+   cmpwi   r3, 0
beq 10f
-   cmpi0, r3, 1
+   cmpwi   r3, 1
beq 10f
/* If the thread id is invalid, just exit. */
b   13f
-- 
2.10.2



[PATCH 0/3] minor build fixes

2016-11-23 Thread Nicholas Piggin
I was building BookE and big endian with a little endian cross
compiler and it stopped working. My BookS BE tests must have been
building using the ELFv2 ABI. After this, the build sometimes still
strangely fails with dot symbols in syscall table unable to be found,
but that's looking like it may be a linker bug (Alan is going to take
a look).

Thanks,
Nick

Nicholas Piggin (3):
  powerpc/64e: convert cmpi to cmpwi in head_64.S
  powerpc: allow compilation on cross-endian toolchain
  powerpc/64e: don't branch to dot symbols

 arch/powerpc/Makefile| 6 ++
 arch/powerpc/boot/Makefile   | 1 +
 arch/powerpc/kernel/exceptions-64e.S | 6 +++---
 arch/powerpc/kernel/head_64.S| 8 
 4 files changed, 14 insertions(+), 7 deletions(-)

-- 
2.10.2



Re: [PATCH v2] powerpc: split ftrace bits into a separate file

2016-11-23 Thread Michael Ellerman
"Naveen N. Rao"  writes:

> entry_*.S now includes a lot more than just kernel entry/exit code. As a
> first step at cleaning this up, let's split out the ftrace bits into
> separate files.
>
> No functional changes.
>
> Suggested-by: Michael Ellerman 
> Signed-off-by: Naveen N. Rao 
> ---
> v2: updated commit description.
>
>  arch/powerpc/kernel/Makefile|   2 +
>  arch/powerpc/kernel/entry_32.S  | 107 ---
>  arch/powerpc/kernel/entry_64.S  | 380 --
>  arch/powerpc/kernel/ftrace_32.S | 118 
>  arch/powerpc/kernel/ftrace_64.S | 391 
> 

Thanks for having a crack at this.

I think I'd actually like to go the whole way, and move all the tracing
code into arch/powerpc/trace (or ftrace?).

There's the 32 and 64-bit asm, and ftrace.c, and trace_clock.c. And then
possibly later some of the ftrace module code could move in there too.

And as part of moving the asm, I think we can come up with a better way
to organise the code. If you look at the ftrace code in entry_64.S for
example the ifdefs look like:

#ifdef CONFIG_FUNCTION_TRACER
#ifdef CONFIG_DYNAMIC_FTRACE
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
#endif
#else /* CC_USING_MPROFILE_KERNEL */
#ifdef CONFIG_LIVEPATCH
#endif
#ifdef CONFIG_LIVEPATCH
#endif
#ifdef CONFIG_LIVEPATCH
#endif
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
#endif
#endif /* CC_USING_MPROFILE_KERNEL */
#ifdef CONFIG_LIVEPATCH
#endif
#else
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
#endif
#endif /* CONFIG_DYNAMIC_FTRACE */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
#else /* CC_USING_MPROFILE_KERNEL */
#endif /* CC_USING_MPROFILE_KERNEL */
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
#endif /* CONFIG_FUNCTION_TRACER */


Which is not easy to follow.

I think the main axis is CC_USING_MPROFILE_KERNEL y/n, so perhaps we split
it into two .S files based on that?

cheers


Re: [PATCH v5 1/7] powerpc/mm: update ptep_set_access_flag to not do full mm tlb flush

2016-11-23 Thread Aneesh Kumar K.V
Balbir Singh  writes:

> On 23/11/16 22:09, Aneesh Kumar K.V wrote:
>> When we are updating pte, we just need to flush the tlb mapping for
>> that pte. Right now we do a full mm flush because we don't track page
>> size. Update the interface to track the page size and use that to
>> do the right tlb flush.
>> 
>
> Could you also clarify the scope -- this seems to be _radix_ only.
> The problem statement is not very clear and why doesn't the flush_tlb_page()
> following ptep_set_access_flags() work? What else do we need to do?

Yes, it modifies only the radix part. I don't understand the
flush_tlb_page() part of the comment above. We are modifying the tlb
flush that we need to do in the pte update sequence for DD1, i.e. we
need to do the flush before we can set the pte with the new value.

Also in this specific case, we can ideally drop that flush_tlb_page,
because relaxing an access really doesn't need a tlb flush from a
generic architecture point of view. I left it there to make sure we
measure and get the invalidate path correct before going ahead with
that optimization.


>
>
>> Signed-off-by: Aneesh Kumar K.V 
>> ---
>>  arch/powerpc/include/asm/book3s/32/pgtable.h |  4 +++-
>>  arch/powerpc/include/asm/book3s/64/pgtable.h |  7 +--
>>  arch/powerpc/include/asm/book3s/64/radix.h   | 14 +++---
>>  arch/powerpc/include/asm/nohash/32/pgtable.h |  4 +++-
>>  arch/powerpc/include/asm/nohash/64/pgtable.h |  4 +++-
>>  arch/powerpc/mm/pgtable-book3s64.c   |  3 ++-
>>  arch/powerpc/mm/pgtable-radix.c  | 16 
>>  arch/powerpc/mm/pgtable.c| 10 --
>>  arch/powerpc/mm/tlb-radix.c  | 15 ---
>>  9 files changed, 47 insertions(+), 30 deletions(-)
>> 

-aneesh



Re: [RFC] [PATCH] Trace TLBIE's

2016-11-23 Thread Balbir Singh


On 23/11/16 21:15, Michael Ellerman wrote:
> Balbir Singh  writes:
> 
>> Just a quick patch to trace tlbie(l)'s. The idea being that it can be
>> enabled when we suspect corruption or when we need to see if we are doing
>> the right thing during flush. I think the format can be enhanced to
>> make it nicer (expand the RB/RS/IS/L cases in more detail). For now I am
>> sharing the idea to get inputs
>>
>> A typical trace might look like this
>>
>>
>> <...>-5141  [062]  1354.486693: tlbie:
>>  tlbie with lpid 0, local 0, rb=7b5d0ff874f11f1, rs=0, ric=0 prs=0 r=0
>> systemd-udevd-2584  [018]  1354.486772: tlbie:
>>  tlbie with lpid 0, local 0, rb=17be1f421adc10c1, rs=0, ric=0 prs=0 r=0
>> ...
>>
>> qemu-system-ppc-5371  [016]  1412.369519: tlbie:
>>  tlbie with lpid 0, local 1, rb=67bd8900174c11c1, rs=0, ric=0 prs=0 r=0
>> qemu-system-ppc-5377  [056]  1421.687262: tlbie:
>>  tlbie with lpid 1, local 0, rb=5f04edffa00c11c1, rs=1, ric=0 prs=0 r=0
> 
> My first reaction is "why the hell do we have so many open-coded
> tlbies". So step one might be to add a static inline helper, that way we
> don't have to add the trace_tlbie() in so many places.
> 

The problem is the variants.

Hash uses the two-operand variant and radix on P9 uses the five-operand
variant. To make matters even more complex, depending on the CPU_FTR_ARCH
level we also have operand variants with L, and a version where the
immediate value was removed. I've tried to encapsulate the traces with
their variants, but that implied having the traces at the various call
sites.

We also use tlbie* from assembly (like the one called by kvmppc_hv_entry),
which is not covered by these traces at the moment, but I guess we can
add them as well.

> Also in some of them you call trace_tlbie() before the
> eieio/tlbsync/ptesync. Which may not be wrong, but looks worrying at
> first glance.
> 

I'll try and make it consistent, but there are places like do_tlbies()
where we loop doing the tlbie(l)s and then do the syncs. In those cases
I've had to put the traces within the loop instead of queuing or looping
twice. We don't access any of the addresses flushed, just print the registers.

> But overall I guess it's OK. We'd want to do a quick benchmark to make
> sure it's not adding any overhead.

OK.. I'll try and find a benchmark and run it with traces disabled.

Thanks for the review
Balbir Singh
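
For illustration, the static inline helper Michael suggests could look
something like this for the five-operand POWER9 form (a sketch only; the
helper name and trace arguments are assumptions about how a final patch
might wire it up, and ric/prs/r must be compile-time constants for the
"i" constraints):

static inline void tlbie_traced(unsigned long rb, unsigned long rs,
                                unsigned int ric, unsigned int prs,
                                unsigned int r, unsigned int lpid,
                                unsigned int local)
{
        asm volatile(PPC_TLBIE_5(%0, %1, %2, %3, %4)
                     : : "r" (rb), "r" (rs), "i" (ric), "i" (prs), "i" (r)
                     : "memory");
        trace_tlbie(lpid, local, rb, rs, ric, prs, r);
}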


Re: [PATCH v5 1/7] powerpc/mm: update ptep_set_access_flag to not do full mm tlb flush

2016-11-23 Thread Balbir Singh


On 23/11/16 22:09, Aneesh Kumar K.V wrote:
> When we are updating pte, we just need to flush the tlb mapping for
> that pte. Right now we do a full mm flush because we don't track page
> size. Update the interface to track the page size and use that to
> do the right tlb flush.
> 

Could you also clarify the scope -- this seems to be _radix_ only.
The problem statement is not very clear and why doesn't the flush_tlb_page()
following ptep_set_access_flags() work? What else do we need to do?


> Signed-off-by: Aneesh Kumar K.V 
> ---
>  arch/powerpc/include/asm/book3s/32/pgtable.h |  4 +++-
>  arch/powerpc/include/asm/book3s/64/pgtable.h |  7 +--
>  arch/powerpc/include/asm/book3s/64/radix.h   | 14 +++---
>  arch/powerpc/include/asm/nohash/32/pgtable.h |  4 +++-
>  arch/powerpc/include/asm/nohash/64/pgtable.h |  4 +++-
>  arch/powerpc/mm/pgtable-book3s64.c   |  3 ++-
>  arch/powerpc/mm/pgtable-radix.c  | 16 
>  arch/powerpc/mm/pgtable.c| 10 --
>  arch/powerpc/mm/tlb-radix.c  | 15 ---
>  9 files changed, 47 insertions(+), 30 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h 
> b/arch/powerpc/include/asm/book3s/32/pgtable.h
> index 6b8b2d57fdc8..cd835e74e633 100644
> --- a/arch/powerpc/include/asm/book3s/32/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
> @@ -224,7 +224,9 @@ static inline void huge_ptep_set_wrprotect(struct 
> mm_struct *mm,
>  
>  
>  static inline void __ptep_set_access_flags(struct mm_struct *mm,
> -pte_t *ptep, pte_t entry)
> +pte_t *ptep, pte_t entry,
> +unsigned long address,
> +unsigned long page_size)
>  {
>   unsigned long set = pte_val(entry) &
>   (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
> b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index 86870c11917b..761622ec7f2a 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -580,10 +580,13 @@ static inline bool check_pte_access(unsigned long 
> access, unsigned long ptev)
>   */
>  
>  static inline void __ptep_set_access_flags(struct mm_struct *mm,
> -pte_t *ptep, pte_t entry)
> +pte_t *ptep, pte_t entry,
> +unsigned long address,
> +unsigned long page_size)
>  {
>   if (radix_enabled())
> - return radix__ptep_set_access_flags(mm, ptep, entry);
> + return radix__ptep_set_access_flags(mm, ptep, entry,
> + address, page_size);
>   return hash__ptep_set_access_flags(ptep, entry);
>  }
>  
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
> b/arch/powerpc/include/asm/book3s/64/radix.h
> index 2a46dea8e1b1..e104004bf2b1 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -110,6 +110,7 @@
>  #define RADIX_PUD_TABLE_SIZE (sizeof(pud_t) << RADIX_PUD_INDEX_SIZE)
>  #define RADIX_PGD_TABLE_SIZE (sizeof(pgd_t) << RADIX_PGD_INDEX_SIZE)
>  
> +extern int radix_get_mmu_psize(unsigned long page_size);
>  static inline unsigned long __radix_pte_update(pte_t *ptep, unsigned long 
> clr,
>  unsigned long set)
>  {
> @@ -167,7 +168,9 @@ static inline unsigned long radix__pte_update(struct 
> mm_struct *mm,
>   * function doesn't need to invalidate tlb.
>   */
>  static inline void radix__ptep_set_access_flags(struct mm_struct *mm,
> - pte_t *ptep, pte_t entry)
> + pte_t *ptep, pte_t entry,
> + unsigned long address,
> + unsigned long page_size)
>  {
>  
>   unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
> @@ -175,6 +178,7 @@ static inline void radix__ptep_set_access_flags(struct 
> mm_struct *mm,
>  
>   if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
>  
> + int psize;
>   unsigned long old_pte, new_pte;
>  
>   old_pte = __radix_pte_update(ptep, ~0, 0);
> @@ -183,12 +187,8 @@ static inline void radix__ptep_set_access_flags(struct 
> mm_struct *mm,
>* new value of pte
>*/
>   new_pte = old_pte | set;
> -
> - /*
> -  * For now let's do heavy pid flush
> -  * radix__flush_tlb_page_psize(mm, addr, mmu_virtual_psize);
> -  */
> - radix__flush_tlb_mm(mm);
> + psize = 

[PATCH v5 7/7] powerpc/mm: Batch tlb flush when invalidating pte entries

2016-11-23 Thread Aneesh Kumar K.V
This will improve the task exit case, by batching tlb invalidates.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/radix.h | 16 +++-
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
b/arch/powerpc/include/asm/book3s/64/radix.h
index da94bdae1f88..2c3b93a628d7 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -142,15 +142,21 @@ static inline unsigned long radix__pte_update(struct 
mm_struct *mm,
unsigned long new_pte;
 
old_pte = __radix_pte_update(ptep, ~0, 0);
-   asm volatile("ptesync" : : : "memory");
/*
 * new value of pte
 */
new_pte = (old_pte | set) & ~clr;
-   psize = radix_get_mmu_psize(page_size);
-   radix__flush_tlb_page_psize(mm, addr, psize);
-
-   __radix_pte_update(ptep, 0, new_pte);
+   /*
+* If we are trying to clear the pte, we can skip
+* the below sequence and batch the tlb flush. The
+* tlb flush batching is done by mmu gather code
+*/
+   if (new_pte) {
+   asm volatile("ptesync" : : : "memory");
+   psize = radix_get_mmu_psize(page_size);
+   radix__flush_tlb_page_psize(mm, addr, psize);
+   __radix_pte_update(ptep, 0, new_pte);
+   }
} else
old_pte = __radix_pte_update(ptep, clr, set);
asm volatile("ptesync" : : : "memory");
-- 
2.10.2



[PATCH v5 6/7] powerpc/mm: update pte_update to not do full mm tlb flush

2016-11-23 Thread Aneesh Kumar K.V
When we update a pte, we only need to flush the tlb mapping for
that pte. Right now we do a full mm flush because we don't track the page
size. Update the interface to track the page size and use it to
do the right tlb flush.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 16 ++--
 arch/powerpc/include/asm/book3s/64/radix.h   | 19 ---
 arch/powerpc/mm/pgtable-radix.c  |  2 +-
 3 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 4a2bd260eec0..7789ce64beb1 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -301,12 +301,16 @@ extern unsigned long pci_io_base;
 
 static inline unsigned long pte_update(struct mm_struct *mm, unsigned long 
addr,
   pte_t *ptep, unsigned long clr,
-  unsigned long set, int huge)
+  unsigned long set,
+  unsigned long page_size)
 {
+   bool huge = (page_size != PAGE_SIZE);
+
if (radix_enabled())
-   return radix__pte_update(mm, addr, ptep, clr, set, huge);
+   return radix__pte_update(mm, addr, ptep, clr, set, page_size);
return hash__pte_update(mm, addr, ptep, clr, set, huge);
 }
+
 /*
  * For hash even if we have _PAGE_ACCESSED = 0, we do a pte_update.
  * We currently remove entries from the hashtable regardless of whether
@@ -324,7 +328,7 @@ static inline int __ptep_test_and_clear_young(struct 
mm_struct *mm,
 
if ((pte_raw(*ptep) & cpu_to_be64(_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 
0)
return 0;
-   old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0);
+   old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, PAGE_SIZE);
return (old & _PAGE_ACCESSED) != 0;
 }
 
@@ -343,21 +347,21 @@ static inline void ptep_set_wrprotect(struct mm_struct 
*mm, unsigned long addr,
if ((pte_raw(*ptep) & cpu_to_be64(_PAGE_WRITE)) == 0)
return;
 
-   pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 0);
+   pte_update(mm, addr, ptep, _PAGE_WRITE, 0, PAGE_SIZE);
 }
 
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
   unsigned long addr, pte_t *ptep)
 {
-   unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0);
+   unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, PAGE_SIZE);
return __pte(old);
 }
 
 static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
 pte_t * ptep)
 {
-   pte_update(mm, addr, ptep, ~0UL, 0, 0);
+   pte_update(mm, addr, ptep, ~0UL, 0, PAGE_SIZE);
 }
 
 static inline int pte_write(pte_t pte)
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
b/arch/powerpc/include/asm/book3s/64/radix.h
index e104004bf2b1..da94bdae1f88 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -129,15 +129,16 @@ static inline unsigned long __radix_pte_update(pte_t 
*ptep, unsigned long clr,
 
 
 static inline unsigned long radix__pte_update(struct mm_struct *mm,
-   unsigned long addr,
-   pte_t *ptep, unsigned long clr,
-   unsigned long set,
-   int huge)
+ unsigned long addr,
+ pte_t *ptep, unsigned long clr,
+ unsigned long set,
+ unsigned long page_size)
 {
unsigned long old_pte;
 
if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
 
+   int psize;
unsigned long new_pte;
 
old_pte = __radix_pte_update(ptep, ~0, 0);
@@ -146,18 +147,14 @@ static inline unsigned long radix__pte_update(struct 
mm_struct *mm,
 * new value of pte
 */
new_pte = (old_pte | set) & ~clr;
-
-   /*
-* For now let's do heavy pid flush
-* radix__flush_tlb_page_psize(mm, addr, mmu_virtual_psize);
-*/
-   radix__flush_tlb_mm(mm);
+   psize = radix_get_mmu_psize(page_size);
+   radix__flush_tlb_page_psize(mm, addr, psize);
 
__radix_pte_update(ptep, 0, new_pte);
} else
old_pte = __radix_pte_update(ptep, clr, set);
asm volatile("ptesync" : : : "memory");
-   if (!huge)
+   if (page_size == PAGE_SIZE)
assert_pte_locked(mm, addr);
 
return old_pte;
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c

[PATCH v5 5/7] powerpc/mm/hugetlb: Switch hugetlb update to use huge_pte_update

2016-11-23 Thread Aneesh Kumar K.V
We want to switch pte_update to use a va-based tlb flush. In order to do that we
need to track the page size. With hugetlb we currently don't have the page size
available in these functions, hence switch hugetlb to use separate functions
for the update. In a later patch we will update the hugetlb functions to take
vm_area_struct, from which we can derive the page size. After that we will switch
this back to use pte_update.
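
For reference, once the hugetlb functions take a vm_area_struct, the
page size can be derived along these lines (a sketch using the generic
hugetlb helpers; not part of this patch):

	unsigned long page_size = huge_page_size(hstate_vma(vma));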

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/hugetlb.h | 43 +++-
 arch/powerpc/include/asm/book3s/64/pgtable.h |  9 --
 2 files changed, 42 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h 
b/arch/powerpc/include/asm/book3s/64/hugetlb.h
index 8fc04d2ac86f..586236625117 100644
--- a/arch/powerpc/include/asm/book3s/64/hugetlb.h
+++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h
@@ -31,9 +31,50 @@ static inline int hstate_get_psize(struct hstate *hstate)
}
 }
 
+static inline unsigned long huge_pte_update(struct mm_struct *mm, unsigned 
long addr,
+   pte_t *ptep, unsigned long clr,
+   unsigned long set)
+{
+   if (radix_enabled()) {
+   unsigned long old_pte;
+
+   if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
+
+   unsigned long new_pte;
+
+   old_pte = __radix_pte_update(ptep, ~0, 0);
+   asm volatile("ptesync" : : : "memory");
+   /*
+* new value of pte
+*/
+   new_pte = (old_pte | set) & ~clr;
+   /*
+* For now let's do heavy pid flush
+* radix__flush_tlb_page_psize(mm, addr, 
mmu_virtual_psize);
+*/
+   radix__flush_tlb_mm(mm);
+
+   __radix_pte_update(ptep, 0, new_pte);
+   } else
+   old_pte = __radix_pte_update(ptep, clr, set);
+   asm volatile("ptesync" : : : "memory");
+   return old_pte;
+   }
+   return hash__pte_update(mm, addr, ptep, clr, set, true);
+}
+
+static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
+  unsigned long addr, pte_t *ptep)
+{
+   if ((pte_raw(*ptep) & cpu_to_be64(_PAGE_WRITE)) == 0)
+   return;
+
+   huge_pte_update(mm, addr, ptep, _PAGE_WRITE, 0);
+}
+
 static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
 {
-   return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1));
+   return __pte(huge_pte_update(mm, addr, ptep, ~0UL, 0));
 }
 #endif
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 761622ec7f2a..4a2bd260eec0 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -346,15 +346,6 @@ static inline void ptep_set_wrprotect(struct mm_struct 
*mm, unsigned long addr,
pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 0);
 }
 
-static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
-  unsigned long addr, pte_t *ptep)
-{
-   if ((pte_raw(*ptep) & cpu_to_be64(_PAGE_WRITE)) == 0)
-   return;
-
-   pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 1);
-}
-
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
   unsigned long addr, pte_t *ptep)
-- 
2.10.2



[PATCH v5 4/7] powerpc/mm/hugetlb: Make copy of huge_ptep_get_and_clear to different platform headers

2016-11-23 Thread Aneesh Kumar K.V
In the subsequent patch we will change the book3s 64 implementation. This
also avoids #ifdefs in the code.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/32/pgtable.h |  5 +
 arch/powerpc/include/asm/book3s/64/hugetlb.h |  6 ++
 arch/powerpc/include/asm/hugetlb.h   | 10 --
 arch/powerpc/include/asm/nohash/32/pgtable.h |  6 ++
 arch/powerpc/include/asm/nohash/64/pgtable.h |  7 +++
 5 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h 
b/arch/powerpc/include/asm/book3s/32/pgtable.h
index cd835e74e633..3994c403ad9a 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -222,6 +222,11 @@ static inline void huge_ptep_set_wrprotect(struct 
mm_struct *mm,
ptep_set_wrprotect(mm, addr, ptep);
 }
 
+static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+   unsigned long addr, pte_t *ptep)
+{
+   return __pte(pte_update(ptep, ~0UL, 0));
+}
 
 static inline void __ptep_set_access_flags(struct mm_struct *mm,
   pte_t *ptep, pte_t entry,
diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h 
b/arch/powerpc/include/asm/book3s/64/hugetlb.h
index d9c283f95e05..8fc04d2ac86f 100644
--- a/arch/powerpc/include/asm/book3s/64/hugetlb.h
+++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h
@@ -30,4 +30,10 @@ static inline int hstate_get_psize(struct hstate *hstate)
return mmu_virtual_psize;
}
 }
+
+static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+   unsigned long addr, pte_t *ptep)
+{
+   return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1));
+}
 #endif
diff --git a/arch/powerpc/include/asm/hugetlb.h 
b/arch/powerpc/include/asm/hugetlb.h
index c03e0a3dd4d8..8b39806851c0 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -132,16 +132,6 @@ static inline void set_huge_pte_at(struct mm_struct *mm, 
unsigned long addr,
set_pte_at(mm, addr, ptep, pte);
 }
 
-static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
-   unsigned long addr, pte_t *ptep)
-{
-#ifdef CONFIG_PPC64
-   return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1));
-#else
-   return __pte(pte_update(ptep, ~0UL, 0));
-#endif
-}
-
 static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
 unsigned long addr, pte_t *ptep)
 {
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h 
b/arch/powerpc/include/asm/nohash/32/pgtable.h
index 4153b8e591a4..8569673682dd 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -266,6 +266,12 @@ static inline void huge_ptep_set_wrprotect(struct 
mm_struct *mm,
ptep_set_wrprotect(mm, addr, ptep);
 }
 
+static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+   unsigned long addr, pte_t *ptep)
+{
+   return __pte(pte_update(ptep, ~0UL, 0));
+}
+
 
 static inline void __ptep_set_access_flags(struct mm_struct *mm,
   pte_t *ptep, pte_t entry,
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h 
b/arch/powerpc/include/asm/nohash/64/pgtable.h
index 7e42b8195e85..f7a973d4b509 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -266,6 +266,13 @@ static inline void huge_ptep_set_wrprotect(struct 
mm_struct *mm,
pte_update(mm, addr, ptep, _PAGE_RW, 0, 1);
 }
 
+static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+   unsigned long addr, pte_t *ptep)
+{
+   return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1));
+}
+
+
 /*
  * We currently remove entries from the hashtable regardless of whether
  * the entry was young or dirty. The generic routines only flush if the
-- 
2.10.2



[PATCH v5 3/7] powerpc/mm/hugetlb: Handle hugepage size supported by hash config

2016-11-23 Thread Aneesh Kumar K.V
W.r.t. the hash page table config, we support 16MB and 16GB as hugepage
sizes. Update hstate_get_psize to handle 16M and 16G.
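
A sketch of the kind of caller this enables, assuming a hugetlb vma is
at hand (names taken from elsewhere in this series):

	int psize = hstate_get_psize(hstate_vma(vma));

	radix__flush_tlb_page_psize(vma->vm_mm, addr, psize);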

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/hugetlb.h | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h 
b/arch/powerpc/include/asm/book3s/64/hugetlb.h
index 499268045306..d9c283f95e05 100644
--- a/arch/powerpc/include/asm/book3s/64/hugetlb.h
+++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h
@@ -21,6 +21,10 @@ static inline int hstate_get_psize(struct hstate *hstate)
return MMU_PAGE_2M;
else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift)
return MMU_PAGE_1G;
+   else if (shift == mmu_psize_defs[MMU_PAGE_16M].shift)
+   return MMU_PAGE_16M;
+   else if (shift == mmu_psize_defs[MMU_PAGE_16G].shift)
+   return MMU_PAGE_16G;
else {
WARN(1, "Wrong huge page shift\n");
return mmu_virtual_psize;
-- 
2.10.2



[PATCH v5 2/7] powerpc/mm: Rename hugetlb-radix.h to hugetlb.h

2016-11-23 Thread Aneesh Kumar K.V
We will start moving some book3s-specific hugetlb functions there.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/{hugetlb-radix.h => hugetlb.h} | 4 ++--
 arch/powerpc/include/asm/hugetlb.h| 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)
 rename arch/powerpc/include/asm/book3s/64/{hugetlb-radix.h => hugetlb.h} (90%)

diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h 
b/arch/powerpc/include/asm/book3s/64/hugetlb.h
similarity index 90%
rename from arch/powerpc/include/asm/book3s/64/hugetlb-radix.h
rename to arch/powerpc/include/asm/book3s/64/hugetlb.h
index c45189aa7476..499268045306 100644
--- a/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_POWERPC_BOOK3S_64_HUGETLB_RADIX_H
-#define _ASM_POWERPC_BOOK3S_64_HUGETLB_RADIX_H
+#ifndef _ASM_POWERPC_BOOK3S_64_HUGETLB_H
+#define _ASM_POWERPC_BOOK3S_64_HUGETLB_H
 /*
  * For radix we want generic code to handle hugetlb. But then if we want
  * both hash and radix to be enabled together we need to workaround the
diff --git a/arch/powerpc/include/asm/hugetlb.h 
b/arch/powerpc/include/asm/hugetlb.h
index c5517f463ec7..c03e0a3dd4d8 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -9,7 +9,7 @@ extern struct kmem_cache *hugepte_cache;
 
 #ifdef CONFIG_PPC_BOOK3S_64
 
-#include 
+#include 
 /*
  * This should work for other subarchs too. But right now we use the
  * new format only for 64bit book3s
-- 
2.10.2



[PATCH v5 1/7] powerpc/mm: update ptep_set_access_flag to not do full mm tlb flush

2016-11-23 Thread Aneesh Kumar K.V
When we update a pte, we only need to flush the tlb mapping for
that pte. Right now we do a full mm flush because we don't track the page
size. Update the interface to track the page size and use it to
do the right tlb flush.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/32/pgtable.h |  4 +++-
 arch/powerpc/include/asm/book3s/64/pgtable.h |  7 +--
 arch/powerpc/include/asm/book3s/64/radix.h   | 14 +++---
 arch/powerpc/include/asm/nohash/32/pgtable.h |  4 +++-
 arch/powerpc/include/asm/nohash/64/pgtable.h |  4 +++-
 arch/powerpc/mm/pgtable-book3s64.c   |  3 ++-
 arch/powerpc/mm/pgtable-radix.c  | 16 
 arch/powerpc/mm/pgtable.c| 10 --
 arch/powerpc/mm/tlb-radix.c  | 15 ---
 9 files changed, 47 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h 
b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 6b8b2d57fdc8..cd835e74e633 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -224,7 +224,9 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct 
*mm,
 
 
 static inline void __ptep_set_access_flags(struct mm_struct *mm,
-  pte_t *ptep, pte_t entry)
+  pte_t *ptep, pte_t entry,
+  unsigned long address,
+  unsigned long page_size)
 {
unsigned long set = pte_val(entry) &
(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 86870c11917b..761622ec7f2a 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -580,10 +580,13 @@ static inline bool check_pte_access(unsigned long access, 
unsigned long ptev)
  */
 
 static inline void __ptep_set_access_flags(struct mm_struct *mm,
-  pte_t *ptep, pte_t entry)
+  pte_t *ptep, pte_t entry,
+  unsigned long address,
+  unsigned long page_size)
 {
if (radix_enabled())
-   return radix__ptep_set_access_flags(mm, ptep, entry);
+   return radix__ptep_set_access_flags(mm, ptep, entry,
+   address, page_size);
return hash__ptep_set_access_flags(ptep, entry);
 }
 
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
b/arch/powerpc/include/asm/book3s/64/radix.h
index 2a46dea8e1b1..e104004bf2b1 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -110,6 +110,7 @@
 #define RADIX_PUD_TABLE_SIZE   (sizeof(pud_t) << RADIX_PUD_INDEX_SIZE)
 #define RADIX_PGD_TABLE_SIZE   (sizeof(pgd_t) << RADIX_PGD_INDEX_SIZE)
 
+extern int radix_get_mmu_psize(unsigned long page_size);
 static inline unsigned long __radix_pte_update(pte_t *ptep, unsigned long clr,
   unsigned long set)
 {
@@ -167,7 +168,9 @@ static inline unsigned long radix__pte_update(struct 
mm_struct *mm,
  * function doesn't need to invalidate tlb.
  */
 static inline void radix__ptep_set_access_flags(struct mm_struct *mm,
-   pte_t *ptep, pte_t entry)
+   pte_t *ptep, pte_t entry,
+   unsigned long address,
+   unsigned long page_size)
 {
 
unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
@@ -175,6 +178,7 @@ static inline void radix__ptep_set_access_flags(struct 
mm_struct *mm,
 
if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
 
+   int psize;
unsigned long old_pte, new_pte;
 
old_pte = __radix_pte_update(ptep, ~0, 0);
@@ -183,12 +187,8 @@ static inline void radix__ptep_set_access_flags(struct 
mm_struct *mm,
 * new value of pte
 */
new_pte = old_pte | set;
-
-   /*
-* For now let's do heavy pid flush
-* radix__flush_tlb_page_psize(mm, addr, mmu_virtual_psize);
-*/
-   radix__flush_tlb_mm(mm);
+   psize = radix_get_mmu_psize(page_size);
+   radix__flush_tlb_page_psize(mm, address, psize);
 
__radix_pte_update(ptep, 0, new_pte);
} else
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h 
b/arch/powerpc/include/asm/nohash/32/pgtable.h
index c219ef7be53b..4153b8e591a4 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h

RE: [V2,10/68] powerpc/mm: Update _PAGE_KERNEL_RO

2016-11-23 Thread Aneesh Kumar K.V
Geoff Levand  writes:

> Hi Aneesh,
>
>> --- a/arch/powerpc/platforms/ps3/spu.c
>> +++ b/arch/powerpc/platforms/ps3/spu.c
>> @@ -205,7 +205,7 @@ static void spu_unmap(struct spu *spu)
>>  static int __init setup_areas(struct spu *spu)
>>  {
>>  struct table {char* name; unsigned long addr; unsigned long size;};
>> -static const unsigned long shadow_flags = _PAGE_NO_CACHE | 3;
>> +unsigned long shadow_flags = 
>> pgprot_val(pgprot_noncached_wc(PAGE_KERNEL_RO));
>>  
>>  spu_pdata(spu)->shadow = __ioremap(spu_pdata(spu)->shadow_addr,
>> sizeof(struct spe_shadow),
>
> This shadow_flags setting doesn't work correctly for PS3.  The PS3's LV1
> hypervisor wants the shadow reg pte bits N=1 and PP=11, so (rflags & 7) == 7.
> It also doesn't want bit 0x8000 set.  So maybe these:
>
>   (HPTE_R_N | HPTE_R_PP & ~HPTE_R_PP0)
>
> For what its worth, this is the error for v4.8:
>
>   ps3_hpte_insert:result=LV1_ILLEGAL_PARAMETER_VALUE (-17) vpn=7e4fa575c 
> pa=30003000 ix=bae0 v=7e4fa575c001 r=800030003126
>
> I tried different shadow_flags settings to try to get htab_convert_pte_flags()
> to give me the right pte bits, but couldn't find anything that would work.
>
> Here's the only thing I could get to work:
>
> --- a/arch/powerpc/mm/hash_utils_64.c
> +++ b/arch/powerpc/mm/hash_utils_64.c
> @@ -222,6 +222,12 @@ unsigned long htab_convert_pte_flags(unsigned long 
> pteflags)
>*/
>   rflags |= HPTE_R_M;
>
> + if ((pteflags & 0xc0003000UL) == 0xc0003000UL) {
> + pr_info("%s: bad rflags: pteflags= %lx => rflags=%lx\n",
> + __func__, pteflags, rflags);
> + return 0x127;
> + }
> +
>   return rflags;
>  }
>
> And here's the output of that:
>
>  htab_convert_pte_flags: bad rflags: pteflags= c0003000393c => 
> rflags=8126
>  htab_convert_pte_flags: bad rflags: pteflags= c0003000593c => 
> rflags=8126
>  htab_convert_pte_flags: bad rflags: pteflags= c0003000793c => 
> rflags=8126
>  htab_convert_pte_flags: bad rflags: pteflags= c0003000993c => 
> rflags=8126
>  htab_convert_pte_flags: bad rflags: pteflags= c0003000b93c => 
> rflags=8126
>  htab_convert_pte_flags: bad rflags: pteflags= c0003000d93c => 
> rflags=8126
>
> Actually, the problem started with (6a119eae942c "powerpc/mm: Add a _PAGE_PTE 
> bit"),
> but I could fix that by using 'shadow_flags =  (_PAGE_PRESENT | 
> _PAGE_NO_CACHE | _PAGE_USER)'.
>
> Please let me know what I need for shadow_flags to get those pte bits set.

Can you try this patch ?

commit 43e05fa840330f0f2deae1e8cc2effd5df68079f
Author: Aneesh Kumar K.V 
Date:   Wed Nov 23 15:23:05 2016 +0530

powerpc/mm: Kernel RO fixup for cell

Signed-off-by: Aneesh Kumar K.V 

diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index e88368354e49..c13242bf3098 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -27,6 +27,11 @@
 /*
  * Individual features below.
  */
+/*
+ * kernel read only support
+ * We added the ppp value 0b110 in ISA 2.04
+ */
+#define MMU_FTR_KERNEL_RO  ASM_CONST(0x4000)
 
 /*
  * We need to clear top 16bits of va (from the remaining 64 bits )in
@@ -103,10 +108,10 @@
 #define MMU_FTRS_POWER4	MMU_FTRS_DEFAULT_HPTE_ARCH_V2
 #define MMU_FTRS_PPC970	MMU_FTRS_POWER4 | MMU_FTR_TLBIE_CROP_VA
 #define MMU_FTRS_POWER5	MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE
-#define MMU_FTRS_POWER6	MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE
-#define MMU_FTRS_POWER7	MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE
-#define MMU_FTRS_POWER8	MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE
-#define MMU_FTRS_POWER9	MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE
+#define MMU_FTRS_POWER6	MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO
+#define MMU_FTRS_POWER7	MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO
+#define MMU_FTRS_POWER8	MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO
+#define MMU_FTRS_POWER9	MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO
 #define MMU_FTRS_CELL  MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \
MMU_FTR_CI_LARGE_PAGE
 #define MMU_FTRS_PA6T  MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 5503078090cd..78dabf065ba9 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -193,8 +193,12 @@ unsigned long htab_convert_pte_flags(unsigned long 
pteflags)
/*
 * Kernel read only mapped with ppp bits 0b110
  

Re: [PATCH v4 1/7] powerpc/mm: update ptep_set_access_flag to not do full mm tlb flush

2016-11-23 Thread Michael Ellerman
"Aneesh Kumar K.V"  writes:
> diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h 
> b/arch/powerpc/include/asm/book3s/32/pgtable.h
> index 6b8b2d57fdc8..0713626e9189 100644
> --- a/arch/powerpc/include/asm/book3s/32/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
> @@ -224,7 +224,9 @@ static inline void huge_ptep_set_wrprotect(struct 
> mm_struct *mm,
>  
>  
>  static inline void __ptep_set_access_flags(struct mm_struct *mm,
> -pte_t *ptep, pte_t entry)
> +pte_t *ptep, pte_t entry,
> +unsigned long address,
> +unsigned long pg_sz)

I'd much rather you called these page_size, rather than pg_sz.

It's confusing enough that we have things called "psize" which are not
page sizes, but then adding pg_sz as well just makes it worse.

So can we stick with page_size for things that are page sizes (in
bytes), and psize for things that are MMU_PAGE_xx values.
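
For example (illustrative only):

	unsigned long page_size = 1UL << 24;	/* a size in bytes: 16MB */
	int psize = MMU_PAGE_16M;		/* an MMU_PAGE_xx value */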

cheers


Re: [RFC] [PATCH] Trace TLBIE's

2016-11-23 Thread Michael Ellerman
Balbir Singh  writes:

> Just a quick patch to trace tlbie(l)'s. The idea being that it can be
> enabled when we suspect corruption or when we need to see if we are doing
> the right thing during flush. I think the format can be enhanced to
> make it nicer (expand the RB/RS/IS/L cases in more detail). For now I am
> sharing the idea to get inputs
>
> A typical trace might look like this
>
>
> <...>-5141  [062]  1354.486693: tlbie:
>   tlbie with lpid 0, local 0, rb=7b5d0ff874f11f1, rs=0, ric=0 prs=0 r=0
> systemd-udevd-2584  [018]  1354.486772: tlbie:
>   tlbie with lpid 0, local 0, rb=17be1f421adc10c1, rs=0, ric=0 prs=0 r=0
> ...
>
> qemu-system-ppc-5371  [016]  1412.369519: tlbie:
>   tlbie with lpid 0, local 1, rb=67bd8900174c11c1, rs=0, ric=0 prs=0 r=0
> qemu-system-ppc-5377  [056]  1421.687262: tlbie:
>   tlbie with lpid 1, local 0, rb=5f04edffa00c11c1, rs=1, ric=0 prs=0 r=0

My first reaction is "why the hell do we have so many open-coded
tlbies". So step one might be to add a static inline helper, that way we
don't have to add the trace_tlbie() in so many places.
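
Something with roughly this shape, for instance (the name, arguments and
trace placement here are illustrative assumptions, not existing code):

	static inline void do_tlbie(unsigned long rb, unsigned long lpid)
	{
		asm volatile(PPC_TLBIE(%1,%0) : :
			     "r" (rb), "r" (lpid) : "memory");
		trace_tlbie(lpid, 0, rb, lpid, 0, 0, 0);
	}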

Also in some of them you call trace_tlbie() before the
eieio/tlbsync/ptesync. Which may not be wrong, but looks worrying at
first glance.

But overall I guess it's OK. We'd want to do a quick benchmark to make
sure it's not adding any overhead.

cheers


Re: [PATCH v3 3/3] powernv: Pass PSSCR value and mask to power9_idle_stop

2016-11-23 Thread Michael Ellerman
"Gautham R. Shenoy"  writes:

> From: "Gautham R. Shenoy" 
>
> The power9_idle_stop method currently takes only the requested stop
> level as a parameter and picks up the rest of the PSSCR bits from a
> hand-coded macro. This is not a very flexible design, especially when
> the firmware has the capability to communicate the psscr value and the
> mask associated with a particular stop state via device tree.
>
> This patch modifies the power9_idle_stop API to take as parameters the
> PSSCR value and the PSSCR mask corresponding to the stop state that
> needs to be set. These PSSCR value and mask are respectively obtained
> by parsing the "ibm,cpu-idle-state-psscr" and
> "ibm,cpu-idle-state-psscr-mask" fields from the device tree.
>
> In addition to this, the patch adds support for handling stop states
> for which ESL and EC bits in the PSSCR are zero. As per the
> architecture, a wakeup from these stop states resumes execution from
> the subsequent instruction as opposed to waking up at the System
> Vector.
>
> The older firmware sets only the Requested Level (RL) field in the
> psscr and psscr-mask exposed in the device tree. For older firmware
> where psscr-mask=0xf, this patch will set sane default values for the
> remaining PSSCR fields (i.e. PSLL, MTL, ESL, EC, and TR).

So we're using psscr-mask=0xf as a signal that we're running on old
firmware.

That's OK I think, but please send a patch to document it in the device
tree binding.

And call it out below in the code.
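
The value/mask pair described above amounts to a masked insert into the
current PSSCR, i.e. something like this (a sketch; the variable names
are assumptions):

	psscr = (psscr & ~psscr_mask) | (psscr_val & psscr_mask);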

cheers


Re: [PATCH v3 2/3] cpuidle:powernv: Add helper function to populate powernv idle states.

2016-11-23 Thread Michael Ellerman
"Gautham R. Shenoy"  writes:

> diff --git a/drivers/cpuidle/cpuidle-powernv.c 
> b/drivers/cpuidle/cpuidle-powernv.c
> index 7fe442c..9240e08 100644
> --- a/drivers/cpuidle/cpuidle-powernv.c
> +++ b/drivers/cpuidle/cpuidle-powernv.c
> @@ -243,28 +262,31 @@ static int powernv_add_idle_states(void)
>*/
>   if (latency_ns[i] > POWERNV_THRESHOLD_LATENCY_NS)
>   continue;
> + /*
> +  * Firmware passes residency and latency values in ns.
> +  * cpuidle expects it in us.
> +  */
> + exit_latency = ((unsigned int)latency_ns[i]) / 1000;
> + target_residency = (!rc) ? ((unsigned int)residency_ns[i]) : 0;
> + target_residency /= 1000;

Urk.

Can you just do it normally:

if (rc == 0)
target_residency = (unsigned int)residency_ns[i] / 1000;

I also don't see why you need the cast?

cheers


Re: [mm v2 0/3] Support memory cgroup hotplug

2016-11-23 Thread Michal Hocko
On Wed 23-11-16 19:37:16, Balbir Singh wrote:
> 
> 
> On 23/11/16 19:07, Michal Hocko wrote:
> > On Wed 23-11-16 18:50:42, Balbir Singh wrote:
> >>
> >>
> >> On 23/11/16 18:25, Michal Hocko wrote:
> >>> On Wed 23-11-16 15:36:51, Balbir Singh wrote:
>  In the absence of hotplug we use extra memory proportional to
>  (possible_nodes - online_nodes) * number_of_cgroups. PPC64 has a patch
>  to disable large consumption with large number of cgroups. This patch
>  adds hotplug support to memory cgroups and reverts the commit that
>  limited possible nodes to online nodes.
> >>>
> >>> Balbir,
> >>> I have asked this in the previous version but there still seems to be a
> >>> lack of information of _why_ do we want this, _how_ much do we save on
> >>> the memory overhead on most systems and _why_ the additional complexity
> >>> is really worth it. Please make sure to add all this in the cover
> >>> letter.
> >>>
> >>
> >> The data is in the patch referred to in patch 3. The order of waste was
> >> 200MB for 400 cgroup directories enough for us to restrict possible_map
> >> to online_map. These patches allow us to have a larger possible map and
> >> allow onlining nodes not in the online_map, which is currently a 
> >> restriction
> >> on ppc64.
> > 
> > How common is to have possible_map >> online_map? If this is ppc64 then
> > what is the downside of keeping the current restriction instead?
> > 
> 
> On my system CONFIG_NODE_SHIFT is 8, 256 nodes and possible_nodes are 2
> The downside is the ability to hotplug and online an offline node.
> Please see http://www.spinics.net/lists/linux-mm/msg116724.html

OK, so we are slowly getting to what I've asked originally ;) So who
cares? Depending on CONFIG_NODE_SHIFT (which tends to be quite large in
distribution or other general purpose kernels) the overhead is 424B (as
per pahole on the current kernel) for one numa node. Most machines are
to be expected 1-4 numa nodes so the overhead might be somewhere around
100K per memcg (with 256 possible nodes). Not trivial amount for sure
but I would rather encourage people to lower the possible node count for
their hardware if it is artificially large.
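
(Worked through: 424 bytes/node * 256 possible nodes is roughly 106KB
per memcg, so the 400 cgroup directories mentioned above would pin
around 42MB for nodes that may never be onlined.)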

> >> A typical system that I use has about 100-150 directories, depending on the
> >> number of users/docker instances/configuration/virtual machines. These 
> >> numbers
> >> will only grow as we pack more of these instances on them.
> >>
> >> From a complexity view point, the patches are quite straight forward.
> > 
> > Well, I would like to hear more about that. {get,put}_online_memory
> > at random places doesn't sound all that straightforward to me.
> > 
> 
> I thought those places were not random :) I tried to think them out as
> discussed with Vladimir. I don't claim the code is bug free, we can fix
> any bugs as we test this more.

I am more worried about synchronization with the hotplug which tends to
be a PITA in places where we were simply safe by definition until now. We
do not have all that many users of memcg->nodeinfo[nid] from what I can see,
but are all of them safe against ever racing with the hotplug? A lack of a
high-level design description is less than encouraging. So please try to
spend some time describing how do we use nodeinfo currently and how is
the synchronization with the hotplug supposed to work and what
guarantees that no stale nodinfos can be ever used. This is just too
easy to get wrong...
-- 
Michal Hocko
SUSE Labs


Re: [PATCH v10 04/10] kexec_file: Add support for purgatory built as PIE.

2016-11-23 Thread Dave Young
On 11/21/16 at 09:49pm, Thiago Jung Bauermann wrote:
> Hello Dave,
> 
> Thanks for your review.
> 
> Am Sonntag, 20. November 2016, 10:45:46 BRST schrieb Dave Young:
> > On 11/10/16 at 01:27am, Thiago Jung Bauermann wrote:
> > > powerpc's purgatory.ro has 12 relocation types when built as
> > > a relocatable object. To implement support for them requires
> > > arch_kexec_apply_relocations_add to duplicate a lot of code with
> > > module_64.c:apply_relocate_add.
> > > 
> > > When built as a Position Independent Executable there are only 4
> > > relocation types in purgatory.ro, so it becomes practical for the powerpc
> > > implementation of kexec_file to have its own relocation implementation.
> > > 
> > > Also, the purgatory is an executable and not an intermediary output from
> > > the compiler so it makes sense conceptually that it is easier to build
> > > it as a PIE than as a partially linked object.
> > > 
> > > Apart from the greatly reduced number of relocations, there are two
> > > differences between a relocatable object and a PIE:
> > > 
> > > 1. __kexec_load_purgatory needs to use the program headers rather than the
> > > 
> > >section headers to figure out how to load the binary.
> > > 
> > > 2. Symbol values are absolute addresses instead of relative to the
> > > 
> > >start of the section.
> > > 
> > > This patch adds the support needed in generic code for the differences
> > > above and allows powerpc to load and relocate a position independent
> > > purgatory.
> > 
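Difference 1 above amounts to walking the program headers rather than
the section headers; schematically, using the standard ELF types (an
illustration only, not the actual kexec_file loader):

	const Elf64_Phdr *phdr = (const void *)((const char *)ehdr + ehdr->e_phoff);
	int i;

	for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
		if (phdr->p_type != PT_LOAD)
			continue;
		/* copy p_filesz bytes from file offset p_offset to the
		 * segment's load address, then zero the remaining
		 * p_memsz - p_filesz bytes (the bss) */
	}
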
> > [snip]
> > 
> > The kexec-tools machine_apply_elf_rel is pretty simple for ppc64, it is
> > not that complex. So could you look into simplify your kexec_file
> > implementation?
> 
> I can try, but there is one fundamental issue here: powerpc 
> position-dependent 
> code relies more on relocations than x86 position-dependent code does, so 
> there's a limit to how simple it can be made without switching to position-
> independent code. And it will always be more involved than it is on x86.
> 
> BTW, building x86's purgatory as PIE results in it not having any relocation 
> at all, so it's an advantage even in that architecture. Unfortunately, the 
> machine locks up during reboot and I didn't have time to try to figure out 
> what's going on.
> 
> > kernel/kexec_file.c kexec_apply_relocations only do limited things
> > and some of the logic is in arch/x86, so move general code out of arch
> > code, then I guess the arch code will be simpler
> 
> I agree that is a good idea. Is the patch below what you had in mind?
> 
> > and then we probably do not need this PIE stuff anymore.
> 
> If you are ok with the patch below I can post a new version of the series 
> based on it and we can see if Michael Ellerman thinks it is enough.
> 
> > BTW, __kexec_really_load_purgatory looks worse than
> > ___kexec_load_purgatory ;)
> 
> Really? I find the special handling of bss makes the section-based loader a 
> bit more confusing.
> 
> -- 
> Thiago Jung Bauermann
> IBM Linux Technology Center
> 
> 
> Subject: [PATCH] kexec_file: Move generic relocation code from arch/x86 to
>  kernel/kexec_file.c
> 
> The check for undefined symbols stays in arch-specific code because
> powerpc needs to allow TOC symbols to be processed even though they're
> undefined.
> 
> There is no functional change.
> 
> Suggested-by: Dave Young 
> Signed-off-by: Thiago Jung Bauermann 
> ---
>  arch/x86/kernel/machine_kexec_64.c | 160 
> +++--
>  include/linux/kexec.h  |   9 ++-
>  kernel/kexec_file.c| 120 +++-
>  3 files changed, 154 insertions(+), 135 deletions(-)
> 
> diff --git a/arch/x86/kernel/machine_kexec_64.c 
> b/arch/x86/kernel/machine_kexec_64.c
> index 8c1f218926d7..f4860c408ece 100644
> --- a/arch/x86/kernel/machine_kexec_64.c
> +++ b/arch/x86/kernel/machine_kexec_64.c
> @@ -401,143 +401,45 @@ int arch_kexec_kernel_verify_sig(struct kimage *image, 
> void *kernel,
>  }
>  #endif
>  
> -/*
> - * Apply purgatory relocations.
> - *
> - * ehdr: Pointer to elf headers
> - * sechdrs: Pointer to section headers.
> - * relsec: section index of SHT_RELA section.
> - *
> - * TODO: Some of the code belongs to generic code. Move that in kexec.c.
> - */
> -int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr,
> -  Elf64_Shdr *sechdrs, unsigned int relsec)
> +int arch_kexec_apply_relocation_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
> + unsigned int reltype, Elf_Sym *sym,
> + const char *name, unsigned long *location,
> + unsigned long address, unsigned long value)
>  {
> - unsigned int i;
> - Elf64_Rela *rel;
> - Elf64_Sym *sym;
> - void *location;
> - Elf64_Shdr *section, *symtabsec;
> - unsigned long address, sec_base, value;
> - const char *strtab, *name, *shstrtab;

Re: [mm v2 0/3] Support memory cgroup hotplug

2016-11-23 Thread Balbir Singh


On 23/11/16 19:07, Michal Hocko wrote:
> On Wed 23-11-16 18:50:42, Balbir Singh wrote:
>>
>>
>> On 23/11/16 18:25, Michal Hocko wrote:
>>> On Wed 23-11-16 15:36:51, Balbir Singh wrote:
 In the absence of hotplug we use extra memory proportional to
 (possible_nodes - online_nodes) * number_of_cgroups. PPC64 has a patch
 to disable large consumption with large number of cgroups. This patch
 adds hotplug support to memory cgroups and reverts the commit that
 limited possible nodes to online nodes.
>>>
>>> Balbir,
>>> I have asked this in the previous version but there still seems to be a
>>> lack of information of _why_ do we want this, _how_ much do we save on
>>> the memory overhead on most systems and _why_ the additional complexity
>>> is really worth it. Please make sure to add all this in the cover
>>> letter.
>>>
>>
>> The data is in the patch referred to in patch 3. The order of waste was
>> 200MB for 400 cgroup directories enough for us to restrict possible_map
>> to online_map. These patches allow us to have a larger possible map and
>> allow onlining nodes not in the online_map, which is currently a restriction
>> on ppc64.
> 
> How common is to have possible_map >> online_map? If this is ppc64 then
> what is the downside of keeping the current restriction instead?
> 

On my system CONFIG_NODE_SHIFT is 8, 256 nodes and possible_nodes are 2
The downside is the ability to hotplug and online an offline node.
Please see http://www.spinics.net/lists/linux-mm/msg116724.html

>> A typical system that I use has about 100-150 directories, depending on the
>> number of users/docker instances/configuration/virtual machines. These 
>> numbers
>> will only grow as we pack more of these instances on them.
>>
>> From a complexity view point, the patches are quite straight forward.
> 
> Well, I would like to hear more about that. {get,put}_online_memory
> at random places doesn't sound all that straightforward to me.
> 

I thought those places were not random :) I tried to think them out as
discussed with Vladimir. I don't claim the code is bug free, we can fix
any bugs as we test this more.

Balbir Singh.


Re: [mm v2 0/3] Support memory cgroup hotplug

2016-11-23 Thread Michal Hocko
On Wed 23-11-16 18:50:42, Balbir Singh wrote:
> 
> 
> On 23/11/16 18:25, Michal Hocko wrote:
> > On Wed 23-11-16 15:36:51, Balbir Singh wrote:
> >> In the absence of hotplug we use extra memory proportional to
> >> (possible_nodes - online_nodes) * number_of_cgroups. PPC64 has a patch
> >> to disable large consumption with large number of cgroups. This patch
> >> adds hotplug support to memory cgroups and reverts the commit that
> >> limited possible nodes to online nodes.
> > 
> > Balbir,
> > I have asked this in the previous version but there still seems to be a
> > lack of information of _why_ do we want this, _how_ much do we save on
> > the memory overhead on most systems and _why_ the additional complexity
> > is really worth it. Please make sure to add all this in the cover
> > letter.
> > 
> 
> The data is in the patch referred to in patch 3. The order of waste was
> 200MB for 400 cgroup directories enough for us to restrict possible_map
> to online_map. These patches allow us to have a larger possible map and
> allow onlining nodes not in the online_map, which is currently a restriction
> on ppc64.

How common is to have possible_map >> online_map? If this is ppc64 then
what is the downside of keeping the current restriction instead?

> A typical system that I use has about 100-150 directories, depending on the
> number of users/docker instances/configuration/virtual machines. These numbers
> will only grow as we pack more of these instances on them.
> 
> From a complexity view point, the patches are quite straight forward.

Well, I would like to hear more about that. {get,put}_online_memory
at random places doesn't sound all that straightforward to me.

-- 
Michal Hocko
SUSE Labs


[RFC] [PATCH] Trace TLBIE's

2016-11-23 Thread Balbir Singh

Just a quick patch to trace tlbie(l)'s. The idea being that it can be
enabled when we suspect corruption or when we need to see if we are doing
the right thing during flush. I think the format can be enhanced to
make it nicer (expand the RB/RS/IS/L cases in more detail). For now I am
sharing the idea to get inputs.

A typical trace might look like this


<...>-5141  [062]  1354.486693: tlbie:
tlbie with lpid 0, local 0, rb=7b5d0ff874f11f1, rs=0, ric=0 prs=0 r=0
systemd-udevd-2584  [018]  1354.486772: tlbie:
tlbie with lpid 0, local 0, rb=17be1f421adc10c1, rs=0, ric=0 prs=0 r=0
...

qemu-system-ppc-5371  [016]  1412.369519: tlbie:
tlbie with lpid 0, local 1, rb=67bd8900174c11c1, rs=0, ric=0 prs=0 r=0
qemu-system-ppc-5377  [056]  1421.687262: tlbie:
tlbie with lpid 1, local 0, rb=5f04edffa00c11c1, rs=1, ric=0 prs=0 r=0
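
(Assuming the event registers under the existing "powerpc" trace system,
it can be toggled at runtime via
/sys/kernel/debug/tracing/events/powerpc/tlbie/enable.)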


Signed-off-by: Balbir Singh 


diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h
index 32e36b1..b4e02ba 100644
--- a/arch/powerpc/include/asm/trace.h
+++ b/arch/powerpc/include/asm/trace.h
@@ -168,6 +168,38 @@ TRACE_EVENT(hash_fault,
  __entry->addr, __entry->access, __entry->trap)
 );
 
+
+TRACE_EVENT(tlbie,
+
+   TP_PROTO(unsigned long lpid, unsigned long local, unsigned long rb,
+   unsigned long rs, unsigned long ric, unsigned long prs,
+   unsigned long r),
+   TP_ARGS(lpid, local, rb, rs, ric, prs, r),
+   TP_STRUCT__entry(
+   __field(unsigned long, lpid)
+   __field(unsigned long, local)
+   __field(unsigned long, rb)
+   __field(unsigned long, rs)
+   __field(unsigned long, ric)
+   __field(unsigned long, prs)
+   __field(unsigned long, r)
+   ),
+
+   TP_fast_assign(
+   __entry->lpid = lpid;
+   __entry->local = local;
+   __entry->rb = rb;
+   __entry->rs = rs;
+   __entry->ric = ric;
+   __entry->prs = prs;
+   __entry->r = r;
+   ),
+
+   TP_printk("lpid=%ld, local=%ld, rb=0x%lx, rs=0x%lx, ric=0x%lx, "
+   "prs=0x%lx, r=0x%lx", __entry->lpid, __entry->local, 
__entry->rb,
+   __entry->rs, __entry->ric, __entry->prs, __entry->r)
+);
+
 #endif /* _TRACE_POWERPC_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 99b4e9d..5b032e9 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -15,6 +15,7 @@
 #include 
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -414,16 +415,22 @@ static void do_tlbies(struct kvm *kvm, unsigned long 
*rbvalues,
cpu_relax();
if (need_sync)
asm volatile("ptesync" : : : "memory");
-   for (i = 0; i < npages; ++i)
+   for (i = 0; i < npages; ++i) {
asm volatile(PPC_TLBIE(%1,%0) : :
 "r" (rbvalues[i]), "r" (kvm->arch.lpid));
+   trace_tlbie(kvm->arch.lpid, 0, rbvalues[i],
+   kvm->arch.lpid, 0, 0, 0);
+   }
asm volatile("eieio; tlbsync; ptesync" : : : "memory");
kvm->arch.tlbie_lock = 0;
} else {
if (need_sync)
asm volatile("ptesync" : : : "memory");
-   for (i = 0; i < npages; ++i)
+   for (i = 0; i < npages; ++i) {
asm volatile("tlbiel %0" : : "r" (rbvalues[i]));
+   trace_tlbie(kvm->arch.lpid, 1, rbvalues[i],
+   0, 0, 0, 0);
+   }
asm volatile("ptesync" : : : "memory");
}
 }
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 9d9b3ef..a31ff2b 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -98,6 +99,7 @@ static inline void __tlbie(unsigned long vpn, int psize, int 
apsize, int ssize)
 : "memory");
break;
}
+   trace_tlbie(0, 0, va, 0, 0, 0, 0);
 }
 
 static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int 
ssize)
@@ -147,6 +149,7 @@ static inline void __tlbiel(unsigned long vpn, int psize, 
int apsize, int ssize)
 : "memory");
break;
}
+   trace_tlbie(0, 1, va, 0, 0, 0, 0);
 
 }
 
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index bda8c43..933b373 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -16,6 +16,7 @@
 
 #include 
 #include 
+#include 
 
 static DEFINE_RAW_SPINLOCK(native_tlbie_lock);