Re: [PATCH v8 3/6] Uprobes: Support SDT markers having reference count (semaphore)

2018-08-12 Thread Ravi Bangoria
Hi Song,

On 08/11/2018 01:27 PM, Song Liu wrote:
>> +
>> +static void delayed_uprobe_delete(struct delayed_uprobe *du)
>> +{
>> +   if (!du)
>> +   return;
> Do we really need this check?


It is not strictly necessary, but I would still like to keep it for safety.


> 
>> +   list_del(&du->list);
>> +   kfree(du);
>> +}
>> +
>> +static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct 
>> *mm)
>> +{
>> +   struct list_head *pos, *q;
>> +   struct delayed_uprobe *du;
>> +
>> +   if (!uprobe && !mm)
>> +   return;
> And do we really need this check?


Yes. delayed_uprobe_remove(uprobe=NULL, mm=NULL) is an invalid case. If I remove
this check, code below (or more accurately code suggested by Oleg) will remove
all entries from delayed_uprobe_list. So I will keep this check but put a 
comment
above function.


[...]
>> +
>> +   ret = get_user_pages_remote(NULL, mm, vaddr, 1,
>> +   FOLL_WRITE, &page, &vma, NULL);
>> +   if (unlikely(ret <= 0)) {
>> +   /*
>> +* We are asking for 1 page. If get_user_pages_remote() 
>> fails,
>> +* it may return 0, in that case we have to return error.
>> +*/
>> +   ret = (ret == 0) ? -EBUSY : ret;
>> +   pr_warn("Failed to %s ref_ctr. (%d)\n",
>> +   d > 0 ? "increment" : "decrement", ret);
> This warning is not really useful. Seems this function has little information
> about which uprobe is failing here. Maybe we only need warning in the caller
> (or caller of caller).


Sure, I can move this warning to caller of this function but what are the
exact fields you would like to print with warning? Something like this is
fine?

pr_warn("ref_ctr %s failed for 0x%lx, 0x%lx, 0x%lx, 0x%p",
d > 0 ? "increment" : "decrement", inode->i_ino,
offset, ref_ctr_offset, mm);

More importantly, the reason I didn't print more info is because dmesg is
accessible to unprivileged users in many distros but uprobes are not. So
printing this information may be a security violation. No?


> 
>> +   return ret;
>> +   }
>> +
>> +   kaddr = kmap_atomic(page);
>> +   ptr = kaddr + (vaddr & ~PAGE_MASK);
>> +
>> +   if (unlikely(*ptr + d < 0)) {
>> +   pr_warn("ref_ctr going negative. vaddr: 0x%lx, "
>> +   "curr val: %d, delta: %d\n", vaddr, *ptr, d);
>> +   ret = -EINVAL;
>> +   goto out;
>> +   }
>> +
>> +   *ptr += d;
>> +   ret = 0;
>> +out:
>> +   kunmap_atomic(kaddr);
>> +   put_page(page);
>> +   return ret;
>> +}
>> +
>> +static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
>> + bool is_register)
> What's the reason of bool is_register here vs. short d in __update_ref_ctr()?
> Can we use short for both?


Yes, I can use short as well.


> 
>> +{
>> +   struct vm_area_struct *rc_vma;
>> +   unsigned long rc_vaddr;
>> +   int ret = 0;
>> +
>> +   rc_vma = find_ref_ctr_vma(uprobe, mm);
>> +
>> +   if (rc_vma) {
>> +   rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset);
>> +   ret = __update_ref_ctr(mm, rc_vaddr, is_register ? 1 : -1);
>> +
>> +   if (is_register)
>> +   return ret;
>> +   }
> Mixing __update_ref_ctr() here and delayed_uprobe_add() in the same
> function is a little confusing (at least for me). How about we always use
> delayed uprobe for uprobe_mmap() and use non-delayed in other case(s)?


No. delayed_uprobe_add() is needed for uprobe_register() case to handle race
between uprobe_register() and process creation.


[...]
>>
>> +static int delayed_uprobe_install(struct vm_area_struct *vma)
> This function name is confusing. How about we call it delayed_ref_ctr_incr() 
> or
> something similar? Also, we should add comments to highlight that this vma is 
> not
> the vma containing the uprobe, but the vma containing the ref_ctr.


Sure, I'll do that.


> 
>> +{
>> +   struct list_head *pos, *q;
>> +   struct delayed_uprobe *du;
>> +   unsigned long vaddr;
>> +   int ret = 0, err = 0;
>> +
>> +   mutex_lock(&delayed_uprobe_lock);
>> +   list_for_each_safe(pos, q, &delayed_uprobe_list) {
>> +   du = list_entry(pos, struct delayed_uprobe, list);
>> +
>> +   if (!valid_ref_ctr_vma(du->uprobe, vma))
>> +   continue;
>> +
>> +   vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset);
>> +   ret = __update_ref_ctr(vma->vm_mm, vaddr, 1);
>> +   /* Record an error and continue. */
>> +   if (ret && !err)
>> +   err = ret;
> I think this is a good place (when ret != 0) to call pr_warn(). I guess we can
> print which mm get error for which uprobe (inode+offset).


__update_ref_ctr() is already printing warning, so I didn't add anything 

Re: [PATCH v8 3/6] Uprobes: Support SDT markers having reference count (semaphore)

2018-08-12 Thread Ravi Bangoria
Hi Song,

On 08/11/2018 01:27 PM, Song Liu wrote:
>> +
>> +static void delayed_uprobe_delete(struct delayed_uprobe *du)
>> +{
>> +   if (!du)
>> +   return;
> Do we really need this check?


It is not strictly necessary, but I would still like to keep it for safety.


> 
>> +   list_del(&du->list);
>> +   kfree(du);
>> +}
>> +
>> +static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct 
>> *mm)
>> +{
>> +   struct list_head *pos, *q;
>> +   struct delayed_uprobe *du;
>> +
>> +   if (!uprobe && !mm)
>> +   return;
> And do we really need this check?


Yes. delayed_uprobe_remove(uprobe=NULL, mm=NULL) is an invalid case. If I remove
this check, code below (or more accurately code suggested by Oleg) will remove
all entries from delayed_uprobe_list. So I will keep this check but put a 
comment
above function.


[...]
>> +
>> +   ret = get_user_pages_remote(NULL, mm, vaddr, 1,
>> +   FOLL_WRITE, &page, &vma, NULL);
>> +   if (unlikely(ret <= 0)) {
>> +   /*
>> +* We are asking for 1 page. If get_user_pages_remote() 
>> fails,
>> +* it may return 0, in that case we have to return error.
>> +*/
>> +   ret = (ret == 0) ? -EBUSY : ret;
>> +   pr_warn("Failed to %s ref_ctr. (%d)\n",
>> +   d > 0 ? "increment" : "decrement", ret);
> This warning is not really useful. Seems this function has little information
> about which uprobe is failing here. Maybe we only need warning in the caller
> (or caller of caller).


Sure, I can move this warning to caller of this function but what are the
exact fields you would like to print with warning? Something like this is
fine?

pr_warn("ref_ctr %s failed for 0x%lx, 0x%lx, 0x%lx, 0x%p",
d > 0 ? "increment" : "decrement", inode->i_ino,
offset, ref_ctr_offset, mm);

More importantly, the reason I didn't print more info is because dmesg is
accessible to unprivileged users in many distros but uprobes are not. So
printing this information may be a security violation. No?


> 
>> +   return ret;
>> +   }
>> +
>> +   kaddr = kmap_atomic(page);
>> +   ptr = kaddr + (vaddr & ~PAGE_MASK);
>> +
>> +   if (unlikely(*ptr + d < 0)) {
>> +   pr_warn("ref_ctr going negative. vaddr: 0x%lx, "
>> +   "curr val: %d, delta: %d\n", vaddr, *ptr, d);
>> +   ret = -EINVAL;
>> +   goto out;
>> +   }
>> +
>> +   *ptr += d;
>> +   ret = 0;
>> +out:
>> +   kunmap_atomic(kaddr);
>> +   put_page(page);
>> +   return ret;
>> +}
>> +
>> +static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
>> + bool is_register)
> What's the reason of bool is_register here vs. short d in __update_ref_ctr()?
> Can we use short for both?


Yes, I can use short as well.


> 
>> +{
>> +   struct vm_area_struct *rc_vma;
>> +   unsigned long rc_vaddr;
>> +   int ret = 0;
>> +
>> +   rc_vma = find_ref_ctr_vma(uprobe, mm);
>> +
>> +   if (rc_vma) {
>> +   rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset);
>> +   ret = __update_ref_ctr(mm, rc_vaddr, is_register ? 1 : -1);
>> +
>> +   if (is_register)
>> +   return ret;
>> +   }
> Mixing __update_ref_ctr() here and delayed_uprobe_add() in the same
> function is a little confusing (at least for me). How about we always use
> delayed uprobe for uprobe_mmap() and use non-delayed in other case(s)?


No. delayed_uprobe_add() is needed for uprobe_register() case to handle race
between uprobe_register() and process creation.


[...]
>>
>> +static int delayed_uprobe_install(struct vm_area_struct *vma)
> This function name is confusing. How about we call it delayed_ref_ctr_incr() 
> or
> something similar? Also, we should add comments to highlight that this vma is 
> not
> the vma containing the uprobe, but the vma containing the ref_ctr.


Sure, I'll do that.


> 
>> +{
>> +   struct list_head *pos, *q;
>> +   struct delayed_uprobe *du;
>> +   unsigned long vaddr;
>> +   int ret = 0, err = 0;
>> +
>> +   mutex_lock(&delayed_uprobe_lock);
>> +   list_for_each_safe(pos, q, &delayed_uprobe_list) {
>> +   du = list_entry(pos, struct delayed_uprobe, list);
>> +
>> +   if (!valid_ref_ctr_vma(du->uprobe, vma))
>> +   continue;
>> +
>> +   vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset);
>> +   ret = __update_ref_ctr(vma->vm_mm, vaddr, 1);
>> +   /* Record an error and continue. */
>> +   if (ret && !err)
>> +   err = ret;
> I think this is a good place (when ret != 0) to call pr_warn(). I guess we can
> print which mm get error for which uprobe (inode+offset).


__update_ref_ctr() is already printing warning, so I didn't add anything 

[PATCH] nds32: Fix empty call trace

2018-08-12 Thread Zong Li
The compiler's predefined macro 'NDS32_ABI_2' has been removed; the code should
test '__NDS32_ABI_2' here instead.

Signed-off-by: Zong Li 
---
 arch/nds32/kernel/traps.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/nds32/kernel/traps.c b/arch/nds32/kernel/traps.c
index a6205fd..f0e9743 100644
--- a/arch/nds32/kernel/traps.c
+++ b/arch/nds32/kernel/traps.c
@@ -137,7 +137,7 @@ static void __dump(struct task_struct *tsk, unsigned long 
*base_reg)
   !((unsigned long)base_reg & 0x3) &&
   ((unsigned long)base_reg >= TASK_SIZE)) {
unsigned long next_fp;
-#if !defined(NDS32_ABI_2)
+#if !defined(__NDS32_ABI_2)
ret_addr = base_reg[0];
next_fp = base_reg[1];
 #else
-- 
2.7.4



[PATCH] nds32: Fix empty call trace

2018-08-12 Thread Zong Li
The compiler's predefined macro 'NDS32_ABI_2' has been removed; the code should
test '__NDS32_ABI_2' here instead.

Signed-off-by: Zong Li 
---
 arch/nds32/kernel/traps.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/nds32/kernel/traps.c b/arch/nds32/kernel/traps.c
index a6205fd..f0e9743 100644
--- a/arch/nds32/kernel/traps.c
+++ b/arch/nds32/kernel/traps.c
@@ -137,7 +137,7 @@ static void __dump(struct task_struct *tsk, unsigned long 
*base_reg)
   !((unsigned long)base_reg & 0x3) &&
   ((unsigned long)base_reg >= TASK_SIZE)) {
unsigned long next_fp;
-#if !defined(NDS32_ABI_2)
+#if !defined(__NDS32_ABI_2)
ret_addr = base_reg[0];
next_fp = base_reg[1];
 #else
-- 
2.7.4



Re: [PATCH v2 1/3] dt-bindings: interrupt-controller: Actions external interrupt controller

2018-08-12 Thread Manivannan Sadhasivam
Hi Parthiban,

On Sun, Aug 12, 2018 at 02:22:13PM +0200, Parthiban Nallathambi wrote:
> Actions Semi OWL family SoC's provides support for external interrupt
> controller to be connected and controlled using SIRQ pins. S500, S700
> and S900 provides 3 SIRQ lines and works independently for 3 external
> interrupt controllers.
> 
> Signed-off-by: Parthiban Nallathambi 
> Signed-off-by: Saravanan Sekar 
> ---
>  .../interrupt-controller/actions,owl-sirq.txt  | 46 
> ++
>  1 file changed, 46 insertions(+)
>  create mode 100644 
> Documentation/devicetree/bindings/interrupt-controller/actions,owl-sirq.txt
> 
> diff --git 
> a/Documentation/devicetree/bindings/interrupt-controller/actions,owl-sirq.txt 
> b/Documentation/devicetree/bindings/interrupt-controller/actions,owl-sirq.txt
> new file mode 100644
> index 000000000000..4b8437751331
> --- /dev/null
> +++ 
> b/Documentation/devicetree/bindings/interrupt-controller/actions,owl-sirq.txt
> @@ -0,0 +1,46 @@
> +Actions Semi Owl SoCs SIRQ interrupt controller
> +
> +S500, S700 and S900 SoC's from Actions provides 3 SPI's from GIC,
> +in which external interrupt controller can be connected. 3 SPI's
> +45, 46, 47 from GIC are directly exposed as SIRQ. It has
> +the following properties:

We should really document the driver here. What it does? and how the
hierarchy is handled with GIC? etc...

> +
> +- inputs three interrupt signal from external interrupt controller
> +
> +Required properties:
> +
> +- compatible: should be "actions,owl-sirq"
> +- reg: physical base address of the controller and length of memory mapped.

...length of memory mapped region?

> +- interrupt-controller: identifies the node as an interrupt controller
> +- #interrupt-cells: specifies the number of cells needed to encode an 
> interrupt
> +  source, should be 2.
> +- actions,sirq-shared-reg: Applicable for S500 and S700 where SIRQ register
> +  details are maintained at same offset/register.
> +- actions,sirq-offset: register offset for SIRQ interrupts. When registers 
> are
> +  shared, all the three offsets will be same (S500 and S700).
> +- actions,sirq-clk-sel: external interrupt controller can be either
> +  connected to 32Khz or 24Mhz external/internal clock. This needs

Hertz should be specified as Hz.

> +  to be configured for per SIRQ line. Failing defaults to 32Khz clock.

What value needs to be specified for selecting 24MHz clock? You should
mention the available options this property supports.

> +
> +Example for S900:
> +
> +sirq: interrupt-controller@e01b {
> + compatible = "actions,owl-sirq";
> + reg = <0 0xe01b 0 0x1000>;

could be: reg = <0x0 0xe01b 0x0 0x1000>;

> + interrupt-controller;
> + #interrupt-cells = <2>;
> + actions,sirq-clk-sel = <0 0 0>;
> + actions,sirq-offset = <0x200 0x528 0x52c>;
> +};
> +
> +Example for S500 and S700:
> +
> +sirq: interrupt-controller@e01b {
> + compatible = "actions,owl-sirq";
> + reg = <0 0xe01b 0 0x1000>;

For S500, reg base is 0xb01b.

Thanks
Mani

> + interrupt-controller;
> + #interrupt-cells = <2>;
> + actions,sirq-shared-reg;
> + actions,sirq-clk-sel = <0 0 0>;
> + actions,sirq-offset = <0x200 0x200 0x200>;
> +};
> -- 
> 2.14.4
> 


Re: [PATCH v2 1/3] dt-bindings: interrupt-controller: Actions external interrupt controller

2018-08-12 Thread Manivannan Sadhasivam
Hi Parthiban,

On Sun, Aug 12, 2018 at 02:22:13PM +0200, Parthiban Nallathambi wrote:
> Actions Semi OWL family SoC's provides support for external interrupt
> controller to be connected and controlled using SIRQ pins. S500, S700
> and S900 provides 3 SIRQ lines and works independently for 3 external
> interrupt controllers.
> 
> Signed-off-by: Parthiban Nallathambi 
> Signed-off-by: Saravanan Sekar 
> ---
>  .../interrupt-controller/actions,owl-sirq.txt  | 46 
> ++
>  1 file changed, 46 insertions(+)
>  create mode 100644 
> Documentation/devicetree/bindings/interrupt-controller/actions,owl-sirq.txt
> 
> diff --git 
> a/Documentation/devicetree/bindings/interrupt-controller/actions,owl-sirq.txt 
> b/Documentation/devicetree/bindings/interrupt-controller/actions,owl-sirq.txt
> new file mode 100644
> index 000000000000..4b8437751331
> --- /dev/null
> +++ 
> b/Documentation/devicetree/bindings/interrupt-controller/actions,owl-sirq.txt
> @@ -0,0 +1,46 @@
> +Actions Semi Owl SoCs SIRQ interrupt controller
> +
> +S500, S700 and S900 SoC's from Actions provides 3 SPI's from GIC,
> +in which external interrupt controller can be connected. 3 SPI's
> +45, 46, 47 from GIC are directly exposed as SIRQ. It has
> +the following properties:

We should really document the driver here. What it does? and how the
hierarchy is handled with GIC? etc...

> +
> +- inputs three interrupt signal from external interrupt controller
> +
> +Required properties:
> +
> +- compatible: should be "actions,owl-sirq"
> +- reg: physical base address of the controller and length of memory mapped.

...length of memory mapped region?

> +- interrupt-controller: identifies the node as an interrupt controller
> +- #interrupt-cells: specifies the number of cells needed to encode an 
> interrupt
> +  source, should be 2.
> +- actions,sirq-shared-reg: Applicable for S500 and S700 where SIRQ register
> +  details are maintained at same offset/register.
> +- actions,sirq-offset: register offset for SIRQ interrupts. When registers 
> are
> +  shared, all the three offsets will be same (S500 and S700).
> +- actions,sirq-clk-sel: external interrupt controller can be either
> +  connected to 32Khz or 24Mhz external/internal clock. This needs

Hertz should be specified as Hz.

> +  to be configured for per SIRQ line. Failing defaults to 32Khz clock.

What value needs to be specified for selecting 24MHz clock? You should
mention the available options this property supports.

> +
> +Example for S900:
> +
> +sirq: interrupt-controller@e01b {
> + compatible = "actions,owl-sirq";
> + reg = <0 0xe01b 0 0x1000>;

could be: reg = <0x0 0xe01b 0x0 0x1000>;

> + interrupt-controller;
> + #interrupt-cells = <2>;
> + actions,sirq-clk-sel = <0 0 0>;
> + actions,sirq-offset = <0x200 0x528 0x52c>;
> +};
> +
> +Example for S500 and S700:
> +
> +sirq: interrupt-controller@e01b {
> + compatible = "actions,owl-sirq";
> + reg = <0 0xe01b 0 0x1000>;

For S500, reg base is 0xb01b.

Thanks
Mani

> + interrupt-controller;
> + #interrupt-cells = <2>;
> + actions,sirq-shared-reg;
> + actions,sirq-clk-sel = <0 0 0>;
> + actions,sirq-offset = <0x200 0x200 0x200>;
> +};
> -- 
> 2.14.4
> 


Re: [PATCH] mm: migration: fix migration of huge PMD shared pages

2018-08-12 Thread kbuild test robot
Hi Mike,

I love your patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v4.18 next-20180810]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Mike-Kravetz/mm-migration-fix-migration-of-huge-PMD-shared-pages/20180813-114549
config: i386-tinyconfig (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
# save the attached .config to linux build tree
make ARCH=i386 

All errors (new ones prefixed by >>):

   mm/rmap.c: In function 'try_to_unmap_one':
>> mm/rmap.c:1425:7: error: implicit declaration of function 
>> 'huge_pmd_unshare'; did you mean 'do_huge_pmd_wp_page'? 
>> [-Werror=implicit-function-declaration]
  huge_pmd_unshare(mm, &address, pvmw.pte)) {
  ^~~~
  do_huge_pmd_wp_page
   cc1: some warnings being treated as errors

vim +1425 mm/rmap.c

  1382  
  1383  /*
  1384   * If the page is mlock()d, we cannot swap it out.
  1385   * If it's recently referenced (perhaps page_referenced
  1386   * skipped over this mm) then we should reactivate it.
  1387   */
  1388  if (!(flags & TTU_IGNORE_MLOCK)) {
  1389  if (vma->vm_flags & VM_LOCKED) {
  1390  /* PTE-mapped THP are never mlocked */
  1391  if (!PageTransCompound(page)) {
  1392  /*
  1393   * Holding pte lock, we do 
*not* need
  1394   * mmap_sem here
  1395   */
  1396  mlock_vma_page(page);
  1397  }
  1398  ret = false;
  1399  page_vma_mapped_walk_done(&pvmw);
  1400  break;
  1401  }
  1402  if (flags & TTU_MUNLOCK)
  1403  continue;
  1404  }
  1405  
  1406  /* Unexpected PMD-mapped THP? */
  1407  VM_BUG_ON_PAGE(!pvmw.pte, page);
  1408  
  1409  subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
  1410  address = pvmw.address;
  1411  
  1412  /*
  1413   * PMDs for hugetlbfs pages could be shared.  In this 
case,
  1414   * pages with shared PMDs will have a mapcount of 1 no 
matter
  1415   * how many times it is actually mapped.  Map counting 
for
  1416   * PMD sharing is mostly done via the reference count 
on the
  1417   * PMD page itself.  If the page we are trying to unmap 
is a
  1418   * hugetlbfs page, attempt to 'unshare' at the PMD 
level.
  1419   * huge_pmd_unshare takes care of clearing the PUD and
  1420   * reference counting on the PMD page which effectively 
unmaps
  1421   * the page.  Take care of flushing cache and TLB for 
page in
  1422   * this specific mapping here.
  1423   */
  1424  if (PageHuge(page) &&
> 1425  huge_pmd_unshare(mm, &address, pvmw.pte)) {
  1426  unsigned long end_add = address + 
vma_mmu_pagesize(vma);
  1427  
  1428  flush_cache_range(vma, address, end_add);
  1429  flush_tlb_range(vma, address, end_add);
  1430  mmu_notifier_invalidate_range(mm, address, 
end_add);
  1431  continue;
  1432  }
  1433  
  1434  if (IS_ENABLED(CONFIG_MIGRATION) &&
  1435  (flags & TTU_MIGRATION) &&
  1436  is_zone_device_page(page)) {
  1437  swp_entry_t entry;
  1438  pte_t swp_pte;
  1439  
  1440  pteval = ptep_get_and_clear(mm, pvmw.address, 
pvmw.pte);
  1441  
  1442  /*
  1443   * Store the pfn of the page in a special 
migration
  1444   * pte. do_swap_page() will wait until the 
migration
  1445   * pte is removed and then restart fault 
handling.
  1446   */
  1447  entry = make_migration_entry(page, 0);
  1448  swp_pte = swp_entry_to_pte(entry);
  1449  if (pte_soft_dirty(pteval))
  1450  swp_pte = pte_swp_mksoft_dirty(swp_pte);
  1451  set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
  1452

Re: [PATCH] mm: migration: fix migration of huge PMD shared pages

2018-08-12 Thread kbuild test robot
Hi Mike,

I love your patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v4.18 next-20180810]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Mike-Kravetz/mm-migration-fix-migration-of-huge-PMD-shared-pages/20180813-114549
config: i386-tinyconfig (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
# save the attached .config to linux build tree
make ARCH=i386 

All errors (new ones prefixed by >>):

   mm/rmap.c: In function 'try_to_unmap_one':
>> mm/rmap.c:1425:7: error: implicit declaration of function 
>> 'huge_pmd_unshare'; did you mean 'do_huge_pmd_wp_page'? 
>> [-Werror=implicit-function-declaration]
  huge_pmd_unshare(mm, &address, pvmw.pte)) {
  ^~~~
  do_huge_pmd_wp_page
   cc1: some warnings being treated as errors

vim +1425 mm/rmap.c

  1382  
  1383  /*
  1384   * If the page is mlock()d, we cannot swap it out.
  1385   * If it's recently referenced (perhaps page_referenced
  1386   * skipped over this mm) then we should reactivate it.
  1387   */
  1388  if (!(flags & TTU_IGNORE_MLOCK)) {
  1389  if (vma->vm_flags & VM_LOCKED) {
  1390  /* PTE-mapped THP are never mlocked */
  1391  if (!PageTransCompound(page)) {
  1392  /*
  1393   * Holding pte lock, we do 
*not* need
  1394   * mmap_sem here
  1395   */
  1396  mlock_vma_page(page);
  1397  }
  1398  ret = false;
  1399  page_vma_mapped_walk_done(&pvmw);
  1400  break;
  1401  }
  1402  if (flags & TTU_MUNLOCK)
  1403  continue;
  1404  }
  1405  
  1406  /* Unexpected PMD-mapped THP? */
  1407  VM_BUG_ON_PAGE(!pvmw.pte, page);
  1408  
  1409  subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
  1410  address = pvmw.address;
  1411  
  1412  /*
  1413   * PMDs for hugetlbfs pages could be shared.  In this 
case,
  1414   * pages with shared PMDs will have a mapcount of 1 no 
matter
  1415   * how many times it is actually mapped.  Map counting 
for
  1416   * PMD sharing is mostly done via the reference count 
on the
  1417   * PMD page itself.  If the page we are trying to unmap 
is a
  1418   * hugetlbfs page, attempt to 'unshare' at the PMD 
level.
  1419   * huge_pmd_unshare takes care of clearing the PUD and
  1420   * reference counting on the PMD page which effectively 
unmaps
  1421   * the page.  Take care of flushing cache and TLB for 
page in
  1422   * this specific mapping here.
  1423   */
  1424  if (PageHuge(page) &&
> 1425  huge_pmd_unshare(mm, &address, pvmw.pte)) {
  1426  unsigned long end_add = address + 
vma_mmu_pagesize(vma);
  1427  
  1428  flush_cache_range(vma, address, end_add);
  1429  flush_tlb_range(vma, address, end_add);
  1430  mmu_notifier_invalidate_range(mm, address, 
end_add);
  1431  continue;
  1432  }
  1433  
  1434  if (IS_ENABLED(CONFIG_MIGRATION) &&
  1435  (flags & TTU_MIGRATION) &&
  1436  is_zone_device_page(page)) {
  1437  swp_entry_t entry;
  1438  pte_t swp_pte;
  1439  
  1440  pteval = ptep_get_and_clear(mm, pvmw.address, 
pvmw.pte);
  1441  
  1442  /*
  1443   * Store the pfn of the page in a special 
migration
  1444   * pte. do_swap_page() will wait until the 
migration
  1445   * pte is removed and then restart fault 
handling.
  1446   */
  1447  entry = make_migration_entry(page, 0);
  1448  swp_pte = swp_entry_to_pte(entry);
  1449  if (pte_soft_dirty(pteval))
  1450  swp_pte = pte_swp_mksoft_dirty(swp_pte);
  1451  set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
  1452

Re: [BUG] kernel: rcu: a possible sleep-in-atomic-context bug in srcu_read_delay()

2018-08-12 Thread Paul E. McKenney
On Mon, Aug 13, 2018 at 11:04:10AM +0800, Jia-Ju Bai wrote:
> The kernel may sleep while holding a spinlock.
> 
> The function call paths (from bottom to top) in Linux-4.16 are:
> 
> [FUNC] schedule_timeout_interruptible
> kernel/rcu/rcutorture.c, 523: schedule_timeout_interruptible in
> srcu_read_delay
> kernel/rcu/rcutorture.c, 1105: [FUNC_PTR]srcu_read_delay in
> rcu_torture_timer
> kernel/rcu/rcutorture.c, 1104: spin_lock in rcu_torture_timer
> 
> Note that [FUNC_PTR] means a function pointer call is used.
> 
> I do not find a good way to fix, so I only report.
> This is found by my static analysis tool (DSAC).

Interesting.  I would have expected to have gotten a "scheduling while
atomic" error message, which I do not recall seeing.  And I ran a great
deal of rcutorture on v4.16.

So let's see...  As you say, the rcu_torture_timer() function does in
fact acquire rand_lock in 4.16 and 4.17, in which case sleeping would
indeed be illegal.  But let's take a look at srcu_read_delay():

static void
srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
{
long delay;
const long uspertick = 1000000 / HZ;
const long longdelay = 10;

/* We want there to be long-running readers, but not all the time. */

delay = torture_random(rrsp) %
(nrealreaders * 2 * longdelay * uspertick);
if (!delay && in_task()) {
schedule_timeout_interruptible(longdelay);
rtrsp->rt_delay_jiffies = longdelay;
} else {
rcu_read_delay(rrsp, rtrsp);
}
}

The call to schedule_timeout_interruptible() cannot happen unless the
in_task() macro returns true, which it won't if the SOFTIRQ_OFFSET bit
is set:

#define in_task()   (!(preempt_count() & \
   (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))

And the SOFTIRQ_OFFSET bit will be set if srcu_read_delay()
is invoked from a timer handler, which is the case for the
call from rcu_torture_timer().  So if that lock is held,
schedule_timeout_interruptible() won't ever be invoked.

So what am I missing here?

Thanx, Paul



Re: [BUG] kernel: rcu: a possible sleep-in-atomic-context bug in srcu_read_delay()

2018-08-12 Thread Paul E. McKenney
On Mon, Aug 13, 2018 at 11:04:10AM +0800, Jia-Ju Bai wrote:
> The kernel may sleep while holding a spinlock.
> 
> The function call paths (from bottom to top) in Linux-4.16 are:
> 
> [FUNC] schedule_timeout_interruptible
> kernel/rcu/rcutorture.c, 523: schedule_timeout_interruptible in
> srcu_read_delay
> kernel/rcu/rcutorture.c, 1105: [FUNC_PTR]srcu_read_delay in
> rcu_torture_timer
> kernel/rcu/rcutorture.c, 1104: spin_lock in rcu_torture_timer
> 
> Note that [FUNC_PTR] means a function pointer call is used.
> 
> I do not find a good way to fix, so I only report.
> This is found by my static analysis tool (DSAC).

Interesting.  I would have expected to have gotten a "scheduling while
atomic" error message, which I do not recall seeing.  And I ran a great
deal of rcutorture on v4.16.

So let's see...  As you say, the rcu_torture_timer() function does in
fact acquire rand_lock in 4.16 and 4.17, in which case sleeping would
indeed be illegal.  But let's take a look at srcu_read_delay():

static void
srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
{
long delay;
const long uspertick = 1000000 / HZ;
const long longdelay = 10;

/* We want there to be long-running readers, but not all the time. */

delay = torture_random(rrsp) %
(nrealreaders * 2 * longdelay * uspertick);
if (!delay && in_task()) {
schedule_timeout_interruptible(longdelay);
rtrsp->rt_delay_jiffies = longdelay;
} else {
rcu_read_delay(rrsp, rtrsp);
}
}

The call to schedule_timeout_interruptible() cannot happen unless the
in_task() macro returns true, which it won't if the SOFTIRQ_OFFSET bit
is set:

#define in_task()   (!(preempt_count() & \
   (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))

And the SOFTIRQ_OFFSET bit will be set if srcu_read_delay()
is invoked from a timer handler, which is the case for the
call from rcu_torture_timer().  So if that lock is held,
schedule_timeout_interruptible() won't ever be invoked.

So what am I missing here?

Thanx, Paul



Re: [PATCH] mm: migration: fix migration of huge PMD shared pages

2018-08-12 Thread kbuild test robot
Hi Mike,

I love your patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v4.18 next-20180810]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Mike-Kravetz/mm-migration-fix-migration-of-huge-PMD-shared-pages/20180813-114549
config: i386-randconfig-x003-201832 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
# save the attached .config to linux build tree
make ARCH=i386 

All errors (new ones prefixed by >>):

   mm/rmap.c: In function 'try_to_unmap_one':
>> mm/rmap.c:1425:7: error: implicit declaration of function 
>> 'huge_pmd_unshare'; did you mean '__NR_unshare'? 
>> [-Werror=implicit-function-declaration]
  huge_pmd_unshare(mm, , pvmw.pte)) {
  ^~~~
  __NR_unshare
   cc1: some warnings being treated as errors

vim +1425 mm/rmap.c

  1382  
  1383  /*
  1384   * If the page is mlock()d, we cannot swap it out.
  1385   * If it's recently referenced (perhaps page_referenced
  1386   * skipped over this mm) then we should reactivate it.
  1387   */
  1388  if (!(flags & TTU_IGNORE_MLOCK)) {
  1389  if (vma->vm_flags & VM_LOCKED) {
  1390  /* PTE-mapped THP are never mlocked */
  1391  if (!PageTransCompound(page)) {
  1392  /*
  1393   * Holding pte lock, we do 
*not* need
  1394   * mmap_sem here
  1395   */
  1396  mlock_vma_page(page);
  1397  }
  1398  ret = false;
  1399  page_vma_mapped_walk_done(&pvmw);
  1400  break;
  1401  }
  1402  if (flags & TTU_MUNLOCK)
  1403  continue;
  1404  }
  1405  
  1406  /* Unexpected PMD-mapped THP? */
  1407  VM_BUG_ON_PAGE(!pvmw.pte, page);
  1408  
  1409  subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
  1410  address = pvmw.address;
  1411  
  1412  /*
  1413   * PMDs for hugetlbfs pages could be shared.  In this 
case,
  1414   * pages with shared PMDs will have a mapcount of 1 no 
matter
  1415   * how many times it is actually mapped.  Map counting 
for
  1416   * PMD sharing is mostly done via the reference count 
on the
  1417   * PMD page itself.  If the page we are trying to unmap 
is a
  1418   * hugetlbfs page, attempt to 'unshare' at the PMD 
level.
  1419   * huge_pmd_unshare takes care of clearing the PUD and
  1420   * reference counting on the PMD page which effectively 
unmaps
  1421   * the page.  Take care of flushing cache and TLB for 
page in
  1422   * this specific mapping here.
  1423   */
  1424  if (PageHuge(page) &&
> 1425  huge_pmd_unshare(mm, &address, pvmw.pte)) {
  1426  unsigned long end_add = address + 
vma_mmu_pagesize(vma);
  1427  
  1428  flush_cache_range(vma, address, end_add);
  1429  flush_tlb_range(vma, address, end_add);
  1430  mmu_notifier_invalidate_range(mm, address, 
end_add);
  1431  continue;
  1432  }
  1433  
  1434  if (IS_ENABLED(CONFIG_MIGRATION) &&
  1435  (flags & TTU_MIGRATION) &&
  1436  is_zone_device_page(page)) {
  1437  swp_entry_t entry;
  1438  pte_t swp_pte;
  1439  
  1440  pteval = ptep_get_and_clear(mm, pvmw.address, 
pvmw.pte);
  1441  
  1442  /*
  1443   * Store the pfn of the page in a special 
migration
  1444   * pte. do_swap_page() will wait until the 
migration
  1445   * pte is removed and then restart fault 
handling.
  1446   */
  1447  entry = make_migration_entry(page, 0);
  1448  swp_pte = swp_entry_to_pte(entry);
  1449  if (pte_soft_dirty(pteval))
  1450  swp_pte = pte_swp_mksoft_dirty(swp_pte);
  1451  set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
  1452  

Re: [PATCH] mm: migration: fix migration of huge PMD shared pages

2018-08-12 Thread kbuild test robot
Hi Mike,

I love your patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v4.18 next-20180810]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Mike-Kravetz/mm-migration-fix-migration-of-huge-PMD-shared-pages/20180813-114549
config: i386-randconfig-x003-201832 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
# save the attached .config to linux build tree
make ARCH=i386 

All errors (new ones prefixed by >>):

   mm/rmap.c: In function 'try_to_unmap_one':
>> mm/rmap.c:1425:7: error: implicit declaration of function 
>> 'huge_pmd_unshare'; did you mean '__NR_unshare'? 
>> [-Werror=implicit-function-declaration]
  huge_pmd_unshare(mm, , pvmw.pte)) {
  ^~~~
  __NR_unshare
   cc1: some warnings being treated as errors

vim +1425 mm/rmap.c

  1382  
  1383  /*
  1384   * If the page is mlock()d, we cannot swap it out.
  1385   * If it's recently referenced (perhaps page_referenced
  1386   * skipped over this mm) then we should reactivate it.
  1387   */
  1388  if (!(flags & TTU_IGNORE_MLOCK)) {
  1389  if (vma->vm_flags & VM_LOCKED) {
  1390  /* PTE-mapped THP are never mlocked */
  1391  if (!PageTransCompound(page)) {
  1392  /*
  1393   * Holding pte lock, we do 
*not* need
  1394   * mmap_sem here
  1395   */
  1396  mlock_vma_page(page);
  1397  }
  1398  ret = false;
  1399  page_vma_mapped_walk_done(&pvmw);
  1400  break;
  1401  }
  1402  if (flags & TTU_MUNLOCK)
  1403  continue;
  1404  }
  1405  
  1406  /* Unexpected PMD-mapped THP? */
  1407  VM_BUG_ON_PAGE(!pvmw.pte, page);
  1408  
  1409  subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
  1410  address = pvmw.address;
  1411  
  1412  /*
  1413   * PMDs for hugetlbfs pages could be shared.  In this 
case,
  1414   * pages with shared PMDs will have a mapcount of 1 no 
matter
  1415   * how many times it is actually mapped.  Map counting 
for
  1416   * PMD sharing is mostly done via the reference count 
on the
  1417   * PMD page itself.  If the page we are trying to unmap 
is a
  1418   * hugetlbfs page, attempt to 'unshare' at the PMD 
level.
  1419   * huge_pmd_unshare takes care of clearing the PUD and
  1420   * reference counting on the PMD page which effectively 
unmaps
  1421   * the page.  Take care of flushing cache and TLB for 
page in
  1422   * this specific mapping here.
  1423   */
  1424  if (PageHuge(page) &&
> 1425  huge_pmd_unshare(mm, &address, pvmw.pte)) {
  1426  unsigned long end_add = address + 
vma_mmu_pagesize(vma);
  1427  
  1428  flush_cache_range(vma, address, end_add);
  1429  flush_tlb_range(vma, address, end_add);
  1430  mmu_notifier_invalidate_range(mm, address, 
end_add);
  1431  continue;
  1432  }
  1433  
  1434  if (IS_ENABLED(CONFIG_MIGRATION) &&
  1435  (flags & TTU_MIGRATION) &&
  1436  is_zone_device_page(page)) {
  1437  swp_entry_t entry;
  1438  pte_t swp_pte;
  1439  
  1440  pteval = ptep_get_and_clear(mm, pvmw.address, 
pvmw.pte);
  1441  
  1442  /*
  1443   * Store the pfn of the page in a special 
migration
  1444   * pte. do_swap_page() will wait until the 
migration
  1445   * pte is removed and then restart fault 
handling.
  1446   */
  1447  entry = make_migration_entry(page, 0);
  1448  swp_pte = swp_entry_to_pte(entry);
  1449  if (pte_soft_dirty(pteval))
  1450  swp_pte = pte_swp_mksoft_dirty(swp_pte);
  1451  set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
  1452  

[BUG] fs: nfs: pnfs_nfs: a possible sleep-in-atomic-context bug in pnfs_generic_recover_commit_reqs()

2018-08-12 Thread Jia-Ju Bai

The kernel may sleep while holding a spinlock.

The function call paths (from bottom to top) in Linux-4.16 are:

[FUNC] mutex_lock_nested
fs/nfs/write.c, 1045: mutex_lock_nested in nfs_scan_commit_list
fs/nfs/pnfs_nfs.c, 145: nfs_scan_commit_list in 
pnfs_generic_recover_commit_reqs

fs/nfs/pnfs_nfs.c, 154: spin_lock in pnfs_generic_recover_commit_reqs

I do not find a good way to fix, so I only report.
This is found by my static analysis tool (DSAC).


Thanks,
Jia-Ju Bai


[BUG] fs: nfs: pnfs_nfs: a possible sleep-in-atomic-context bug in pnfs_generic_recover_commit_reqs()

2018-08-12 Thread Jia-Ju Bai

The kernel may sleep while holding a spinlock.

The function call paths (from bottom to top) in Linux-4.16 are:

[FUNC] mutex_lock_nested
fs/nfs/write.c, 1045: mutex_lock_nested in nfs_scan_commit_list
fs/nfs/pnfs_nfs.c, 145: nfs_scan_commit_list in 
pnfs_generic_recover_commit_reqs

fs/nfs/pnfs_nfs.c, 154: spin_lock in pnfs_generic_recover_commit_reqs

I do not find a good way to fix, so I only report.
This is found by my static analysis tool (DSAC).


Thanks,
Jia-Ju Bai


[BUG] fs: nfs: callback_proc: a possible sleep-in-atomic-context bug in nfs4_callback_sequence()

2018-08-12 Thread Jia-Ju Bai

The kernel may sleep while holding a spinlock.

The function call paths (from bottom to top) in Linux-4.16 are:

[FUNC] schedule_timeout
fs/nfs/nfs4session.c, 223: schedule_timeout in nfs4_slot_wait_on_seqid
fs/nfs/callback_proc.c, 466: nfs4_slot_wait_on_seqid in 
referring_call_exists

fs/nfs/callback_proc.c, 544: referring_call_exists in nfs4_callback_sequence
fs/nfs/callback_proc.c, 504: spin_lock in nfs4_callback_sequence

I do not find a good way to fix, so I only report.
This is found by my static analysis tool (DSAC).


Thanks,
Jia-Ju Bai


[BUG] fs: nfs: callback_proc: a possible sleep-in-atomic-context bug in nfs4_callback_sequence()

2018-08-12 Thread Jia-Ju Bai

The kernel may sleep while holding a spinlock.

The function call paths (from bottom to top) in Linux-4.16 are:

[FUNC] schedule_timeout
fs/nfs/nfs4session.c, 223: schedule_timeout in nfs4_slot_wait_on_seqid
fs/nfs/callback_proc.c, 466: nfs4_slot_wait_on_seqid in 
referring_call_exists

fs/nfs/callback_proc.c, 544: referring_call_exists in nfs4_callback_sequence
fs/nfs/callback_proc.c, 504: spin_lock in nfs4_callback_sequence

I do not find a good way to fix, so I only report.
This is found by my static analysis tool (DSAC).


Thanks,
Jia-Ju Bai


[PATCH] mm: migration: fix migration of huge PMD shared pages

2018-08-12 Thread Mike Kravetz
The page migration code employs try_to_unmap() to try and unmap the
source page.  This is accomplished by using rmap_walk to find all
vmas where the page is mapped.  This search stops when page mapcount
is zero.  For shared PMD huge pages, the page map count is always 1
no matter the number of mappings.  Shared mappings are tracked via
the reference count of the PMD page.  Therefore, try_to_unmap stops
prematurely and does not completely unmap all mappings of the source
page.

This problem can result in data corruption as writes to the original
source page can happen after contents of the page are copied to the
target page.  Hence, data is lost.

This problem was originally seen as DB corruption of shared global
areas after a huge page was soft offlined.  DB developers noticed
they could reproduce the issue by (hotplug) offlining memory used
to back huge pages.  A simple testcase can reproduce the problem by
creating a shared PMD mapping (note that this must be at least
PUD_SIZE in size and PUD_SIZE aligned (1GB on x86)), and using
migrate_pages() to migrate process pages between nodes.

To fix, have the try_to_unmap_one routine check for huge PMD sharing
by calling huge_pmd_unshare for hugetlbfs huge pages.  If it is a
shared mapping it will be 'unshared' which removes the page table
entry and drops reference on PMD page.  After this, flush caches and
TLB.

Signed-off-by: Mike Kravetz 
---
I am not 100% sure on the required flushing, so suggestions would be
appreciated.  This also should go to stable.  It has been around for
a long time so still looking for an appropriate 'fixes:'.

 mm/rmap.c | 21 +
 1 file changed, 21 insertions(+)

diff --git a/mm/rmap.c b/mm/rmap.c
index 09a799c9aebd..45583758bf16 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1409,6 +1409,27 @@ static bool try_to_unmap_one(struct page *page, struct 
vm_area_struct *vma,
subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
address = pvmw.address;
 
+   /*
+* PMDs for hugetlbfs pages could be shared.  In this case,
+* pages with shared PMDs will have a mapcount of 1 no matter
+* how many times it is actually mapped.  Map counting for
+* PMD sharing is mostly done via the reference count on the
+* PMD page itself.  If the page we are trying to unmap is a
+* hugetlbfs page, attempt to 'unshare' at the PMD level.
+* huge_pmd_unshare takes care of clearing the PUD and
+* reference counting on the PMD page which effectively unmaps
+* the page.  Take care of flushing cache and TLB for page in
+* this specific mapping here.
+*/
+   if (PageHuge(page) &&
+   huge_pmd_unshare(mm, &address, pvmw.pte)) {
+   unsigned long end_add = address + vma_mmu_pagesize(vma);
+
+   flush_cache_range(vma, address, end_add);
+   flush_tlb_range(vma, address, end_add);
+   mmu_notifier_invalidate_range(mm, address, end_add);
+   continue;
+   }
 
if (IS_ENABLED(CONFIG_MIGRATION) &&
(flags & TTU_MIGRATION) &&
-- 
2.17.1



[PATCH] mm: migration: fix migration of huge PMD shared pages

2018-08-12 Thread Mike Kravetz
The page migration code employs try_to_unmap() to try and unmap the
source page.  This is accomplished by using rmap_walk to find all
vmas where the page is mapped.  This search stops when page mapcount
is zero.  For shared PMD huge pages, the page map count is always 1
no matter the number of mappings.  Shared mappings are tracked via
the reference count of the PMD page.  Therefore, try_to_unmap stops
prematurely and does not completely unmap all mappings of the source
page.

This problem can result in data corruption as writes to the original
source page can happen after contents of the page are copied to the
target page.  Hence, data is lost.

This problem was originally seen as DB corruption of shared global
areas after a huge page was soft offlined.  DB developers noticed
they could reproduce the issue by (hotplug) offlining memory used
to back huge pages.  A simple testcase can reproduce the problem by
creating a shared PMD mapping (note that this must be at least
PUD_SIZE in size and PUD_SIZE aligned (1GB on x86)), and using
migrate_pages() to migrate process pages between nodes.

To fix, have the try_to_unmap_one routine check for huge PMD sharing
by calling huge_pmd_unshare for hugetlbfs huge pages.  If it is a
shared mapping it will be 'unshared' which removes the page table
entry and drops reference on PMD page.  After this, flush caches and
TLB.

Signed-off-by: Mike Kravetz 
---
I am not 100% sure on the required flushing, so suggestions would be
appreciated.  This also should go to stable.  It has been around for
a long time so still looking for an appropriate 'fixes:'.

 mm/rmap.c | 21 +
 1 file changed, 21 insertions(+)

diff --git a/mm/rmap.c b/mm/rmap.c
index 09a799c9aebd..45583758bf16 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1409,6 +1409,27 @@ static bool try_to_unmap_one(struct page *page, struct 
vm_area_struct *vma,
subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
address = pvmw.address;
 
+   /*
+* PMDs for hugetlbfs pages could be shared.  In this case,
+* pages with shared PMDs will have a mapcount of 1 no matter
+* how many times it is actually mapped.  Map counting for
+* PMD sharing is mostly done via the reference count on the
+* PMD page itself.  If the page we are trying to unmap is a
+* hugetlbfs page, attempt to 'unshare' at the PMD level.
+* huge_pmd_unshare takes care of clearing the PUD and
+* reference counting on the PMD page which effectively unmaps
+* the page.  Take care of flushing cache and TLB for page in
+* this specific mapping here.
+*/
+   if (PageHuge(page) &&
+   huge_pmd_unshare(mm, &address, pvmw.pte)) {
+   unsigned long end_add = address + vma_mmu_pagesize(vma);
+
+   flush_cache_range(vma, address, end_add);
+   flush_tlb_range(vma, address, end_add);
+   mmu_notifier_invalidate_range(mm, address, end_add);
+   continue;
+   }
 
if (IS_ENABLED(CONFIG_MIGRATION) &&
(flags & TTU_MIGRATION) &&
-- 
2.17.1



[BUG] fs: jffs2: possible sleep-in-atomic-context bugs in jffs2_iget

2018-08-12 Thread Jia-Ju Bai

The kernel may sleep while holding a spinlock.

The function call paths (from bottom to top) in Linux-4.16 are:

[FUNC] schedule
fs/inode.c, 1916: schedule in __wait_on_freeing_inode
fs/inode.c, 826: __wait_on_freeing_inode in find_inode_fast
fs/inode.c, 1107: find_inode_fast in iget_locked
fs/jffs2/fs.c, 263: iget_locked in jffs2_iget
fs/jffs2/fs.c, 665: jffs2_iget in jffs2_gc_fetch_inode
fs/jffs2/wbuf.c, 505: jffs2_gc_fetch_inode in jffs2_wbuf_recover
fs/jffs2/wbuf.c, 462: spin_lock in jffs2_wbuf_recover

[FUNC] mutex_lock_nested
fs/jffs2/fs.c, 273: mutex_lock_nested in jffs2_iget
fs/jffs2/fs.c, 665: jffs2_iget in jffs2_gc_fetch_inode
fs/jffs2/wbuf.c, 505: jffs2_gc_fetch_inode in jffs2_wbuf_recover
fs/jffs2/wbuf.c, 462: spin_lock in jffs2_wbuf_recover

I do not find a good way to fix, so I only report.
Maybe the spinlock should be released before calling jffs2_iget(), and 
then be acquired again.

This is found by my static analysis tool (DSAC).


Thanks,
Jia-Ju Bai



[BUG] fs: jffs2: possible sleep-in-atomic-context bugs in jffs2_iget

2018-08-12 Thread Jia-Ju Bai

The kernel may sleep while holding a spinlock.

The function call paths (from bottom to top) in Linux-4.16 are:

[FUNC] schedule
fs/inode.c, 1916: schedule in __wait_on_freeing_inode
fs/inode.c, 826: __wait_on_freeing_inode in find_inode_fast
fs/inode.c, 1107: find_inode_fast in iget_locked
fs/jffs2/fs.c, 263: iget_locked in jffs2_iget
fs/jffs2/fs.c, 665: jffs2_iget in jffs2_gc_fetch_inode
fs/jffs2/wbuf.c, 505: jffs2_gc_fetch_inode in jffs2_wbuf_recover
fs/jffs2/wbuf.c, 462: spin_lock in jffs2_wbuf_recover

[FUNC] mutex_lock_nested
fs/jffs2/fs.c, 273: mutex_lock_nested in jffs2_iget
fs/jffs2/fs.c, 665: jffs2_iget in jffs2_gc_fetch_inode
fs/jffs2/wbuf.c, 505: jffs2_gc_fetch_inode in jffs2_wbuf_recover
fs/jffs2/wbuf.c, 462: spin_lock in jffs2_wbuf_recover

I do not find a good way to fix, so I only report.
Maybe the spinlock should be released before calling jffs2_iget(), and 
then be acquired again.

This is found by my static analysis tool (DSAC).


Thanks,
Jia-Ju Bai



[PATCH] fs: jffs2: fix a sleep-in-atomic-context bug in jffs2_alloc_refblock()

2018-08-12 Thread Jia-Ju Bai
The kernel may sleep while holding a spin lock.

The function call paths (from bottom to top) in Linux-4.16 are:

[FUNC] kmem_cache_alloc(GFP_KERNEL)
fs/jffs2/malloc.c, 188: 
kmem_cache_alloc in jffs2_alloc_refblock
fs/jffs2/malloc.c, 221: 
jffs2_alloc_refblock in jffs2_prealloc_raw_node_refs
fs/jffs2/wbuf.c, 164: 
jffs2_prealloc_raw_node_refs in jffs2_block_refile
fs/jffs2/wbuf.c, 927: 
jffs2_block_refile in jffs2_flash_writev
fs/jffs2/wbuf.c, 924: 
spin_lock in jffs2_flash_writev

To fix it, GFP_KERNEL in kmem_cache_alloc() is replaced with GFP_ATOMIC.

This is found by my static analysis tool (DSAC).

Signed-off-by: Jia-Ju Bai 
---
 fs/jffs2/malloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index ce1189793288..66496ef09716 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -185,7 +185,7 @@ static struct jffs2_raw_node_ref *jffs2_alloc_refblock(void)
 {
struct jffs2_raw_node_ref *ret;
 
-   ret = kmem_cache_alloc(raw_node_ref_slab, GFP_KERNEL);
+   ret = kmem_cache_alloc(raw_node_ref_slab, GFP_ATOMIC);
if (ret) {
int i = 0;
for (i=0; i < REFS_PER_BLOCK; i++) {
-- 
2.17.0



[PATCH] fs: jffs2: fix a sleep-in-atomic-context bug in jffs2_alloc_refblock()

2018-08-12 Thread Jia-Ju Bai
The kernel may sleep while holding a spin lock.

The function call paths (from bottom to top) in Linux-4.16 are:

[FUNC] kmem_cache_alloc(GFP_KERNEL)
fs/jffs2/malloc.c, 188: 
kmem_cache_alloc in jffs2_alloc_refblock
fs/jffs2/malloc.c, 221: 
jffs2_alloc_refblock in jffs2_prealloc_raw_node_refs
fs/jffs2/wbuf.c, 164: 
jffs2_prealloc_raw_node_refs in jffs2_block_refile
fs/jffs2/wbuf.c, 927: 
jffs2_block_refile in jffs2_flash_writev
fs/jffs2/wbuf.c, 924: 
spin_lock in jffs2_flash_writev

To fix it, GFP_KERNEL in kmem_cache_alloc() is replaced with GFP_ATOMIC.

This is found by my static analysis tool (DSAC).

Signed-off-by: Jia-Ju Bai 
---
 fs/jffs2/malloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index ce1189793288..66496ef09716 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -185,7 +185,7 @@ static struct jffs2_raw_node_ref *jffs2_alloc_refblock(void)
 {
struct jffs2_raw_node_ref *ret;
 
-   ret = kmem_cache_alloc(raw_node_ref_slab, GFP_KERNEL);
+   ret = kmem_cache_alloc(raw_node_ref_slab, GFP_ATOMIC);
if (ret) {
int i = 0;
for (i=0; i < REFS_PER_BLOCK; i++) {
-- 
2.17.0



[BUG] mm: truncate: a possible sleep-in-atomic-context bug in truncate_exceptional_pvec_entries()

2018-08-12 Thread Jia-Ju Bai

The kernel may sleep while holding a spinlock.

The function call paths (from bottom to top) in Linux-4.16 are:

[FUNC] schedule
fs/dax.c, 259: schedule in get_unlocked_mapping_entry
fs/dax.c, 450: get_unlocked_mapping_entry in __dax_invalidate_mapping_entry
fs/dax.c, 471: __dax_invalidate_mapping_entry in dax_delete_mapping_entry
mm/truncate.c, 97: dax_delete_mapping_entry in 
truncate_exceptional_pvec_entries

mm/truncate.c, 82: spin_lock_irq in truncate_exceptional_pvec_entries

I do not find a good way to fix, so I only report.
This is found by my static analysis tool (DSAC).


Thanks,
Jia-Ju Bai


[BUG] mm: truncate: a possible sleep-in-atomic-context bug in truncate_exceptional_pvec_entries()

2018-08-12 Thread Jia-Ju Bai

The kernel may sleep while holding a spinlock.

The function call paths (from bottom to top) in Linux-4.16 are:

[FUNC] schedule
fs/dax.c, 259: schedule in get_unlocked_mapping_entry
fs/dax.c, 450: get_unlocked_mapping_entry in __dax_invalidate_mapping_entry
fs/dax.c, 471: __dax_invalidate_mapping_entry in dax_delete_mapping_entry
mm/truncate.c, 97: dax_delete_mapping_entry in 
truncate_exceptional_pvec_entries

mm/truncate.c, 82: spin_lock_irq in truncate_exceptional_pvec_entries

I do not find a good way to fix, so I only report.
This is found by my static analysis tool (DSAC).


Thanks,
Jia-Ju Bai


[BUG] kernel: rcu: a possible sleep-in-atomic-context bug in srcu_read_delay()

2018-08-12 Thread Jia-Ju Bai

The kernel may sleep while holding a spinlock.

The function call paths (from bottom to top) in Linux-4.16 are:

[FUNC] schedule_timeout_interruptible
kernel/rcu/rcutorture.c, 523: schedule_timeout_interruptible in 
srcu_read_delay
kernel/rcu/rcutorture.c, 1105: [FUNC_PTR]srcu_read_delay in 
rcu_torture_timer

kernel/rcu/rcutorture.c, 1104: spin_lock in rcu_torture_timer

Note that [FUNC_PTR] means a function pointer call is used.

I do not find a good way to fix, so I only report.
This is found by my static analysis tool (DSAC).


Thanks,
Jia-Ju Bai


[BUG] kernel: rcu: a possible sleep-in-atomic-context bug in srcu_read_delay()

2018-08-12 Thread Jia-Ju Bai

The kernel may sleep while holding a spinlock.

The function call paths (from bottom to top) in Linux-4.16 are:

[FUNC] schedule_timeout_interruptible
kernel/rcu/rcutorture.c, 523: schedule_timeout_interruptible in 
srcu_read_delay
kernel/rcu/rcutorture.c, 1105: [FUNC_PTR]srcu_read_delay in 
rcu_torture_timer

kernel/rcu/rcutorture.c, 1104: spin_lock in rcu_torture_timer

Note that [FUNC_PTR] means a function pointer call is used.

I do not find a good way to fix, so I only report.
This is found by my static analysis tool (DSAC).


Thanks,
Jia-Ju Bai


RE: [PATCH 2/2] clk: imx: imx7d: remove clks_init_on array

2018-08-12 Thread Peng Fan
Hi Anson,

> > > -Original Message-
> > > From: Anson Huang
> > > Sent: 2018年8月8日 12:39
> > > To: shawn...@kernel.org; s.ha...@pengutronix.de;
> > > ker...@pengutronix.de; Fabio Estevam ;
> > > mturque...@baylibre.com; sb...@kernel.org;
> > > linux-arm-ker...@lists.infradead.org;
> > > linux-...@vger.kernel.org; linux-kernel@vger.kernel.org
> > > Cc: dl-linux-imx 
> > > Subject: [PATCH 2/2] clk: imx: imx7d: remove clks_init_on array
> > >
> > > Clock framework will enable those clocks registered with
> > > CLK_IS_CRITICAL flag, so no need to have clks_init_on array during
> > > clock
> > initialization now.
> >
> > Will it be more flexible to parse dts saying "critical-clocks = "
> > or "init-on-arrary="
> > and enable those clocks?
> 
> Parsing the clocks arrays from dtb is another way of enabling critical 
> clocks, but
> for current i.MX6/7 platforms, we implement it in same way as most of other
> SoCs, currently I did NOT see any necessity of putting them in dtb, just 
> adding
> flag during clock registering is more simple, if there is any special 
> requirement
> for different clocks set to be enabled, then we can add support to enable the
> method of parsing critical-clocks from dtb. Just my two cents.

Consider the case where OP-TEE wants to use a device whose clocks are
registered by Linux. Because no module on the Linux side uses the device,
Linux will shut down the clock, which leaves OP-TEE unable to access the
device.

People then have to modify the clk code to add the CLK_IS_CRITICAL flag to
make sure the clocks are not shut down by Linux.

However, if a new property is added to the clk node and the driver code
parses the dts, there is no need to modify the clk driver code when OP-TEE
needs another device clock.

Regards,
Peng.

> 
> Anson.
> 
> >
> > Regards,
> > Peng.
> >
> > >
> > > Signed-off-by: Anson Huang 
> > > ---
> > >  drivers/clk/imx/clk-imx7d.c | 27 ---
> > >  drivers/clk/imx/clk.h   |  7 +++
> > >  2 files changed, 15 insertions(+), 19 deletions(-)
> > >
> > > diff --git a/drivers/clk/imx/clk-imx7d.c
> > > b/drivers/clk/imx/clk-imx7d.c index c4518d7..076460b 100644
> > > --- a/drivers/clk/imx/clk-imx7d.c
> > > +++ b/drivers/clk/imx/clk-imx7d.c
> > > @@ -379,13 +379,6 @@ static const char *pll_enet_bypass_sel[] = {
> > > "pll_enet_main", "pll_enet_main_src  static const char
> > > *pll_audio_bypass_sel[] = { "pll_audio_main", "pll_audio_main_src",
> > > }; static const char *pll_video_bypass_sel[] = { "pll_video_main",
> > > "pll_video_main_src", };
> > >
> > > -static int const clks_init_on[] __initconst = {
> > > - IMX7D_ARM_A7_ROOT_CLK, IMX7D_MAIN_AXI_ROOT_CLK,
> > > - IMX7D_PLL_SYS_MAIN_480M_CLK, IMX7D_IPG_ROOT_CLK,
> > > - IMX7D_DRAM_PHYM_ROOT_CLK, IMX7D_DRAM_ROOT_CLK,
> > > - IMX7D_DRAM_PHYM_ALT_ROOT_CLK, IMX7D_DRAM_ALT_ROOT_CLK,
> > > -};
> > > -
> > >  static struct clk_onecell_data clk_data;
> > >
> > >  static struct clk ** const uart_clks[] __initconst = { @@ -403,7
> > > +396,6 @@ static void __init imx7d_clocks_init(struct device_node
> > *ccm_node)  {
> > >   struct device_node *np;
> > >   void __iomem *base;
> > > - int i;
> > >
> > >   clks[IMX7D_CLK_DUMMY] = imx_clk_fixed("dummy", 0);
> > >   clks[IMX7D_OSC_24M_CLK] = of_clk_get_by_name(ccm_node, "osc");
> > @@
> > > -466,7 +458,7 @@ static void __init imx7d_clocks_init(struct
> > > device_node
> > > *ccm_node)
> > >   clks[IMX7D_PLL_SYS_MAIN_120M] =
> > > imx_clk_fixed_factor("pll_sys_main_120m", "pll_sys_main_clk", 1, 4);
> > >   clks[IMX7D_PLL_DRAM_MAIN_533M] =
> > > imx_clk_fixed_factor("pll_dram_533m", "pll_dram_main_clk", 1, 2);
> > >
> > > - clks[IMX7D_PLL_SYS_MAIN_480M_CLK] =
> > > imx_clk_gate_dis("pll_sys_main_480m_clk", "pll_sys_main_480m", base
> > > + 0xb0, 4);
> > > + clks[IMX7D_PLL_SYS_MAIN_480M_CLK] =
> > > +imx_clk_gate_dis_flags("pll_sys_main_480m_clk",
> > > +"pll_sys_main_480m", base + 0xb0, 4, CLK_IS_CRITICAL);
> > >   clks[IMX7D_PLL_SYS_MAIN_240M_CLK] =
> > > imx_clk_gate_dis("pll_sys_main_240m_clk", "pll_sys_main_240m", base
> > > + 0xb0, 5);
> > >   clks[IMX7D_PLL_SYS_MAIN_120M_CLK] =
> > > imx_clk_gate_dis("pll_sys_main_120m_clk", "pll_sys_main_120m", base
> > > + 0xb0, 6);
> > >   clks[IMX7D_PLL_DRAM_MAIN_533M_CLK] =
> > > imx_clk_gate("pll_dram_533m_clk", "pll_dram_533m", base + 0x70, 12);
> > > @@
> > > -719,7 +711,7 @@ static void __init imx7d_clocks_init(struct
> > > device_node
> > > *ccm_node)
> > >   clks[IMX7D_ENET_AXI_ROOT_DIV] =
> > > imx_clk_divider2("enet_axi_post_div", "enet_axi_pre_div", base +
> > > 0x8900, 0,
> > 6);
> > >   clks[IMX7D_NAND_USDHC_BUS_ROOT_CLK] =
> > > imx_clk_divider2("nand_usdhc_root_clk", "nand_usdhc_pre_div", base +
> > > 0x8980, 0, 6);
> > >   clks[IMX7D_AHB_CHANNEL_ROOT_DIV] =
> > > imx_clk_divider2("ahb_root_clk", "ahb_pre_div", base + 0x9000, 0, 6);
> > > - clks[IMX7D_IPG_ROOT_CLK] = imx_clk_divider2("ipg_root_clk",
> > > "ahb_root_clk", base + 0x9080, 0, 2);
> > > + clks[IMX7D_IPG_ROOT_CLK] = imx_clk_divider_flags("ipg_root_clk",
> > > 

RE: [PATCH 2/2] clk: imx: imx7d: remove clks_init_on array

2018-08-12 Thread Peng Fan
Hi Anson,

> > > -Original Message-
> > > From: Anson Huang
> > > Sent: 2018年8月8日 12:39
> > > To: shawn...@kernel.org; s.ha...@pengutronix.de;
> > > ker...@pengutronix.de; Fabio Estevam ;
> > > mturque...@baylibre.com; sb...@kernel.org;
> > > linux-arm-ker...@lists.infradead.org;
> > > linux-...@vger.kernel.org; linux-kernel@vger.kernel.org
> > > Cc: dl-linux-imx 
> > > Subject: [PATCH 2/2] clk: imx: imx7d: remove clks_init_on array
> > >
> > > Clock framework will enable those clocks registered with
> > > CLK_IS_CRITICAL flag, so no need to have clks_init_on array during
> > > clock
> > initialization now.
> >
> > Will it be more flexible to parse dts saying "critical-clocks = "
> > or "init-on-arrary="
> > and enable those clocks?
> 
> Parsing the clocks arrays from dtb is another way of enabling critical 
> clocks, but
> for current i.MX6/7 platforms, we implement it in same way as most of other
> SoCs, currently I did NOT see any necessity of putting them in dtb, just 
> adding
> flag during clock registering is more simple, if there is any special 
> requirement
> for different clocks set to be enabled, then we can add support to enable the
> method of parsing critical-clocks from dtb. Just my two cents.

Consider the case where OP-TEE wants to use a device whose clocks are
registered by Linux. Because no module on the Linux side uses the device,
Linux will shut down the clock, which leaves OP-TEE unable to access the
device.

People then have to modify the clk code to add the CLK_IS_CRITICAL flag to
make sure the clocks are not shut down by Linux.

However, if a new property is added to the clk node and the driver code
parses the dts, there is no need to modify the clk driver code when OP-TEE
needs another device clock.

Regards,
Peng.

> 
> Anson.
> 
> >
> > Regards,
> > Peng.
> >
> > >
> > > Signed-off-by: Anson Huang 
> > > ---
> > >  drivers/clk/imx/clk-imx7d.c | 27 ---
> > >  drivers/clk/imx/clk.h   |  7 +++
> > >  2 files changed, 15 insertions(+), 19 deletions(-)
> > >
> > > diff --git a/drivers/clk/imx/clk-imx7d.c
> > > b/drivers/clk/imx/clk-imx7d.c index c4518d7..076460b 100644
> > > --- a/drivers/clk/imx/clk-imx7d.c
> > > +++ b/drivers/clk/imx/clk-imx7d.c
> > > @@ -379,13 +379,6 @@ static const char *pll_enet_bypass_sel[] = {
> > > "pll_enet_main", "pll_enet_main_src  static const char
> > > *pll_audio_bypass_sel[] = { "pll_audio_main", "pll_audio_main_src",
> > > }; static const char *pll_video_bypass_sel[] = { "pll_video_main",
> > > "pll_video_main_src", };
> > >
> > > -static int const clks_init_on[] __initconst = {
> > > - IMX7D_ARM_A7_ROOT_CLK, IMX7D_MAIN_AXI_ROOT_CLK,
> > > - IMX7D_PLL_SYS_MAIN_480M_CLK, IMX7D_IPG_ROOT_CLK,
> > > - IMX7D_DRAM_PHYM_ROOT_CLK, IMX7D_DRAM_ROOT_CLK,
> > > - IMX7D_DRAM_PHYM_ALT_ROOT_CLK, IMX7D_DRAM_ALT_ROOT_CLK,
> > > -};
> > > -
> > >  static struct clk_onecell_data clk_data;
> > >
> > >  static struct clk ** const uart_clks[] __initconst = { @@ -403,7
> > > +396,6 @@ static void __init imx7d_clocks_init(struct device_node
> > *ccm_node)  {
> > >   struct device_node *np;
> > >   void __iomem *base;
> > > - int i;
> > >
> > >   clks[IMX7D_CLK_DUMMY] = imx_clk_fixed("dummy", 0);
> > >   clks[IMX7D_OSC_24M_CLK] = of_clk_get_by_name(ccm_node, "osc");
> > @@
> > > -466,7 +458,7 @@ static void __init imx7d_clocks_init(struct
> > > device_node
> > > *ccm_node)
> > >   clks[IMX7D_PLL_SYS_MAIN_120M] =
> > > imx_clk_fixed_factor("pll_sys_main_120m", "pll_sys_main_clk", 1, 4);
> > >   clks[IMX7D_PLL_DRAM_MAIN_533M] =
> > > imx_clk_fixed_factor("pll_dram_533m", "pll_dram_main_clk", 1, 2);
> > >
> > > - clks[IMX7D_PLL_SYS_MAIN_480M_CLK] =
> > > imx_clk_gate_dis("pll_sys_main_480m_clk", "pll_sys_main_480m", base
> > > + 0xb0, 4);
> > > + clks[IMX7D_PLL_SYS_MAIN_480M_CLK] =
> > > +imx_clk_gate_dis_flags("pll_sys_main_480m_clk",
> > > +"pll_sys_main_480m", base + 0xb0, 4, CLK_IS_CRITICAL);
> > >   clks[IMX7D_PLL_SYS_MAIN_240M_CLK] =
> > > imx_clk_gate_dis("pll_sys_main_240m_clk", "pll_sys_main_240m", base
> > > + 0xb0, 5);
> > >   clks[IMX7D_PLL_SYS_MAIN_120M_CLK] =
> > > imx_clk_gate_dis("pll_sys_main_120m_clk", "pll_sys_main_120m", base
> > > + 0xb0, 6);
> > >   clks[IMX7D_PLL_DRAM_MAIN_533M_CLK] =
> > > imx_clk_gate("pll_dram_533m_clk", "pll_dram_533m", base + 0x70, 12);
> > > @@
> > > -719,7 +711,7 @@ static void __init imx7d_clocks_init(struct
> > > device_node
> > > *ccm_node)
> > >   clks[IMX7D_ENET_AXI_ROOT_DIV] =
> > > imx_clk_divider2("enet_axi_post_div", "enet_axi_pre_div", base +
> > > 0x8900, 0,
> > 6);
> > >   clks[IMX7D_NAND_USDHC_BUS_ROOT_CLK] =
> > > imx_clk_divider2("nand_usdhc_root_clk", "nand_usdhc_pre_div", base +
> > > 0x8980, 0, 6);
> > >   clks[IMX7D_AHB_CHANNEL_ROOT_DIV] =
> > > imx_clk_divider2("ahb_root_clk", "ahb_pre_div", base + 0x9000, 0, 6);
> > > - clks[IMX7D_IPG_ROOT_CLK] = imx_clk_divider2("ipg_root_clk",
> > > "ahb_root_clk", base + 0x9080, 0, 2);
> > > + clks[IMX7D_IPG_ROOT_CLK] = imx_clk_divider_flags("ipg_root_clk",
> > > 

REPLY AS SOON AS POSSIBLE

2018-08-12 Thread Dr Chien Direktor von Hang Seng
I am Vice Chairman of Hang Seng Bank, I have Important Matter to Discuss with 
you concerning my late client. Died without a NEXT OF KIN. Send me your private 
email for full details information. email me at E-Mail: dray...@gmail.com

Regards 
Mr.Fung


REPLY AS SOON AS POSSIBLE

2018-08-12 Thread Dr Chien Direktor von Hang Seng
I am Vice Chairman of Hang Seng Bank, I have Important Matter to Discuss with 
you concerning my late client. Died without a NEXT OF KIN. Send me your private 
email for full details information. email me at E-Mail: dray...@gmail.com

Regards 
Mr.Fung


[PATCH] Bluetooth: mediatek: fix semicolon.cocci warnings

2018-08-12 Thread kbuild test robot
From: kbuild test robot 

drivers/bluetooth/btmtk.c:86:2-3: Unneeded semicolon


 Remove unneeded semicolon.

Generated by: scripts/coccinelle/misc/semicolon.cocci

Fixes: a52562c05bdf ("Bluetooth: mediatek: Add protocol support for MediaTek 
MT7668U USB devices")
CC: Sean Wang 
Signed-off-by: kbuild test robot 
---

 btmtk.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/drivers/bluetooth/btmtk.c
+++ b/drivers/bluetooth/btmtk.c
@@ -83,7 +83,7 @@ btmtk_hci_wmt_sync(struct hci_dev *hdev,
else
status = BTMTK_WMT_ON_UNDONE;
break;
-   };
+   }
 
if (params->status)
*params->status = status;


[PATCH] Bluetooth: mediatek: fix semicolon.cocci warnings

2018-08-12 Thread kbuild test robot
From: kbuild test robot 

drivers/bluetooth/btmtk.c:86:2-3: Unneeded semicolon


 Remove unneeded semicolon.

Generated by: scripts/coccinelle/misc/semicolon.cocci

Fixes: a52562c05bdf ("Bluetooth: mediatek: Add protocol support for MediaTek 
MT7668U USB devices")
CC: Sean Wang 
Signed-off-by: kbuild test robot 
---

 btmtk.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/drivers/bluetooth/btmtk.c
+++ b/drivers/bluetooth/btmtk.c
@@ -83,7 +83,7 @@ btmtk_hci_wmt_sync(struct hci_dev *hdev,
else
status = BTMTK_WMT_ON_UNDONE;
break;
-   };
+   }
 
if (params->status)
*params->status = status;


Re: [PATCH v1 1/2] Bluetooth: mediatek: Add protocol support for MediaTek MT7668U USB devices

2018-08-12 Thread kbuild test robot
Hi Sean,

I love your patch! Perhaps something to improve:

[auto build test WARNING on bluetooth/master]
[also build test WARNING on v4.18 next-20180810]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/linux-kernel-owner-vger-kernel-org/Bluetooth-mediatek-Add-protocol-support-for-MediaTek-MT7668U-USB-devices/20180813-043802
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth.git 
master


coccinelle warnings: (new ones prefixed by >>)

>> drivers/bluetooth/btmtk.c:86:2-3: Unneeded semicolon

Please review and possibly fold the followup patch.

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


Re: [PATCH v1 1/2] Bluetooth: mediatek: Add protocol support for MediaTek MT7668U USB devices

2018-08-12 Thread kbuild test robot
Hi Sean,

I love your patch! Perhaps something to improve:

[auto build test WARNING on bluetooth/master]
[also build test WARNING on v4.18 next-20180810]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/linux-kernel-owner-vger-kernel-org/Bluetooth-mediatek-Add-protocol-support-for-MediaTek-MT7668U-USB-devices/20180813-043802
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth.git 
master


coccinelle warnings: (new ones prefixed by >>)

>> drivers/bluetooth/btmtk.c:86:2-3: Unneeded semicolon

Please review and possibly fold the followup patch.

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


Re: Linux 3.18.111

2018-08-12 Thread Seung-Woo Kim



On 2018년 08월 10일 19:11, Greg Kroah-Hartman wrote:
> On Fri, Aug 10, 2018 at 03:43:02PM +0900, Seung-Woo Kim wrote:
>> On 2018년 08월 08일 19:06, Seung-Woo Kim wrote:
>>> On 2018년 07월 05일 09:52, Al Viro wrote:
 On Mon, Jul 02, 2018 at 10:01:25PM -0700, Linus Torvalds wrote:
> On Mon, Jul 2, 2018 at 9:43 PM Seung-Woo Kim  
> wrote:
>>
>> I think the commit itself is required. Simple, but not reliable,
>> workaround fix is like below:
>>
>> diff --git a/fs/dcache.c b/fs/dcache.c
>> index a34d401..7c751f2 100644
>> --- a/fs/dcache.c
>> +++ b/fs/dcache.c
>> @@ -1879,6 +1879,8 @@ void d_instantiate_new(struct dentry *entry,
>> struct inode *inode)
>> BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
>> BUG_ON(!inode);
>> lockdep_annotate_inode_mutex_key(inode);
>> +   /* WORKAROUND for calling security_d_instantiate() */
>> +   entry->d_inode = inode;
>> security_d_instantiate(entry, inode);
>> spin_lock(&inode->i_lock);
>> __d_instantiate(entry, inode);
>
> Ugh. That looks horrible even if it might avoid the oops.
>
> I think a much better solution is to back-port commit b296821a7c42
> ("xattr_handler: pass dentry and inode as separate arguments of
> ->get()") to older kernels. Then the inode is passed down all the way,
> and you don't have people try to get it from the (not yet initialized)
> dentry.
>
> But there might be other parts missing too, and I didn't look at how
> easy/painful that backport would be.
>
> Al - comments? This is all because of commit 1e2e547a93a0 ("do
> d_instantiate/unlock_new_inode combinations safely") being marked for
> stable, and various cases of security_d_instantiate() calling down to
> getxattr. Which used to not get the inode at all, so those older
> kernels use d_inode(dentry), which doesn't work in this path since
> dentry->d_inode hasn't been instantiated yet..

 You also want b96809173e94 and ce23e6401334 there...
>>>
>>> For above two commits, also b296821a7c42 is required. And after
>>> backport, smack still crashed because of setxattr. To fix it, 5930122683df
>>> and 3767e255b390 are also required.
>>>
>>> By the way, has no one met this kind of getxattr crash issue with
>>> selinux on 3.18.y?
>>>
>>
>> I have checked with selinux, and it is confirmed that there is no crash
>> because selinux_d_instantiate() has null check for inode. So, it is only
>> security smack issue.
> 
> So are the 5 patches you sent ok to apply to the 3.18-stable tree?  Or
> do we need to do something else?
> 

Those 5 patches are fine in my smack environment. I have not tested all
the file systems in run-time except ext2/4 and I only tested build for
those file systems.

Best Regards,
- Seung-Woo Kim

> thanks,
> 
> greg k-h
> 
> 


Re: Linux 3.18.111

2018-08-12 Thread Seung-Woo Kim



On 2018년 08월 10일 19:11, Greg Kroah-Hartman wrote:
> On Fri, Aug 10, 2018 at 03:43:02PM +0900, Seung-Woo Kim wrote:
>> On 2018년 08월 08일 19:06, Seung-Woo Kim wrote:
>>> On 2018년 07월 05일 09:52, Al Viro wrote:
 On Mon, Jul 02, 2018 at 10:01:25PM -0700, Linus Torvalds wrote:
> On Mon, Jul 2, 2018 at 9:43 PM Seung-Woo Kim  
> wrote:
>>
>> I think the commit itself is required. Simple, but not reliable,
>> workaround fix is like below:
>>
>> diff --git a/fs/dcache.c b/fs/dcache.c
>> index a34d401..7c751f2 100644
>> --- a/fs/dcache.c
>> +++ b/fs/dcache.c
>> @@ -1879,6 +1879,8 @@ void d_instantiate_new(struct dentry *entry,
>> struct inode *inode)
>> BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
>> BUG_ON(!inode);
>> lockdep_annotate_inode_mutex_key(inode);
>> +   /* WORKAROUND for calling security_d_instantiate() */
>> +   entry->d_inode = inode;
>> security_d_instantiate(entry, inode);
>> spin_lock(&inode->i_lock);
>> __d_instantiate(entry, inode);
>
> Ugh. That looks horrible even if it might avoid the oops.
>
> I think a much better solution is to back-port commit b296821a7c42
> ("xattr_handler: pass dentry and inode as separate arguments of
> ->get()") to older kernels. Then the inode is passed down all the way,
> and you don't have people try to get it from the (not yet initialized)
> dentry.
>
> But there might be other parts missing too, and I didn't look at how
> easy/painful that backport would be.
>
> Al - comments? This is all because of commit 1e2e547a93a0 ("do
> d_instantiate/unlock_new_inode combinations safely") being marked for
> stable, and various cases of security_d_instantiate() calling down to
> getxattr. Which used to not get the inode at all, so those older
> kernels use d_inode(dentry), which doesn't work in this path since
> dentry->d_inode hasn't been instantiated yet..

 You also want b96809173e94 and ce23e6401334 there...
>>>
>>> For above two commits, also b296821a7c42 is required. And after
>>> backport, smack still crashed because of setxattr. To fix it, 5930122683df
>>> and 3767e255b390 are also required.
>>>
>>> By the way, has no one met this kind of getxattr crash issue with
>>> selinux on 3.18.y?
>>>
>>
>> I have checked with selinux, and it is confirmed that there is no crash
>> because selinux_d_instantiate() has null check for inode. So, it is only
>> security smack issue.
> 
> So are the 5 patches you sent ok to apply to the 3.18-stable tree?  Or
> do we need to do something else?
> 

Those 5 patches are fine in my smack environment. I have not tested all
the file systems in run-time except ext2/4 and I only tested build for
those file systems.

Best Regards,
- Seung-Woo Kim

> thanks,
> 
> greg k-h
> 
> 


Re: [PATCH] tools/memory-model: Fix a couple of typos

2018-08-12 Thread Paul E. McKenney
On Sun, Aug 12, 2018 at 02:29:28PM +0900, SeongJae Park wrote:
> This commit fixes a couple of typos in README and recipes.txt.
> 
> Signed-off-by: SeongJae Park 

Good catch on the first "The the"!  (Why say it once when you can say
it twice?)

On the second, the quoted output doesn't have a comma, so the quote of
that output also needs not to have a comma.  So I applied the first
patch and left out the second.  But please let me know if I am missing
something.

Thanx, Paul

> ---
>  tools/memory-model/Documentation/recipes.txt | 2 +-
>  tools/memory-model/README| 2 +-
>  2 files changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/tools/memory-model/Documentation/recipes.txt 
> b/tools/memory-model/Documentation/recipes.txt
> index a40802fa1099..29df496c1c77 100644
> --- a/tools/memory-model/Documentation/recipes.txt
> +++ b/tools/memory-model/Documentation/recipes.txt
> @@ -311,7 +311,7 @@ The smp_wmb() macro orders prior stores against later 
> stores, and the
>  smp_rmb() macro orders prior loads against later loads.  Therefore, if
>  the final value of r0 is 1, the final value of r1 must also be 1.
> 
> -The the xlog_state_switch_iclogs() function in fs/xfs/xfs_log.c contains
> +The xlog_state_switch_iclogs() function in fs/xfs/xfs_log.c contains
>  the following write-side code fragment:
> 
>   log->l_curr_block -= log->l_logBBsize;
> diff --git a/tools/memory-model/README b/tools/memory-model/README
> index ee987ce20aae..73313faf4036 100644
> --- a/tools/memory-model/README
> +++ b/tools/memory-model/README
> @@ -89,7 +89,7 @@ The corresponding output includes:
>Observation SB+fencembonceonces Never 0 200
>Time SB+fencembonceonces 0.16
> 
> -The "Positive: 0 Negative: 200" and the "Never 0 200" indicate
> +The "Positive: 0, Negative: 200" and the "Never 0 200" indicate
>  that during two million trials, the state specified in this litmus
>  test's "exists" clause was not reached.
> 
> -- 
> 2.13.0
> 



Re: [PATCH] tools/memory-model: Fix a couple of typos

2018-08-12 Thread Paul E. McKenney
On Sun, Aug 12, 2018 at 02:29:28PM +0900, SeongJae Park wrote:
> This commit fixes a couple of typos in README and recipes.txt.
> 
> Signed-off-by: SeongJae Park 

Good catch on the first "The the"!  (Why say it once when you can say
it twice?)

On the second, the quoted output doesn't have a comma, so the quote of
that output also needs not to have a comma.  So I applied the first
patch and left out the second.  But please let me know if I am missing
something.

Thanx, Paul

> ---
>  tools/memory-model/Documentation/recipes.txt | 2 +-
>  tools/memory-model/README| 2 +-
>  2 files changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/tools/memory-model/Documentation/recipes.txt 
> b/tools/memory-model/Documentation/recipes.txt
> index a40802fa1099..29df496c1c77 100644
> --- a/tools/memory-model/Documentation/recipes.txt
> +++ b/tools/memory-model/Documentation/recipes.txt
> @@ -311,7 +311,7 @@ The smp_wmb() macro orders prior stores against later 
> stores, and the
>  smp_rmb() macro orders prior loads against later loads.  Therefore, if
>  the final value of r0 is 1, the final value of r1 must also be 1.
> 
> -The the xlog_state_switch_iclogs() function in fs/xfs/xfs_log.c contains
> +The xlog_state_switch_iclogs() function in fs/xfs/xfs_log.c contains
>  the following write-side code fragment:
> 
>   log->l_curr_block -= log->l_logBBsize;
> diff --git a/tools/memory-model/README b/tools/memory-model/README
> index ee987ce20aae..73313faf4036 100644
> --- a/tools/memory-model/README
> +++ b/tools/memory-model/README
> @@ -89,7 +89,7 @@ The corresponding output includes:
>Observation SB+fencembonceonces Never 0 200
>Time SB+fencembonceonces 0.16
> 
> -The "Positive: 0 Negative: 200" and the "Never 0 200" indicate
> +The "Positive: 0, Negative: 200" and the "Never 0 200" indicate
>  that during two million trials, the state specified in this litmus
>  test's "exists" clause was not reached.
> 
> -- 
> 2.13.0
> 



[git pull] vfs.git - a bit that should've been in misc branch

2018-08-12 Thread Al Viro
I expected more fs/dcache.c cleanups this cycle, so that went into
a separate branch; said cleanups have missed the window, so in
hindsight it could've gone into work.misc instead.  Decided not
to cherry-pick, thus the separate pull request.

PS: That's the last pull request for today; the only stuff not included
into those is dhowells' mount series.

PPS: Looking through the sent mail, this commit (pure removal of unused
function) is the only one not posted on fsdevel/l-k this cycle...

The following changes since commit ce397d215ccd07b8ae3f71db689aedb85d56ab40:

  Linux 4.18-rc1 (2018-06-17 08:04:49 +0900)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git work.dcache

for you to fetch changes up to 63a67a926e214dac94e29147c0f3d11499f655a1:

  kill dentry_update_name_case() (2018-06-23 17:16:44 -0400)


Al Viro (1):
  kill dentry_update_name_case()

 fs/dcache.c| 27 ---
 include/linux/dcache.h |  2 --
 2 files changed, 29 deletions(-)


[git pull] vfs.git - a bit that should've been in misc branch

2018-08-12 Thread Al Viro
I expected more fs/dcache.c cleanups this cycle, so that went into
a separate branch; said cleanups have missed the window, so in
hindsight it could've gone into work.misc instead.  Decided not
to cherry-pick, thus the separate pull request.

PS: That's the last pull request for today; the only stuff not included
into those is dhowells' mount series.

PPS: Looking through the sent mail, this commit (pure removal of unused
function) is the only one not posted on fsdevel/l-k this cycle...

The following changes since commit ce397d215ccd07b8ae3f71db689aedb85d56ab40:

  Linux 4.18-rc1 (2018-06-17 08:04:49 +0900)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git work.dcache

for you to fetch changes up to 63a67a926e214dac94e29147c0f3d11499f655a1:

  kill dentry_update_name_case() (2018-06-23 17:16:44 -0400)


Al Viro (1):
  kill dentry_update_name_case()

 fs/dcache.c| 27 ---
 include/linux/dcache.h |  2 --
 2 files changed, 29 deletions(-)


[git pull] vfs.git - misc stuff

2018-08-12 Thread Al Viro
misc cleanups from various folks all over the place

The following changes since commit ce397d215ccd07b8ae3f71db689aedb85d56ab40:

  Linux 4.18-rc1 (2018-06-17 08:04:49 +0900)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git work.misc

for you to fetch changes up to 7964410fcf135d7e76deef4e475816ec02482f7b:

  fs: dcache: Use true and false for boolean values (2018-08-05 15:52:44 -0400)


Al Viro (1):
  fold generic_readlink() into its only caller

Amir Goldstein (1):
  fs: shave 8 bytes off of struct inode

Gustavo A. R. Silva (1):
  fs: dcache: Use true and false for boolean values

Matthew Wilcox (2):
  fs: Fix attr.c kernel-doc
  fs: Add more kernel-doc to the produced documentation

Vasily Averin (1):
  removed extra extern file_fdatawait_range

 Documentation/filesystems/index.rst | 33 +
 fs/attr.c   |  5 +++--
 fs/dcache.c | 12 ++--
 fs/namei.c  | 36 
 include/linux/fs.h  |  7 +++
 5 files changed, 57 insertions(+), 36 deletions(-)


[git pull] vfs.git - misc stuff

2018-08-12 Thread Al Viro
misc cleanups from various folks all over the place

The following changes since commit ce397d215ccd07b8ae3f71db689aedb85d56ab40:

  Linux 4.18-rc1 (2018-06-17 08:04:49 +0900)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git work.misc

for you to fetch changes up to 7964410fcf135d7e76deef4e475816ec02482f7b:

  fs: dcache: Use true and false for boolean values (2018-08-05 15:52:44 -0400)


Al Viro (1):
  fold generic_readlink() into its only caller

Amir Goldstein (1):
  fs: shave 8 bytes off of struct inode

Gustavo A. R. Silva (1):
  fs: dcache: Use true and false for boolean values

Matthew Wilcox (2):
  fs: Fix attr.c kernel-doc
  fs: Add more kernel-doc to the produced documentation

Vasily Averin (1):
  removed extra extern file_fdatawait_range

 Documentation/filesystems/index.rst | 33 +
 fs/attr.c   |  5 +++--
 fs/dcache.c | 12 ++--
 fs/namei.c  | 36 
 include/linux/fs.h  |  7 +++
 5 files changed, 57 insertions(+), 36 deletions(-)


[git pull] Christoph's aio poll, saner this time around

2018-08-12 Thread Al Viro
This time it's pretty much local to fs/aio.c.  Hopefully race-free...

The following changes since commit 1ffaddd029c867d134a1dde39f540dcc8c52e274:

  Linux 4.18-rc8 (2018-08-05 12:37:41 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git work.aio

for you to fetch changes up to e8693bcfa0b4a56268946f0756153d942cb66cf7:

  aio: allow direct aio poll comletions for keyed wakeups (2018-08-06 10:24:39 
+0200)


Christoph Hellwig (4):
  timerfd: add support for keyed wakeups
  aio: add a iocb refcount
  aio: implement IOCB_CMD_POLL
  aio: allow direct aio poll comletions for keyed wakeups

 fs/aio.c | 208 ++-
 fs/timerfd.c |   6 +-
 include/uapi/linux/aio_abi.h |   6 +-
 3 files changed, 209 insertions(+), 11 deletions(-)


[git pull] Christoph's aio poll, saner this time around

2018-08-12 Thread Al Viro
This time it's pretty much local to fs/aio.c.  Hopefully race-free...

The following changes since commit 1ffaddd029c867d134a1dde39f540dcc8c52e274:

  Linux 4.18-rc8 (2018-08-05 12:37:41 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git work.aio

for you to fetch changes up to e8693bcfa0b4a56268946f0756153d942cb66cf7:

  aio: allow direct aio poll comletions for keyed wakeups (2018-08-06 10:24:39 
+0200)


Christoph Hellwig (4):
  timerfd: add support for keyed wakeups
  aio: add a iocb refcount
  aio: implement IOCB_CMD_POLL
  aio: allow direct aio poll comletions for keyed wakeups

 fs/aio.c | 208 ++-
 fs/timerfd.c |   6 +-
 include/uapi/linux/aio_abi.h |   6 +-
 3 files changed, 209 insertions(+), 11 deletions(-)


[git pull] more conversions of ->lookup() to d_splice_alias()

2018-08-12 Thread Al Viro
should be reasonably complete now - the only leftovers are in ceph.

  Linux 4.18-rc1 (2018-06-17 08:04:49 +0900)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git work.lookup

for you to fetch changes up to 808aa6c5e34a25213bff4d80d3cdb588752879dd:

  Merge branch 'work.hpfs' into work.lookup (2018-08-05 15:51:10 -0400)


Al Viro (6):
  hostfs_lookup: switch to d_splice_alias()
  hpfs: fix an inode leak in lookup, switch to d_splice_alias()
  afs: switch dynroot lookups to d_splice_alias()
  afs_lookup(): switch to d_splice_alias()
  afs_try_auto_mntpt(): return NULL instead of ERR_PTR(-ENOENT)
  Merge branch 'work.hpfs' into work.lookup

 fs/afs/dir.c| 45 ++---
 fs/afs/dynroot.c| 25 ++---
 fs/hostfs/hostfs_kern.c | 28 
 fs/hpfs/dir.c   | 23 +++
 4 files changed, 27 insertions(+), 94 deletions(-)


[git pull] more conversions of ->lookup() to d_splice_alias()

2018-08-12 Thread Al Viro
should be reasonably complete now - the only leftovers are in ceph.

  Linux 4.18-rc1 (2018-06-17 08:04:49 +0900)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git work.lookup

for you to fetch changes up to 808aa6c5e34a25213bff4d80d3cdb588752879dd:

  Merge branch 'work.hpfs' into work.lookup (2018-08-05 15:51:10 -0400)


Al Viro (6):
  hostfs_lookup: switch to d_splice_alias()
  hpfs: fix an inode leak in lookup, switch to d_splice_alias()
  afs: switch dynroot lookups to d_splice_alias()
  afs_lookup(): switch to d_splice_alias()
  afs_try_auto_mntpt(): return NULL instead of ERR_PTR(-ENOENT)
  Merge branch 'work.hpfs' into work.lookup

 fs/afs/dir.c| 45 ++---
 fs/afs/dynroot.c| 25 ++---
 fs/hostfs/hostfs_kern.c | 28 
 fs/hpfs/dir.c   | 23 +++
 4 files changed, 27 insertions(+), 94 deletions(-)


[git pull] Dealing with icache races around mkdir and object creation in general

2018-08-12 Thread Al Viro
* NFS mkdir/open_by_handle race fix
* analogous solution for FUSE, replacing the one currently in mainline
* new primitive to be used when discarding halfway set up inodes on failed
  object creation; gives sane guarantees re icache lookups not returning
  such doomed but still not freed inodes.  A bunch of filesystems switched
  to that animal.
* Miklos' fix for last cycle regression in iget5_locked(); -stable will need
  a slightly different variant, unfortunately.
* misc bits and pieces around things icache-related (in adfs and jfs).

The following changes since commit 877f919e192a09e77962a13d7165783027dee5fd:

  proc: add proc_seq_release (2018-06-27 20:44:38 -0400)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git work.mkdir

for you to fetch changes up to c7b15a8657da7f8d11269c7cc3d8beef10d26b43:

  jfs: don't bother with make_bad_inode() in ialloc() (2018-08-03 16:03:33 
-0400)


Al Viro (11):
  nfs_instantiate(): prevent multiple aliases for directory inode
  kill d_instantiate_no_diralias()
  new primitive: discard_new_inode()
  btrfs: switch to discard_new_inode()
  ufs: switch to discard_new_inode()
  udf: switch to discard_new_inode()
  ext2: make sure that partially set up inodes won't be returned by 
ext2_iget()
  jfs: switch to discard_new_inode()
  new helper: inode_fake_hash()
  adfs: don't put inodes into icache
  jfs: don't bother with make_bad_inode() in ialloc()

Miklos Szeredi (1):
  vfs: don't evict uninitialized inode

 fs/adfs/inode.c|   2 +-
 fs/adfs/super.c|   1 +
 fs/btrfs/inode.c   | 106 ++---
 fs/dcache.c|  29 +-
 fs/ext2/ialloc.c   |   3 +-
 fs/ext2/namei.c|   9 ++---
 fs/fuse/dir.c  |  15 +--
 fs/hfs/inode.c |   2 +-
 fs/inode.c |  53 ++---
 fs/jfs/jfs_imap.c  |   8 +---
 fs/jfs/jfs_inode.c |  10 ++---
 fs/jfs/namei.c |  12 ++
 fs/jfs/super.c |   2 +-
 fs/nfs/dir.c   |   9 +++--
 fs/udf/namei.c |  12 ++
 fs/ufs/ialloc.c|   3 +-
 fs/ufs/namei.c |   9 ++---
 fs/xfs/xfs_iops.c  |   2 +-
 include/linux/dcache.h |   1 -
 include/linux/fs.h |  17 +++-
 20 files changed, 146 insertions(+), 159 deletions(-)


[git pull] Dealing with icache races around mkdir and object creation in general

2018-08-12 Thread Al Viro
* NFS mkdir/open_by_handle race fix
* analogous solution for FUSE, replacing the one currently in mainline
* new primitive to be used when discarding halfway set up inodes on failed
  object creation; gives sane guarantees re icache lookups not returning
  such doomed but still not freed inodes.  A bunch of filesystems switched
  to that animal.
* Miklos' fix for last cycle regression in iget5_locked(); -stable will need
  a slightly different variant, unfortunately.
* misc bits and pieces around things icache-related (in adfs and jfs).

The following changes since commit 877f919e192a09e77962a13d7165783027dee5fd:

  proc: add proc_seq_release (2018-06-27 20:44:38 -0400)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git work.mkdir

for you to fetch changes up to c7b15a8657da7f8d11269c7cc3d8beef10d26b43:

  jfs: don't bother with make_bad_inode() in ialloc() (2018-08-03 16:03:33 
-0400)


Al Viro (11):
  nfs_instantiate(): prevent multiple aliases for directory inode
  kill d_instantiate_no_diralias()
  new primitive: discard_new_inode()
  btrfs: switch to discard_new_inode()
  ufs: switch to discard_new_inode()
  udf: switch to discard_new_inode()
  ext2: make sure that partially set up inodes won't be returned by 
ext2_iget()
  jfs: switch to discard_new_inode()
  new helper: inode_fake_hash()
  adfs: don't put inodes into icache
  jfs: don't bother with make_bad_inode() in ialloc()

Miklos Szeredi (1):
  vfs: don't evict uninitialized inode

 fs/adfs/inode.c|   2 +-
 fs/adfs/super.c|   1 +
 fs/btrfs/inode.c   | 106 ++---
 fs/dcache.c|  29 +-
 fs/ext2/ialloc.c   |   3 +-
 fs/ext2/namei.c|   9 ++---
 fs/fuse/dir.c  |  15 +--
 fs/hfs/inode.c |   2 +-
 fs/inode.c |  53 ++---
 fs/jfs/jfs_imap.c  |   8 +---
 fs/jfs/jfs_inode.c |  10 ++---
 fs/jfs/namei.c |  12 ++
 fs/jfs/super.c |   2 +-
 fs/nfs/dir.c   |   9 +++--
 fs/udf/namei.c |  12 ++
 fs/ufs/ialloc.c|   3 +-
 fs/ufs/namei.c |   9 ++---
 fs/xfs/xfs_iops.c  |   2 +-
 include/linux/dcache.h |   1 -
 include/linux/fs.h |  17 +++-
 20 files changed, 146 insertions(+), 159 deletions(-)


[git pull] rework of open-related logics

2018-08-12 Thread Al Viro
* "do we need fput() or put_filp()" rules are gone - it's always fput() now.
  We keep track of that state where it belongs - in ->f_mode.
* int *opened mess killed - in finish_open(), in ->atomic_open() instances
  and in fs/namei.c code around do_last()/lookup_open()/atomic_open().
* alloc_file() wrappers with saner calling conventions are introduced
  (alloc_file_clone() and alloc_file_pseudo()); callers converted, with
  much simplification.
* while we are at it, saner calling conventions for path_init() and
  link_path_walk(), simplifying things inside fs/namei.c (both on
  open-related paths and elsewhere).

The following changes since commit c7e9075fb89362812059fbf8e25bb4a6e825c4c5:

  ocxlflash_getfile(): fix double-iput() on alloc_file() failures (2018-07-10 
23:29:03 -0400)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git work.open3

for you to fetch changes up to 5f336e722cc961be94d264d96b90c92888fffae1:

  few more cleanups of link_path_walk() callers (2018-07-12 10:04:31 -0400)


Al Viro (40):
  fold security_file_free() into file_free()
  turn filp_clone_open() into inline wrapper for dentry_open()
  create_pipe_files(): use fput() if allocation of the second file fails
  make sure do_dentry_open() won't return positive as an error
  alloc_file(): switch to passing O_... flags instead of FMODE_... mode
  pass creds to get_empty_filp(), make sure dentry_open() passes the right 
creds
  pass ->f_flags value to alloc_empty_file()
  get rid of cred argument of vfs_open() and do_dentry_open()
  security_file_open(): lose cred argument
  ->file_open(): lose cred argument
  introduce FMODE_OPENED
  fold put_filp() into fput()
  lift fput() on late failures into path_openat()
  now we can fold open_check_o_direct() into do_dentry_open()
  switch all remaining checks for FILE_OPENED to FMODE_OPENED
  introduce FMODE_CREATED and switch to it
  IMA: don't propagate opened through the entire thing
  getting rid of 'opened' argument of ->atomic_open() - part 1
  getting rid of 'opened' argument of ->atomic_open() - part 2
  get rid of 'opened' argument of ->atomic_open() - part 3
  get rid of 'opened' in path_openat() and the helpers downstream
  ->atomic_open(): return 0 in all success cases
  document ->atomic_open() changes
  switch atomic_open() and lookup_open() to returning 0 in all success cases
  kill FILE_{CREATED,OPENED}
  new wrapper: alloc_file_pseudo()
  __shmem_file_setup(): reorder allocations
  ... and switch shmem_file_setup() to alloc_file_pseudo()
  cxl_getfile(): switch to alloc_file_pseudo()
  ocxlflash_getfile(): switch to alloc_file_pseudo()
  hugetlb_file_setup(): switch to alloc_file_pseudo()
  anon_inode_getfile(): switch to alloc_file_pseudo()
  create_pipe_files(): switch the first allocation to alloc_file_pseudo()
  new helper: alloc_file_clone()
  do_shmat(): grab shp->shm_file earlier, switch to alloc_file_clone()
  make alloc_file() static
  document alloc_file() changes
  make path_init() unconditionally paired with terminate_walk()
  allow link_path_walk() to take ERR_PTR()
  few more cleanups of link_path_walk() callers

 Documentation/filesystems/Locking |   2 +-
 Documentation/filesystems/porting |  20 +++
 Documentation/filesystems/vfs.txt |  18 +--
 drivers/gpu/drm/drm_lease.c   |   2 +-
 drivers/misc/cxl/api.c|  22 +---
 drivers/scsi/cxlflash/ocxl_hw.c   |  24 +---
 fs/9p/vfs_inode.c |   7 +-
 fs/9p/vfs_inode_dotl.c|   7 +-
 fs/aio.c  |  24 +---
 fs/anon_inodes.c  |  30 +
 fs/bad_inode.c|   2 +-
 fs/binfmt_misc.c  |   2 +-
 fs/ceph/file.c|   7 +-
 fs/ceph/super.h   |   3 +-
 fs/cifs/cifsfs.h  |   3 +-
 fs/cifs/dir.c |   7 +-
 fs/file_table.c   |  85 +
 fs/fuse/dir.c |  10 +-
 fs/gfs2/inode.c   |  32 +++--
 fs/hugetlbfs/inode.c  |  54 +++-
 fs/internal.h |   5 +-
 fs/namei.c| 225 +-
 fs/nfs/dir.c  |  14 ++-
 fs/nfs/nfs4_fs.h  |   2 +-
 fs/nfs/nfs4proc.c |   2 +-
 fs/nfsd/vfs.c |   2 +-
 fs/open.c |  88 -
 fs/pipe.c |  43 ++-
 include/linux/file.h  |   8 +-
 include/linux/fs.h|  17 +--
 include/linux/ima.h   |   4 +-
 include/linux/lsm_hooks.h 

[git pull] rework of open-related logics

2018-08-12 Thread Al Viro
* "do we need fput() or put_filp()" rules are gone - it's always fput() now.
  We keep track of that state where it belongs - in ->f_mode.
* int *opened mess killed - in finish_open(), in ->atomic_open() instances
  and in fs/namei.c code around do_last()/lookup_open()/atomic_open().
* alloc_file() wrappers with saner calling conventions are introduced
  (alloc_file_clone() and alloc_file_pseudo()); callers converted, with
  much simplification.
* while we are at it, saner calling conventions for path_init() and
  link_path_walk(), simplifying things inside fs/namei.c (both on
  open-related paths and elsewhere).

The following changes since commit c7e9075fb89362812059fbf8e25bb4a6e825c4c5:

  ocxlflash_getfile(): fix double-iput() on alloc_file() failures (2018-07-10 
23:29:03 -0400)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git work.open3

for you to fetch changes up to 5f336e722cc961be94d264d96b90c92888fffae1:

  few more cleanups of link_path_walk() callers (2018-07-12 10:04:31 -0400)


Al Viro (40):
  fold security_file_free() into file_free()
  turn filp_clone_open() into inline wrapper for dentry_open()
  create_pipe_files(): use fput() if allocation of the second file fails
  make sure do_dentry_open() won't return positive as an error
  alloc_file(): switch to passing O_... flags instead of FMODE_... mode
  pass creds to get_empty_filp(), make sure dentry_open() passes the right 
creds
  pass ->f_flags value to alloc_empty_file()
  get rid of cred argument of vfs_open() and do_dentry_open()
  security_file_open(): lose cred argument
  ->file_open(): lose cred argument
  introduce FMODE_OPENED
  fold put_filp() into fput()
  lift fput() on late failures into path_openat()
  now we can fold open_check_o_direct() into do_dentry_open()
  switch all remaining checks for FILE_OPENED to FMODE_OPENED
  introduce FMODE_CREATED and switch to it
  IMA: don't propagate opened through the entire thing
  getting rid of 'opened' argument of ->atomic_open() - part 1
  getting rid of 'opened' argument of ->atomic_open() - part 2
  get rid of 'opened' argument of ->atomic_open() - part 3
  get rid of 'opened' in path_openat() and the helpers downstream
  ->atomic_open(): return 0 in all success cases
  document ->atomic_open() changes
  switch atomic_open() and lookup_open() to returning 0 in all success cases
  kill FILE_{CREATED,OPENED}
  new wrapper: alloc_file_pseudo()
  __shmem_file_setup(): reorder allocations
  ... and switch shmem_file_setup() to alloc_file_pseudo()
  cxl_getfile(): switch to alloc_file_pseudo()
  ocxlflash_getfile(): switch to alloc_file_pseudo()
  hugetlb_file_setup(): switch to alloc_file_pseudo()
  anon_inode_getfile(): switch to alloc_file_pseudo()
  create_pipe_files(): switch the first allocation to alloc_file_pseudo()
  new helper: alloc_file_clone()
  do_shmat(): grab shp->shm_file earlier, switch to alloc_file_clone()
  make alloc_file() static
  document alloc_file() changes
  make path_init() unconditionally paired with terminate_walk()
  allow link_path_walk() to take ERR_PTR()
  few more cleanups of link_path_walk() callers

 Documentation/filesystems/Locking |   2 +-
 Documentation/filesystems/porting |  20 +++
 Documentation/filesystems/vfs.txt |  18 +--
 drivers/gpu/drm/drm_lease.c   |   2 +-
 drivers/misc/cxl/api.c|  22 +---
 drivers/scsi/cxlflash/ocxl_hw.c   |  24 +---
 fs/9p/vfs_inode.c |   7 +-
 fs/9p/vfs_inode_dotl.c|   7 +-
 fs/aio.c  |  24 +---
 fs/anon_inodes.c  |  30 +
 fs/bad_inode.c|   2 +-
 fs/binfmt_misc.c  |   2 +-
 fs/ceph/file.c|   7 +-
 fs/ceph/super.h   |   3 +-
 fs/cifs/cifsfs.h  |   3 +-
 fs/cifs/dir.c |   7 +-
 fs/file_table.c   |  85 +
 fs/fuse/dir.c |  10 +-
 fs/gfs2/inode.c   |  32 +++--
 fs/hugetlbfs/inode.c  |  54 +++-
 fs/internal.h |   5 +-
 fs/namei.c| 225 +-
 fs/nfs/dir.c  |  14 ++-
 fs/nfs/nfs4_fs.h  |   2 +-
 fs/nfs/nfs4proc.c |   2 +-
 fs/nfsd/vfs.c |   2 +-
 fs/open.c |  88 -
 fs/pipe.c |  43 ++-
 include/linux/file.h  |   8 +-
 include/linux/fs.h|  17 +--
 include/linux/ima.h   |   4 +-
 include/linux/lsm_hooks.h 

Re: [PATCH] cpuidle: menu: Handle stopped tick more aggressively

2018-08-12 Thread Dan Carpenter
Hi Rafael,

I love your patch! Perhaps something to improve:

url:
https://github.com/0day-ci/linux/commits/Rafael-J-Wysocki/cpuidle-menu-Handle-stopped-tick-more-aggressively/20180811-191914
base:   https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git 
linux-next

smatch warnings:
drivers/cpuidle/governors/menu.c:374 menu_select() error: uninitialized symbol 
'first_idx'.

# 
https://github.com/0day-ci/linux/commit/5f9f09809ebd1b4f7820c9925a0cbd417bd3a823
git remote add linux-review https://github.com/0day-ci/linux
git remote update linux-review
git checkout 5f9f09809ebd1b4f7820c9925a0cbd417bd3a823
vim +/first_idx +374 drivers/cpuidle/governors/menu.c

1f85f87d4 Arjan van de Ven  2010-05-24  276  
4f86d3a8e Len Brown 2007-10-03  277  /**
4f86d3a8e Len Brown 2007-10-03  278   * menu_select - 
selects the next idle state to enter
46bcfad7a Deepthi Dharwar   2011-10-28  279   * @drv: cpuidle 
driver containing state data
4f86d3a8e Len Brown 2007-10-03  280   * @dev: the CPU
45f1ff59e Rafael J. Wysocki 2018-03-22  281   * @stop_tick: 
indication on whether or not to stop the tick
4f86d3a8e Len Brown 2007-10-03  282   */
45f1ff59e Rafael J. Wysocki 2018-03-22  283  static int 
menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
45f1ff59e Rafael J. Wysocki 2018-03-22  284
bool *stop_tick)
4f86d3a8e Len Brown 2007-10-03  285  {
229b6863b Christoph Lameter 2014-08-17  286 struct 
menu_device *data = this_cpu_ptr(&menu_devices);
0fc784fb0 Rafael J. Wysocki 2018-05-30  287 int latency_req 
= cpuidle_governor_latency_req(dev->cpu);
4f86d3a8e Len Brown 2007-10-03  288 int i;
3ed09c945 Nicholas Piggin   2017-06-26  289 int first_idx;
3ed09c945 Nicholas Piggin   2017-06-26  290 int idx;
96e95182e tuukka.tikka...@linaro.org2014-02-24  291 unsigned int 
interactivity_req;
e132b9b3b Rik van Riel  2016-03-16  292 unsigned int 
expected_interval;
372ba8cb4 Mel Gorman2014-08-06  293 unsigned long 
nr_iowaiters, cpu_load;
296bb1e51 Rafael J. Wysocki 2018-04-05  294 ktime_t 
delta_next;
4f86d3a8e Len Brown 2007-10-03  295  
672917dcc Corrado Zoccolo   2009-09-21  296 if 
(data->needs_update) {
46bcfad7a Deepthi Dharwar   2011-10-28  297 
menu_update(drv, dev);
672917dcc Corrado Zoccolo   2009-09-21  298 
data->needs_update = 0;
672917dcc Corrado Zoccolo   2009-09-21  299 }
672917dcc Corrado Zoccolo   2009-09-21  300  
69d25870f Arjan van de Ven  2009-09-21  301 /* Special case 
when user has set very strict latency requirement */
45f1ff59e Rafael J. Wysocki 2018-03-22  302 if 
(unlikely(latency_req == 0)) {
45f1ff59e Rafael J. Wysocki 2018-03-22  303 
*stop_tick = false;
a2bd92023 venkatesh.pallip...@intel.com 2008-07-30  304 return 
0;
45f1ff59e Rafael J. Wysocki 2018-03-22  305 }
a2bd92023 venkatesh.pallip...@intel.com 2008-07-30  306  
69d25870f Arjan van de Ven  2009-09-21  307 /* determine 
the expected residency time, round up */
296bb1e51 Rafael J. Wysocki 2018-04-05  308 
data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length(&delta_next));
69d25870f Arjan van de Ven  2009-09-21  309  
5f9f09809 Rafael J. Wysocki 2018-08-10  310 /*
5f9f09809 Rafael J. Wysocki 2018-08-10  311  * If the tick 
is already stopped, the cost of possible short idle
5f9f09809 Rafael J. Wysocki 2018-08-10  312  * duration 
misprediction is much higher, because the CPU may be stuck
5f9f09809 Rafael J. Wysocki 2018-08-10  313  * in a shallow 
idle state for a long time as a result of it.  In that
5f9f09809 Rafael J. Wysocki 2018-08-10  314  * case say we 
might mispredict and use the known time till the closest
5f9f09809 Rafael J. Wysocki 2018-08-10  315  * timer event 
for the idle state selection.
5f9f09809 Rafael J. Wysocki 2018-08-10  316  */
5f9f09809 Rafael J. Wysocki 2018-08-10  317 if 
(tick_nohz_tick_stopped()) {
5f9f09809 Rafael J. Wysocki 2018-08-10  318 
data->predicted_us = ktime_to_us(delta_next);
5f9f09809 Rafael J. Wysocki 2018-08-10  319 goto 
select;

^^^
We hit this goto

5f9f09809 Rafael J. Wysocki 2018-08-10  320 }
5f9f09809 Rafael J. 

Re: [PATCH] cpuidle: menu: Handle stopped tick more aggressively

2018-08-12 Thread Dan Carpenter
Hi Rafael,

I love your patch! Perhaps something to improve:

url:
https://github.com/0day-ci/linux/commits/Rafael-J-Wysocki/cpuidle-menu-Handle-stopped-tick-more-aggressively/20180811-191914
base:   https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git 
linux-next

smatch warnings:
drivers/cpuidle/governors/menu.c:374 menu_select() error: uninitialized symbol 
'first_idx'.

# 
https://github.com/0day-ci/linux/commit/5f9f09809ebd1b4f7820c9925a0cbd417bd3a823
git remote add linux-review https://github.com/0day-ci/linux
git remote update linux-review
git checkout 5f9f09809ebd1b4f7820c9925a0cbd417bd3a823
vim +/first_idx +374 drivers/cpuidle/governors/menu.c

1f85f87d4 Arjan van de Ven  2010-05-24  276  
4f86d3a8e Len Brown 2007-10-03  277  /**
4f86d3a8e Len Brown 2007-10-03  278   * menu_select - 
selects the next idle state to enter
46bcfad7a Deepthi Dharwar   2011-10-28  279   * @drv: cpuidle 
driver containing state data
4f86d3a8e Len Brown 2007-10-03  280   * @dev: the CPU
45f1ff59e Rafael J. Wysocki 2018-03-22  281   * @stop_tick: 
indication on whether or not to stop the tick
4f86d3a8e Len Brown 2007-10-03  282   */
45f1ff59e Rafael J. Wysocki 2018-03-22  283  static int 
menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
45f1ff59e Rafael J. Wysocki 2018-03-22  284
bool *stop_tick)
4f86d3a8e Len Brown 2007-10-03  285  {
229b6863b Christoph Lameter 2014-08-17  286 struct 
menu_device *data = this_cpu_ptr(&menu_devices);
0fc784fb0 Rafael J. Wysocki 2018-05-30  287 int latency_req 
= cpuidle_governor_latency_req(dev->cpu);
4f86d3a8e Len Brown 2007-10-03  288 int i;
3ed09c945 Nicholas Piggin   2017-06-26  289 int first_idx;
3ed09c945 Nicholas Piggin   2017-06-26  290 int idx;
96e95182e tuukka.tikka...@linaro.org2014-02-24  291 unsigned int 
interactivity_req;
e132b9b3b Rik van Riel  2016-03-16  292 unsigned int 
expected_interval;
372ba8cb4 Mel Gorman2014-08-06  293 unsigned long 
nr_iowaiters, cpu_load;
296bb1e51 Rafael J. Wysocki 2018-04-05  294 ktime_t 
delta_next;
4f86d3a8e Len Brown 2007-10-03  295  
672917dcc Corrado Zoccolo   2009-09-21  296 if 
(data->needs_update) {
46bcfad7a Deepthi Dharwar   2011-10-28  297 
menu_update(drv, dev);
672917dcc Corrado Zoccolo   2009-09-21  298 
data->needs_update = 0;
672917dcc Corrado Zoccolo   2009-09-21  299 }
672917dcc Corrado Zoccolo   2009-09-21  300  
69d25870f Arjan van de Ven  2009-09-21  301 /* Special case 
when user has set very strict latency requirement */
45f1ff59e Rafael J. Wysocki 2018-03-22  302 if 
(unlikely(latency_req == 0)) {
45f1ff59e Rafael J. Wysocki 2018-03-22  303 
*stop_tick = false;
a2bd92023 venkatesh.pallip...@intel.com 2008-07-30  304 return 
0;
45f1ff59e Rafael J. Wysocki 2018-03-22  305 }
a2bd92023 venkatesh.pallip...@intel.com 2008-07-30  306  
69d25870f Arjan van de Ven  2009-09-21  307 /* determine 
the expected residency time, round up */
296bb1e51 Rafael J. Wysocki 2018-04-05  308 
data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length(&delta_next));
69d25870f Arjan van de Ven  2009-09-21  309  
5f9f09809 Rafael J. Wysocki 2018-08-10  310 /*
5f9f09809 Rafael J. Wysocki 2018-08-10  311  * If the tick 
is already stopped, the cost of possible short idle
5f9f09809 Rafael J. Wysocki 2018-08-10  312  * duration 
misprediction is much higher, because the CPU may be stuck
5f9f09809 Rafael J. Wysocki 2018-08-10  313  * in a shallow 
idle state for a long time as a result of it.  In that
5f9f09809 Rafael J. Wysocki 2018-08-10  314  * case say we 
might mispredict and use the known time till the closest
5f9f09809 Rafael J. Wysocki 2018-08-10  315  * timer event 
for the idle state selection.
5f9f09809 Rafael J. Wysocki 2018-08-10  316  */
5f9f09809 Rafael J. Wysocki 2018-08-10  317 if 
(tick_nohz_tick_stopped()) {
5f9f09809 Rafael J. Wysocki 2018-08-10  318 
data->predicted_us = ktime_to_us(delta_next);
5f9f09809 Rafael J. Wysocki 2018-08-10  319 goto 
select;

^^^
We hit this goto

5f9f09809 Rafael J. Wysocki 2018-08-10  320 }
5f9f09809 Rafael J. 

Linux 4.18

2018-08-12 Thread Linus Torvalds
One week late(r) and here we are - 4.18 is out there.

It was a very calm week, and arguably I could just have released on
schedule last week, but we did have some minor updates. Mostly
networking, but some vfs race fixes (mentioned in the rc8 announcement
as "pending") and a couple of driver fixes (scsi, networking, i2c).
Some other minor random things (arm crypto fix, parisc memory ordering
fix). Shortlog appended for the (few) details.

Some of these I was almost ready to just delay until the next merge
window, but they were marked for stable anyway, so it would just have
caused more backporting. The vfs fixes are for old races that are
really hard to hit (which is obviously why they are old and weren't
noticed earlier). Some of them _have_ been seen in real life, some of
them probably need explicit help to ever trigger (ie artificial delays
just to show that "yes, this can actually happen in theory").

Anyway, with this, the merge window for 4.19 is obviously open, and
I'll start pulling tomorrow. I already have a couple of dozen pull
requests pending due to the one-week delay of 4.18, but keep them
coming.

 Linus

---

Al Viro (5):
  root dentries need RCU-delayed freeing
  cxgb4: mk_act_open_req() buggers ->{local, peer}_ip on big-endian hosts
  fix mntput/mntput race
  fix __legitimize_mnt()/mntput() race
  make sure that __dentry_kill() always invalidates d_seq, unhashed or not

Alexey Kodanev (1):
  dccp: fix undefined behavior with 'cwnd' shift in ccid2_cwnd_restart()

Andrew Lunn (1):
  dsa: slave: eee: Allow ports to use phylink

Andrey Ryabinin (1):
  lib/ubsan: remove null-pointer checks

Ard Biesheuvel (1):
  crypto: arm64 - revert NEON yield for fast AEAD implementations

Bart Van Assche (2):
  scsi: qedi: Fix a potential buffer overflow
  scsi: sr: Avoid that opening a CD-ROM hangs with runtime power
management enabled

Benjamin Tissoires (1):
  gpiolib-acpi: make sure we trigger edge events at least once on boot

Colin Ian King (1):
  net: thunderx: check for failed allocation lmac->dmacs

Cong Wang (3):
  ipv6: fix double refcount of fib6_metrics
  vsock: split dwork to avoid reinitializations
  llc: use refcount_inc_not_zero() for llc_sap_find()

Daniel Borkmann (3):
  bpf, sockmap: fix bpf_tcp_sendmsg sock error handling
  bpf, sockmap: fix leak in bpf_tcp_sendmsg wait for mem path
  bpf, sockmap: fix cork timeout for select due to epipe

David Howells (1):
  rxrpc: Fix the keepalive generator [ver #2]

Dmitry Bogdanov (1):
  net: aquantia: Fix IFF_ALLMULTI flag functionality

George Cherian (1):
  i2c: xlp9xx: Fix case where SSIF read transaction completes early

Helge Deller (1):
  parisc: Enable CONFIG_MLONGCALLS by default

Huy Nguyen (1):
  net/mlx5e: Cleanup of dcbnl related fields

Ivan Khoronzhuk (2):
  net: ethernet: ti: cpsw: clear all entries when delete vid
  net: ethernet: ti: cpsw: fix runtime_pm while add/kill vlan

Jason Wang (1):
  vhost: reset metadata cache when initializing new IOTLB

Jesper Dangaard Brouer (3):
  xdp: fix bug in cpumap teardown code path
  samples/bpf: xdp_redirect_cpu adjustment to reproduce teardown race easier
  xdp: fix bug in devmap teardown code path

Jim Gill (1):
  scsi: vmw_pvscsi: Return DID_RESET for status SAM_STAT_COMMAND_TERMINATED

Johannes Thumshirn (3):
  scsi: fcoe: fix use-after-free in fcoe_ctlr_els_send
  scsi: fcoe: drop frames in ELS LOGO error path
  scsi: fcoe: clear FC_RP_STARTED flags when receiving a LOGO

John David Anglin (1):
  parisc: Define mb() and add memory barriers to assembler unlock sequences

Juergen Gross (1):
  xen/netfront: don't cache skb_shinfo()

Kieran Bingham (1):
  MAINTAINERS: GDB: update e-mail address

Linus Torvalds (2):
  init: rename and re-order boot_cpu_state_init()
  Linux 4.18

Martin KaFai Lau (1):
  bpf: btf: Change tools/lib/bpf/btf to LGPL

Minchan Kim (1):
  zram: remove BD_CAP_SYNCHRONOUS_IO with writeback feature

Ondrej Mosnacek (1):
  crypto: x86/aegis,morus - Fix and simplify CPUID checks

Or Gerlitz (1):
  net/mlx5e: Properly check if hairpin is possible between two functions

Quinn Tran (1):
  scsi: qla2xxx: Fix memory leak for allocating abort IOCB

Sreekanth Reddy (1):
  scsi: mpt3sas: Swap I/O memory read value back to cpu endianness

Ursula Braun (3):
  net/smc: no shutdown in state SMC_LISTEN
  net/smc: allow sysctl rmem and wmem defaults for servers
  net/smc: move sock lock in smc_ioctl()

Willem de Bruijn (1):
  packet: refine ring v3 block size test to hold one frame

Xin Long (1):
  ip6_tunnel: use the right value for ipv4 min mtu check in ip6_tnl_xmit

Ying Xue (1):
  tipc: fix an interrupt unsafe locking scenario

jie@chenj...@huwei.com (1):
  mm/memory.c: check return value of ioremap_prot


Linux 4.18

2018-08-12 Thread Linus Torvalds
One week late(r) and here we are - 4.18 is out there.

It was a very calm week, and arguably I could just have released on
schedule last week, but we did have some minor updates. Mostly
networking, but some vfs race fixes (mentioned in the rc8 announcement
as "pending") and a couple of driver fixes (scsi, networking, i2c).
Some other minor random things (arm crypto fix, parisc memory ordering
fix). Shortlog appended for the (few) details.

Some of these I was almost ready to just delay until the next merge
window, but they were marked for stable anyway, so it would just have
caused more backporting. The vfs fixes are for old races that are
really hard to hit (which is obviously why they are old and weren't
noticed earlier). Some of them _have_ been seen in real life, some of
them probably need explicit help to ever trigger (ie artificial delays
just to show that "yes, this can actually happen in theory").

Anyway, with this, the merge window for 4.19 is obviously open, and
I'll start pulling tomorrow. I already have a couple of dozen pull
requests pending due to the one-week delay of 4.18, but keep them
coming.

 Linus

---

Al Viro (5):
  root dentries need RCU-delayed freeing
  cxgb4: mk_act_open_req() buggers ->{local, peer}_ip on big-endian hosts
  fix mntput/mntput race
  fix __legitimize_mnt()/mntput() race
  make sure that __dentry_kill() always invalidates d_seq, unhashed or not

Alexey Kodanev (1):
  dccp: fix undefined behavior with 'cwnd' shift in ccid2_cwnd_restart()

Andrew Lunn (1):
  dsa: slave: eee: Allow ports to use phylink

Andrey Ryabinin (1):
  lib/ubsan: remove null-pointer checks

Ard Biesheuvel (1):
  crypto: arm64 - revert NEON yield for fast AEAD implementations

Bart Van Assche (2):
  scsi: qedi: Fix a potential buffer overflow
  scsi: sr: Avoid that opening a CD-ROM hangs with runtime power
management enabled

Benjamin Tissoires (1):
  gpiolib-acpi: make sure we trigger edge events at least once on boot

Colin Ian King (1):
  net: thunderx: check for failed allocation lmac->dmacs

Cong Wang (3):
  ipv6: fix double refcount of fib6_metrics
  vsock: split dwork to avoid reinitializations
  llc: use refcount_inc_not_zero() for llc_sap_find()

Daniel Borkmann (3):
  bpf, sockmap: fix bpf_tcp_sendmsg sock error handling
  bpf, sockmap: fix leak in bpf_tcp_sendmsg wait for mem path
  bpf, sockmap: fix cork timeout for select due to epipe

David Howells (1):
  rxrpc: Fix the keepalive generator [ver #2]

Dmitry Bogdanov (1):
  net: aquantia: Fix IFF_ALLMULTI flag functionality

George Cherian (1):
  i2c: xlp9xx: Fix case where SSIF read transaction completes early

Helge Deller (1):
  parisc: Enable CONFIG_MLONGCALLS by default

Huy Nguyen (1):
  net/mlx5e: Cleanup of dcbnl related fields

Ivan Khoronzhuk (2):
  net: ethernet: ti: cpsw: clear all entries when delete vid
  net: ethernet: ti: cpsw: fix runtime_pm while add/kill vlan

Jason Wang (1):
  vhost: reset metadata cache when initializing new IOTLB

Jesper Dangaard Brouer (3):
  xdp: fix bug in cpumap teardown code path
  samples/bpf: xdp_redirect_cpu adjustment to reproduce teardown race easier
  xdp: fix bug in devmap teardown code path

Jim Gill (1):
  scsi: vmw_pvscsi: Return DID_RESET for status SAM_STAT_COMMAND_TERMINATED

Johannes Thumshirn (3):
  scsi: fcoe: fix use-after-free in fcoe_ctlr_els_send
  scsi: fcoe: drop frames in ELS LOGO error path
  scsi: fcoe: clear FC_RP_STARTED flags when receiving a LOGO

John David Anglin (1):
  parisc: Define mb() and add memory barriers to assembler unlock sequences

Juergen Gross (1):
  xen/netfront: don't cache skb_shinfo()

Kieran Bingham (1):
  MAINTAINERS: GDB: update e-mail address

Linus Torvalds (2):
  init: rename and re-order boot_cpu_state_init()
  Linux 4.18

Martin KaFai Lau (1):
  bpf: btf: Change tools/lib/bpf/btf to LGPL

Minchan Kim (1):
  zram: remove BD_CAP_SYNCHRONOUS_IO with writeback feature

Ondrej Mosnacek (1):
  crypto: x86/aegis,morus - Fix and simplify CPUID checks

Or Gerlitz (1):
  net/mlx5e: Properly check if hairpin is possible between two functions

Quinn Tran (1):
  scsi: qla2xxx: Fix memory leak for allocating abort IOCB

Sreekanth Reddy (1):
  scsi: mpt3sas: Swap I/O memory read value back to cpu endianness

Ursula Braun (3):
  net/smc: no shutdown in state SMC_LISTEN
  net/smc: allow sysctl rmem and wmem defaults for servers
  net/smc: move sock lock in smc_ioctl()

Willem de Bruijn (1):
  packet: refine ring v3 block size test to hold one frame

Xin Long (1):
  ip6_tunnel: use the right value for ipv4 min mtu check in ip6_tnl_xmit

Ying Xue (1):
  tipc: fix an interrupt unsafe locking scenario

jie@chenj...@huwei.com (1):
  mm/memory.c: check return value of ioremap_prot


Re: KASAN: use-after-free Read in finish_task_switch

2018-08-12 Thread syzbot

syzbot has found a reproducer for the following crash on:

HEAD commit:4110b42356f3 Add linux-next specific files for 20180810
git tree:   linux-next
console output: https://syzkaller.appspot.com/x/log.txt?x=107162c440
kernel config:  https://syzkaller.appspot.com/x/.config?x=1d80606e3795a4f5
dashboard link: https://syzkaller.appspot.com/bug?extid=e62f8ba2b2af8dbd6729
compiler:   gcc (GCC) 8.0.1 20180413 (experimental)
syzkaller repro:https://syzkaller.appspot.com/x/repro.syz?x=16d33cc440
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=142c720240

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+e62f8ba2b2af8dbd6...@syzkaller.appspotmail.com

random: sshd: uninitialized urandom read (32 bytes read)
random: sshd: uninitialized urandom read (32 bytes read)
random: sshd: uninitialized urandom read (32 bytes read)
==
BUG: KASAN: use-after-free in __fire_sched_in_preempt_notifiers  
kernel/sched/core.c:2481 [inline]
BUG: KASAN: use-after-free in fire_sched_in_preempt_notifiers  
kernel/sched/core.c:2487 [inline]
BUG: KASAN: use-after-free in finish_task_switch+0x544/0x870  
kernel/sched/core.c:2679

Read of size 8 at addr 8801c79482d8 by task syz-executor216/4445

CPU: 0 PID: 4445 Comm: syz-executor216 Not tainted  
4.18.0-rc8-next-20180810+ #36
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011

Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113
 print_address_description+0x6c/0x20b mm/kasan/report.c:256
 kasan_report_error mm/kasan/report.c:354 [inline]
 kasan_report.cold.7+0x242/0x30d mm/kasan/report.c:412
 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433
 __fire_sched_in_preempt_notifiers kernel/sched/core.c:2481 [inline]
 fire_sched_in_preempt_notifiers kernel/sched/core.c:2487 [inline]
 finish_task_switch+0x544/0x870 kernel/sched/core.c:2679
 context_switch kernel/sched/core.c:2826 [inline]
 __schedule+0x884/0x1ec0 kernel/sched/core.c:3471
 schedule+0xfb/0x450 kernel/sched/core.c:3515
 freezable_schedule include/linux/freezer.h:172 [inline]
 futex_wait_queue_me+0x3f9/0x840 kernel/futex.c:2530
 futex_wait+0x45b/0xa20 kernel/futex.c:2645
 do_futex+0x336/0x27d0 kernel/futex.c:3527
 __do_sys_futex kernel/futex.c:3587 [inline]
 __se_sys_futex kernel/futex.c:3555 [inline]
 __x64_sys_futex+0x472/0x6a0 kernel/futex.c:3555
 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x4468a9
Code: e8 0c e8 ff ff 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7  
48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff  
ff 0f 83 7b 08 fc ff c3 66 2e 0f 1f 84 00 00 00 00

RSP: 002b:7f34d3e78da8 EFLAGS: 0246 ORIG_RAX: 00ca
RAX: ffda RBX: 006dbc88 RCX: 004468a9
RDX:  RSI:  RDI: 006dbc88
RBP: 006dbc80 R08:  R09: 
R10:  R11: 0246 R12: 006dbc8c
R13: 0030656c69662f2e R14: 6c75662f7665642f R15: 006dbd6c

Allocated by task 4439:
 save_stack+0x43/0xd0 mm/kasan/kasan.c:448
 set_track mm/kasan/kasan.c:460 [inline]
 kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553
 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:490
 kmem_cache_alloc+0x12e/0x760 mm/slab.c:3554
 kmem_cache_zalloc include/linux/slab.h:697 [inline]
 vmx_create_vcpu+0xcf/0x28b0 arch/x86/kvm/vmx.c:10682
 kvm_arch_vcpu_create+0xe5/0x220 arch/x86/kvm/x86.c:8401
 kvm_vm_ioctl_create_vcpu arch/x86/kvm/../../../virt/kvm/kvm_main.c:2476  
[inline]

 kvm_vm_ioctl+0x488/0x1d80 arch/x86/kvm/../../../virt/kvm/kvm_main.c:2977
 vfs_ioctl fs/ioctl.c:46 [inline]
 file_ioctl fs/ioctl.c:501 [inline]
 do_vfs_ioctl+0x1de/0x1720 fs/ioctl.c:685
 ksys_ioctl+0xa9/0xd0 fs/ioctl.c:702
 __do_sys_ioctl fs/ioctl.c:709 [inline]
 __se_sys_ioctl fs/ioctl.c:707 [inline]
 __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:707
 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Freed by task 4423:
 save_stack+0x43/0xd0 mm/kasan/kasan.c:448
 set_track mm/kasan/kasan.c:460 [inline]
 __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521
 kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
 __cache_free mm/slab.c:3498 [inline]
 kmem_cache_free+0x86/0x2d0 mm/slab.c:3756
 vmx_free_vcpu+0x26b/0x300 arch/x86/kvm/vmx.c:10676
 kvm_arch_vcpu_free arch/x86/kvm/x86.c:8387 [inline]
 kvm_free_vcpus arch/x86/kvm/x86.c:8836 [inline]
 kvm_arch_destroy_vm+0x365/0x7c0 arch/x86/kvm/x86.c:8933
 kvm_destroy_vm arch/x86/kvm/../../../virt/kvm/kvm_main.c:752 [inline]
 kvm_put_kvm+0x73f/0x1060 arch/x86/kvm/../../../virt/kvm/kvm_main.c:773
 kvm_vcpu_release+0x7b/0xa0 arch/x86/kvm/../../../virt/kvm/kvm_main.c:2407
 __fput+0x376/0x8a0 fs/file_table.c:279
 fput+0x15/0x20 fs/file_table.c:312
 task_work_run+0x1e8/0x2a0 

Re: KASAN: use-after-free Read in finish_task_switch

2018-08-12 Thread syzbot

syzbot has found a reproducer for the following crash on:

HEAD commit:4110b42356f3 Add linux-next specific files for 20180810
git tree:   linux-next
console output: https://syzkaller.appspot.com/x/log.txt?x=107162c440
kernel config:  https://syzkaller.appspot.com/x/.config?x=1d80606e3795a4f5
dashboard link: https://syzkaller.appspot.com/bug?extid=e62f8ba2b2af8dbd6729
compiler:   gcc (GCC) 8.0.1 20180413 (experimental)
syzkaller repro:https://syzkaller.appspot.com/x/repro.syz?x=16d33cc440
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=142c720240

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+e62f8ba2b2af8dbd6...@syzkaller.appspotmail.com

random: sshd: uninitialized urandom read (32 bytes read)
random: sshd: uninitialized urandom read (32 bytes read)
random: sshd: uninitialized urandom read (32 bytes read)
==
BUG: KASAN: use-after-free in __fire_sched_in_preempt_notifiers  
kernel/sched/core.c:2481 [inline]
BUG: KASAN: use-after-free in fire_sched_in_preempt_notifiers  
kernel/sched/core.c:2487 [inline]
BUG: KASAN: use-after-free in finish_task_switch+0x544/0x870  
kernel/sched/core.c:2679

Read of size 8 at addr 8801c79482d8 by task syz-executor216/4445

CPU: 0 PID: 4445 Comm: syz-executor216 Not tainted  
4.18.0-rc8-next-20180810+ #36
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011

Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113
 print_address_description+0x6c/0x20b mm/kasan/report.c:256
 kasan_report_error mm/kasan/report.c:354 [inline]
 kasan_report.cold.7+0x242/0x30d mm/kasan/report.c:412
 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433
 __fire_sched_in_preempt_notifiers kernel/sched/core.c:2481 [inline]
 fire_sched_in_preempt_notifiers kernel/sched/core.c:2487 [inline]
 finish_task_switch+0x544/0x870 kernel/sched/core.c:2679
 context_switch kernel/sched/core.c:2826 [inline]
 __schedule+0x884/0x1ec0 kernel/sched/core.c:3471
 schedule+0xfb/0x450 kernel/sched/core.c:3515
 freezable_schedule include/linux/freezer.h:172 [inline]
 futex_wait_queue_me+0x3f9/0x840 kernel/futex.c:2530
 futex_wait+0x45b/0xa20 kernel/futex.c:2645
 do_futex+0x336/0x27d0 kernel/futex.c:3527
 __do_sys_futex kernel/futex.c:3587 [inline]
 __se_sys_futex kernel/futex.c:3555 [inline]
 __x64_sys_futex+0x472/0x6a0 kernel/futex.c:3555
 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x4468a9
Code: e8 0c e8 ff ff 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7  
48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff  
ff 0f 83 7b 08 fc ff c3 66 2e 0f 1f 84 00 00 00 00

RSP: 002b:7f34d3e78da8 EFLAGS: 0246 ORIG_RAX: 00ca
RAX: ffda RBX: 006dbc88 RCX: 004468a9
RDX:  RSI:  RDI: 006dbc88
RBP: 006dbc80 R08:  R09: 
R10:  R11: 0246 R12: 006dbc8c
R13: 0030656c69662f2e R14: 6c75662f7665642f R15: 006dbd6c

Allocated by task 4439:
 save_stack+0x43/0xd0 mm/kasan/kasan.c:448
 set_track mm/kasan/kasan.c:460 [inline]
 kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553
 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:490
 kmem_cache_alloc+0x12e/0x760 mm/slab.c:3554
 kmem_cache_zalloc include/linux/slab.h:697 [inline]
 vmx_create_vcpu+0xcf/0x28b0 arch/x86/kvm/vmx.c:10682
 kvm_arch_vcpu_create+0xe5/0x220 arch/x86/kvm/x86.c:8401
 kvm_vm_ioctl_create_vcpu arch/x86/kvm/../../../virt/kvm/kvm_main.c:2476  
[inline]

 kvm_vm_ioctl+0x488/0x1d80 arch/x86/kvm/../../../virt/kvm/kvm_main.c:2977
 vfs_ioctl fs/ioctl.c:46 [inline]
 file_ioctl fs/ioctl.c:501 [inline]
 do_vfs_ioctl+0x1de/0x1720 fs/ioctl.c:685
 ksys_ioctl+0xa9/0xd0 fs/ioctl.c:702
 __do_sys_ioctl fs/ioctl.c:709 [inline]
 __se_sys_ioctl fs/ioctl.c:707 [inline]
 __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:707
 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Freed by task 4423:
 save_stack+0x43/0xd0 mm/kasan/kasan.c:448
 set_track mm/kasan/kasan.c:460 [inline]
 __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521
 kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
 __cache_free mm/slab.c:3498 [inline]
 kmem_cache_free+0x86/0x2d0 mm/slab.c:3756
 vmx_free_vcpu+0x26b/0x300 arch/x86/kvm/vmx.c:10676
 kvm_arch_vcpu_free arch/x86/kvm/x86.c:8387 [inline]
 kvm_free_vcpus arch/x86/kvm/x86.c:8836 [inline]
 kvm_arch_destroy_vm+0x365/0x7c0 arch/x86/kvm/x86.c:8933
 kvm_destroy_vm arch/x86/kvm/../../../virt/kvm/kvm_main.c:752 [inline]
 kvm_put_kvm+0x73f/0x1060 arch/x86/kvm/../../../virt/kvm/kvm_main.c:773
 kvm_vcpu_release+0x7b/0xa0 arch/x86/kvm/../../../virt/kvm/kvm_main.c:2407
 __fput+0x376/0x8a0 fs/file_table.c:279
 fput+0x15/0x20 fs/file_table.c:312
 task_work_run+0x1e8/0x2a0 

Re: [PATCH v2 0/4] clk: meson: clk-pll driver update

2018-08-12 Thread Martin Blumenstingl
Hi Jerome,

On Wed, Aug 1, 2018 at 4:00 PM Jerome Brunet  wrote:
>
> This patchset is yet another round of update to the amlogic pll driver.
>
>  1) Enable bit is added so we don't rely on the bootloader or the init
> value to enable the pll device.
>  2) Remove unnecessary CLK_GET_RATE_NOCACHE flags.
>  3) OD post dividers are removed from the pll driver. This simplifies the
> driver and lets us provide the clocks which exist between those
> dividers. Some devices are actually using these clocks.
>  4) The rates hard coded in parameter tables are removed. Instead, we
> only rely on the parent rate and the parameters to calculate the
> output rate, which is a lot better.
>
> This series has been tested on the gxl libretech cc and axg s400.
> I did not test it on meson8b yet.
>
> Changes since v1: [0]
>  - improve commit description of patch 1
>  - remove unnecessary CLK_GET_RATE_NOCACHE flags.
>  - add missing CLK_SET_RATE_PARENT.
>
> [0]: https://lkml.kernel.org/r/20180717095617.12240-1-jbru...@baylibre.com
>
> Jerome Brunet (4):
>   clk: meson: clk-pll: add enable bit
>   clk: meson: clk-pll: drop CLK_GET_RATE_NOCACHE where unnecessary
>   clk: meson: clk-pll: remove od parameters
>   clk: meson: clk-pll: drop hard-coded rates from pll tables
for the whole series:
Reviewed-by: Martin Blumenstingl 

as well as:
Tested-by: Martin Blumenstingl 
(tested on Meson8b / Odroid-C1, even CPU frequency scaling still works
with my out-of-tree patches)


Regards
Martin


Re: [PATCH v2 0/4] clk: meson: clk-pll driver update

2018-08-12 Thread Martin Blumenstingl
Hi Jerome,

On Wed, Aug 1, 2018 at 4:00 PM Jerome Brunet  wrote:
>
> This patchset is yet another round of update to the amlogic pll driver.
>
>  1) Enable bit is added so we don't rely on the bootloader or the init
> value to enable the pll device.
>  2) Remove unnecessary CLK_GET_RATE_NOCACHE flags.
>  3) OD post dividers are removed from the pll driver. This simplifies the
> driver and lets us provide the clocks which exist between those
> dividers. Some devices are actually using these clocks.
>  4) The rates hard coded in parameter tables are removed. Instead, we
> only rely on the parent rate and the parameters to calculate the
> output rate, which is a lot better.
>
> This series has been tested on the gxl libretech cc and axg s400.
> I did not test it on meson8b yet.
>
> Changes since v1: [0]
>  - improve commit description of patch 1
>  - remove unnecessary CLK_GET_RATE_NOCACHE flags.
>  - add missing CLK_SET_RATE_PARENT.
>
> [0]: https://lkml.kernel.org/r/20180717095617.12240-1-jbru...@baylibre.com
>
> Jerome Brunet (4):
>   clk: meson: clk-pll: add enable bit
>   clk: meson: clk-pll: drop CLK_GET_RATE_NOCACHE where unnecessary
>   clk: meson: clk-pll: remove od parameters
>   clk: meson: clk-pll: drop hard-coded rates from pll tables
for the whole series:
Reviewed-by: Martin Blumenstingl 

as well as:
Tested-by: Martin Blumenstingl 
(tested on Meson8b / Odroid-C1, even CPU frequency scaling still works
with my out-of-tree patches)


Regards
Martin


Re: [PATCH] arm64: dts: rockchip: Add idle-states to device tree for rk3399

2018-08-12 Thread Tao Huang
Hi Heiko:

On 2018年08月10日 04:09, Heiko Stuebner wrote:
> Am Mittwoch, 6. Juli 2016, 10:20:54 CEST schrieb Caesar Wang:
>
>>  arch/arm64/boot/dts/rockchip/rk3399.dtsi | 18 ++
>>  1 file changed, 18 insertions(+)
>>
>> diff --git a/arch/arm64/boot/dts/rockchip/rk3399.dtsi 
>> b/arch/arm64/boot/dts/rockchip/rk3399.dtsi
>> index a6dd623..12ce265 100644
>> --- a/arch/arm64/boot/dts/rockchip/rk3399.dtsi
>> +++ b/arch/arm64/boot/dts/rockchip/rk3399.dtsi
>> @@ -101,6 +101,18 @@
>>  };
>>  };
>>  
>> +idle-states {
>> +entry-method = "psci";
>> +cpu_sleep: cpu-sleep-0 {
>> +compatible = "arm,idle-state";
>> +local-timer-stop;
>> +arm,psci-suspend-param = <0x001>;
>> +entry-latency-us = <350>;
>> +exit-latency-us = <600>;
>> +min-residency-us = <1150>;
> Looking at the chromeos kernel, there are some more patches adapting
> this idle-state to use different timings.
Yes, we have other values, so the values in this patch are wrong.
>
> There also was a cluster-idle state added for a while but that seems to
> cause audio issues according to the CrOS history.

DMA or Audio driver should add PM_QOS_CPU_DMA_LATENCY or other methods to avoid 
the effects of idle.
Idle itself is good.

Thanks!



Re: [PATCH] arm64: dts: rockchip: Add idle-states to device tree for rk3399

2018-08-12 Thread Tao Huang
Hi Heiko:

On 2018年08月10日 04:09, Heiko Stuebner wrote:
> Am Mittwoch, 6. Juli 2016, 10:20:54 CEST schrieb Caesar Wang:
>
>>  arch/arm64/boot/dts/rockchip/rk3399.dtsi | 18 ++
>>  1 file changed, 18 insertions(+)
>>
>> diff --git a/arch/arm64/boot/dts/rockchip/rk3399.dtsi 
>> b/arch/arm64/boot/dts/rockchip/rk3399.dtsi
>> index a6dd623..12ce265 100644
>> --- a/arch/arm64/boot/dts/rockchip/rk3399.dtsi
>> +++ b/arch/arm64/boot/dts/rockchip/rk3399.dtsi
>> @@ -101,6 +101,18 @@
>>  };
>>  };
>>  
>> +idle-states {
>> +entry-method = "psci";
>> +cpu_sleep: cpu-sleep-0 {
>> +compatible = "arm,idle-state";
>> +local-timer-stop;
>> +arm,psci-suspend-param = <0x001>;
>> +entry-latency-us = <350>;
>> +exit-latency-us = <600>;
>> +min-residency-us = <1150>;
> Looking at the chromeos kernel, there are some more patches adapting
> this idle-state to use different timings.
Yes, we have other values, so the values in this patch are wrong.
>
> There also was a cluster-idle state added for a while but that seems to
> cause audio issues according to the CrOS history.

DMA or Audio driver should add PM_QOS_CPU_DMA_LATENCY or other methods to avoid 
the effects of idle.
Idle itself is good.

Thanks!



Re: [PATCH] docs: provide more details about security bug reporting

2018-08-12 Thread Randy Dunlap
On 08/12/2018 06:23 AM, Willy Tarreau wrote:
> Hi Linus,
> 
> please consider applying the attached patch to improve the doc on
> the security reporting process.
> 
> Thanks,
> Willy


Hi,

I have a few corrections/comments below.
They can be addressed later if the patch is being merged immediately.


~~~
 Documentation/admin-guide/security-bugs.rst | 81 +
 1 file changed, 81 insertions(+)

diff --git a/Documentation/admin-guide/security-bugs.rst 
b/Documentation/admin-guide/security-bugs.rst
index 30491d9..91ecd48 100644
--- a/Documentation/admin-guide/security-bugs.rst
+++ b/Documentation/admin-guide/security-bugs.rst
@@ -26,6 +26,51 @@ information is helpful.  Any exploit code is very helpful 
and will not
 be released without consent from the reporter unless it has already been
 made public.
 
+Analysis
+
+
+Sometimes a bug will be very well understood by some of the security
+officers who will propose you a patch to test.  Please get prepared to

  will propose to you a patch to test.  Please be prepared to

+receiving extra questions and to provide answers on a timely basis.

 receive more questions and

+There is little chance a bug will get fixed if you send an incomplete
+report and disappear for two weeks.  It is also possible that some of
+the officers will conclude that the behaviour you observed is normal
+and expected, that it is bogus but doesn't present an imminent
+security risk and should rather be discussed on public lists, or that
+it does indeed represent a risk, but that the risk of breakage induced
+by fixing it outweights the risks of the bug being exploited.  In such

  outweighs

+situations, it is possible that you will be requested to post your
+report to another more suitable place.
+
+Analysing a report takes a lot of time, and while sometimes it's
+better to conclude to a wrong alert because there is nothing to fix,

   "to conclude to a wrong alert" needs some fixing, but I don't know 
what...

+it also is annoying if it is discovered that the reporter should have
+found it by himself, because the time lost on this analysis was not
+spent on another one.  This can happen all the time to be wrong about
+a report, but please be careful not to do this too often or your
+reports may not be taken seriously in the end.
+
+As a rule of thumb, it is recommended not to post messages suggesting
+that a bug may exist somewhere.  Since the security team manages
+imminent and important risks, bugs reported there must be based on
+facts and not on beliefs.  It is fine to report a panic message saying
+"I just got this, I don't know how it happened but it scares me", it is
+not fine to say "I ran my new automated analysis tool which thinks a
+check is missing here, could someone knowledgeable in this area please
+double-check".  The security team's role is not to have opinions on
+your beliefs but to spot the right people to help fix a real problem.
+
+Very often, some maintainers will be brought to the discussion as the
+analysis progresses. Most of the time these people will not have received
+the initial e-mail, and they're discovering the issue late.  So please do
+not get upset if they ask questions that were already addressed or which
+were present in the initial report.
+
+Also, don't consider the bug fixed until the fix is merged.  It can
+happen that a fix proposed by one of the security officers doesn't suit
+a subsystem maintainer and that it has to be reworked differently,
+possibly after a public discussion.
+
 Disclosure
 --
 
@@ -44,6 +89,25 @@ timeframe varies from immediate (esp. if it's already 
publicly known bug)
 to a few weeks.  As a basic default policy, we expect report date to
 release date to be on the order of 7 days.
 
+There is no point threatening to make a report public after XX days
+without a response because usually what you will end up with is a fix
+that is merged much earlier than what you possibly expected, for example
+if you promised to someone not to publish it before a certain date.
+Please just understand that the security team's goal is for your bug to
+be fixed as fast as possible and not to sleep on it.
+
+If you report a particularly complex issue that you intend to discuss
+at a conference a few weeks or months later, you cannot really expect
+from the security team to find a solution in time and at the same time
+to refrain from disclosing the issue to a broader audience or
+releasing the fix.  So at the very least you will have to take your
+dispositions to deal with a disclosure which happens much earlier than
+your public talk about the issue.  Also if you only sent an early
+notification about a forthcoming problem that is not yet fully
+disclosed, you must not expect the security officers to ping you again
+later about the issue; you are responsible for reloading the
+discussion at the right moment once all elements are gathered.
+
 Coordination
 
 
@@ 

Re: [PATCH] docs: provide more details about security bug reporting

2018-08-12 Thread Randy Dunlap
On 08/12/2018 06:23 AM, Willy Tarreau wrote:
> Hi Linus,
> 
> please consider applying the attached patch to improve the doc on
> the security reporting process.
> 
> Thanks,
> Willy


Hi,

I have a few corrections/comments below.
They can be addressed later if the patch is being merged immediately.


~~~
 Documentation/admin-guide/security-bugs.rst | 81 +
 1 file changed, 81 insertions(+)

diff --git a/Documentation/admin-guide/security-bugs.rst 
b/Documentation/admin-guide/security-bugs.rst
index 30491d9..91ecd48 100644
--- a/Documentation/admin-guide/security-bugs.rst
+++ b/Documentation/admin-guide/security-bugs.rst
@@ -26,6 +26,51 @@ information is helpful.  Any exploit code is very helpful 
and will not
 be released without consent from the reporter unless it has already been
 made public.
 
+Analysis
+
+
+Sometimes a bug will be very well understood by some of the security
+officers who will propose you a patch to test.  Please get prepared to

  will propose to you a patch to test.  Please be prepared to

+receiving extra questions and to provide answers on a timely basis.

 receive more questions and

+There is little chance a bug will get fixed if you send an incomplete
+report and disappear for two weeks.  It is also possible that some of
+the officers will conclude that the behaviour you observed is normal
+and expected, that it is bogus but doesn't present an imminent
+security risk and should rather be discussed on public lists, or that
+it does indeed represent a risk, but that the risk of breakage induced
+by fixing it outweights the risks of the bug being exploited.  In such

  outweighs

+situations, it is possible that you will be requested to post your
+report to another more suitable place.
+
+Analysing a report takes a lot of time, and while sometimes it's
+better to conclude to a wrong alert because there is nothing to fix,

   "to conclude to a wrong alert" needs some fixing, but I don't know 
what...

+it also is annoying if it is discovered that the reporter should have
+found it by himself, because the time lost on this analysis was not
+spent on another one.  This can happen all the time to be wrong about
+a report, but please be careful not to do this too often or your
+reports may not be taken seriously in the end.
+
+As a rule of thumb, it is recommended not to post messages suggesting
+that a bug may exist somewhere.  Since the security team manages
+imminent and important risks, bugs reported there must be based on
+facts and not on beliefs.  It is fine to report a panic message saying
+"I just got this, I don't know how it happened but it scares me", it is
+not fine to say "I ran my new automated analysis tool which thinks a
+check is missing here, could someone knowledgeable in this area please
+double-check".  The security team's role is not to have opinions on
+your beliefs but to spot the right people to help fix a real problem.
+
+Very often, some maintainers will be brought to the discussion as the
+analysis progresses. Most of the time these people will not have received
+the initial e-mail, and they're discovering the issue late.  So please do
+not get upset if they ask questions that were already addressed or which
+were present in the initial report.
+
+Also, don't consider the bug fixed until the fix is merged.  It can
+happen that a fix proposed by one of the security officers doesn't suit
+a subsystem maintainer and that it has to be reworked differently,
+possibly after a public discussion.
+
 Disclosure
 --
 
@@ -44,6 +89,25 @@ timeframe varies from immediate (esp. if it's already 
publicly known bug)
 to a few weeks.  As a basic default policy, we expect report date to
 release date to be on the order of 7 days.
 
+There is no point threatening to make a report public after XX days
+without a response because usually what you will end up with is a fix
+that is merged much earlier than what you possibly expected, for example
+if you promised to someone not to publish it before a certain date.
+Please just understand that the security team's goal is for your bug to
+be fixed as fast as possible and not to sleep on it.
+
+If you report a particularly complex issue that you intend to discuss
+at a conference a few weeks or months later, you cannot really expect
+from the security team to find a solution in time and at the same time
+to refrain from disclosing the issue to a broader audience or
+releasing the fix.  So at the very least you will have to take your
+dispositions to deal with a disclosure which happens much earlier than
+your public talk about the issue.  Also if you only sent an early
+notification about a forthcoming problem that is not yet fully
+disclosed, you must not expect the security officers to ping you again
+later about the issue; you are responsible for reloading the
+discussion at the right moment once all elements are gathered.
+
 Coordination
 
 
@@ 

[PATCH v1 2/5] cpuidle: menu: Record tick delta value in struct menu_device

2018-08-12 Thread Leo Yan
Since the tick delta is used in multiple places in menu_select(), it's
better to use a single variable to record this value; furthermore, for
better readability we can refactor the code and split out a separate
function for making the tick stopping decision, which also needs to use
the tick delta value as one metric for consideration.

To achieve these purposes, this patch adds a new item 'tick_delta_us' in
struct menu_device to record the tick delta value.  This patch is also a
preparation for optimizing the tick stopping decision in subsequent
patches.

Signed-off-by: Leo Yan 
---
 drivers/cpuidle/governors/menu.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index b972db1..83618ab 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -124,6 +124,7 @@ struct menu_device {
int tick_wakeup;
 
unsigned intnext_timer_us;
+   unsigned inttick_delta_us;
unsigned intpredicted_us;
unsigned intbucket;
unsigned intcorrection_factor[BUCKETS];
@@ -305,6 +306,7 @@ static int menu_select(struct cpuidle_driver *drv, struct 
cpuidle_device *dev,
 
/* determine the expected residency time, round up */
data->next_timer_us = 
ktime_to_us(tick_nohz_get_sleep_length(_next));
+   data->tick_delta_us = ktime_to_us(delta_next);
 
get_iowait_load(_iowaiters, _load);
data->bucket = which_bucket(data->next_timer_us, nr_iowaiters);
@@ -317,7 +319,7 @@ static int menu_select(struct cpuidle_driver *drv, struct 
cpuidle_device *dev,
 * timer event for the idle state selection.
 */
if (tick_nohz_tick_stopped()) {
-   data->predicted_us = ktime_to_us(delta_next);
+   data->predicted_us = data->tick_delta_us;
goto select;
}
 
@@ -400,11 +402,11 @@ static int menu_select(struct cpuidle_driver *drv, struct 
cpuidle_device *dev,
 */
if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
data->predicted_us < TICK_USEC) && !tick_nohz_tick_stopped()) {
-   unsigned int delta_next_us = ktime_to_us(delta_next);
 
*stop_tick = false;
 
-   if (idx > 0 && drv->states[idx].target_residency > 
delta_next_us) {
+   if (idx > 0 &&
+   drv->states[idx].target_residency > data->tick_delta_us) {
/*
 * The tick is not going to be stopped and the target
 * residency of the state to be returned is not within
@@ -417,7 +419,8 @@ static int menu_select(struct cpuidle_driver *drv, struct 
cpuidle_device *dev,
continue;
 
idx = i;
-   if (drv->states[i].target_residency <= 
delta_next_us)
+   if (drv->states[i].target_residency <=
+   data->tick_delta_us)
break;
}
}
-- 
2.7.4



[PATCH v1 4/5] cpuidle: menu: Don't stay in shallow state for a long time

2018-08-12 Thread Leo Yan
To avoid staying in a shallow state for a long time, the menu governor
relies on not stopping the tick when it detects that the prediction is
shorter than the tick period.  This happens to cover most cases, but it
is not completely safe.  For example, if the prediction is 2000us and
TICK_USEC=1000, the condition 'data->predicted_us < TICK_USEC' cannot be
met, which leads to stopping the tick for a shallow state; as a result
the CPU may stay in this shallow state for a very long time.

This patch checks whether the candidate idle state is not the deepest
one and whether the timer will expire after more than 2 times the
maximum target residency.  In that case, although the governor selects a
shallow state according to the prediction, the timer (the most reliable
wakeup source) will come very late, so the CPU may stay in the shallow
state for a long time; the patch doesn't stop the tick in this case,
which avoids the powernightmares issue.

Signed-off-by: Leo Yan 
---
 drivers/cpuidle/governors/menu.c | 21 +
 1 file changed, 21 insertions(+)

diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 4f02207..566c65c 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -284,6 +284,10 @@ static unsigned int get_typical_interval(struct 
menu_device *data)
 static bool menu_decide_stopping_tick(struct cpuidle_driver *drv,
  struct menu_device *data, int idx)
 {
+   int max_target_residency;
+
+   max_target_residency = drv->states[drv->state_count-1].target_residency;
+
/*
 * If the tick has been stopped yet, force to stop it afterwards and
 * don't give chance to set *stop_tick to false.
@@ -302,6 +306,23 @@ static bool menu_decide_stopping_tick(struct 
cpuidle_driver *drv,
if (data->predicted_us < TICK_USEC)
return false;
 
+   /*
+* The candidate idle state isn't deepest one, on the other hand
+* the most reliable wakeup source is timer (compare against to
+* interrupts) says it will come after more than 2 times of maximum
+* target residency, this means the CPU has risk to stay in shallow
+* state for more than 2 times of maximum target residency.
+*
+* It's acceptable to stay in the shallow state at this time but we
+* need to ensure to wake up the CPU by tick to check if has better
+* choice.  Finally it can have choice to select deeper state and
+* avoid the CPU staying in shallow state for very long time and
+* without any wake up event.
+*/
+   if (idx < drv->state_count - 1 &&
+   data->next_timer_us > max_target_residency * 2)
+   return false;
+
/* Otherwise, let's stop the tick at this time. */
return true;
 }
-- 
2.7.4



[PATCH v1 3/5] cpuidle: menu: Provide menu_decide_stopping_tick()

2018-08-12 Thread Leo Yan
This patch is only a code refactoring without functional change.
It introduces a new function menu_decide_stopping_tick(), which
focuses on making the tick stopping decision.  With the tick stopping
decision code moved out, the code below is simplified to handle only
the idle state calibration, which saves one indent level in the loop.

Signed-off-by: Leo Yan 
---
 drivers/cpuidle/governors/menu.c | 76 ++--
 1 file changed, 50 insertions(+), 26 deletions(-)

diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 83618ab..4f02207 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -276,6 +276,37 @@ static unsigned int get_typical_interval(struct 
menu_device *data)
 }
 
 /**
+ * menu_decide_stopping_tick - decides if need to stopping tick
+ * @drv: cpuidle driver containing state data
+ * @data: menu_device structure pointer
+ * @idx: the candidate idle state index
+ */
+static bool menu_decide_stopping_tick(struct cpuidle_driver *drv,
+ struct menu_device *data, int idx)
+{
+   /*
+* If the tick has been stopped yet, force to stop it afterwards and
+* don't give chance to set *stop_tick to false.
+*/
+   if (tick_nohz_tick_stopped())
+   return true;
+
+   /* Don't stop the tick if the selected state is a polling one */
+   if (drv->states[idx].flags & CPUIDLE_FLAG_POLLING)
+   return false;
+
+   /*
+* Don't stop the tick if the prediction is shorter than the
+* tick period length.
+*/
+   if (data->predicted_us < TICK_USEC)
+   return false;
+
+   /* Otherwise, let's stop the tick at this time. */
+   return true;
+}
+
+/**
  * menu_select - selects the next idle state to enter
  * @drv: cpuidle driver containing state data
  * @dev: the CPU
@@ -396,33 +427,26 @@ static int menu_select(struct cpuidle_driver *drv, struct 
cpuidle_device *dev,
if (idx == -1)
idx = 0; /* No states enabled. Must use 0. */
 
-   /*
-* Don't stop the tick if the selected state is a polling one or if the
-* expected idle duration is shorter than the tick period length.
-*/
-   if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
-   data->predicted_us < TICK_USEC) && !tick_nohz_tick_stopped()) {
-
-   *stop_tick = false;
+   *stop_tick = menu_decide_stopping_tick(drv, data, idx);
 
-   if (idx > 0 &&
-   drv->states[idx].target_residency > data->tick_delta_us) {
-   /*
-* The tick is not going to be stopped and the target
-* residency of the state to be returned is not within
-* the time until the next timer event including the
-* tick, so try to correct that.
-*/
-   for (i = idx - 1; i >= 0; i--) {
-   if (drv->states[i].disabled ||
-   dev->states_usage[i].disable)
-   continue;
-
-   idx = i;
-   if (drv->states[i].target_residency <=
-   data->tick_delta_us)
-   break;
-   }
+   /* Calibrate the idle state according to the tick event. */
+   if (!*stop_tick && idx > 0 &&
+   drv->states[idx].target_residency > data->tick_delta_us) {
+   /*
+* The tick is not going to be stopped and the target
+* residency of the state to be returned is not within
+* the time until the next timer event including the
+* tick, so try to correct that.
+*/
+   for (i = idx - 1; i >= 0; i--) {
+   if (drv->states[i].disabled ||
+   dev->states_usage[i].disable)
+   continue;
+
+   idx = i;
+   if (drv->states[i].target_residency <=
+   data->tick_delta_us)
+   break;
}
}
 
-- 
2.7.4



[PATCH v1 2/5] cpuidle: menu: Record tick delta value in struct menu_device

2018-08-12 Thread Leo Yan
Since the tick delta is used in multiple places in menu_select(), it's
better to use a single variable to record this value; furthermore, for
better readability we can refactor the code and split out a separate
function for making the tick stopping decision, which also needs to use
the tick delta value as one metric for consideration.

To achieve these purposes, this patch adds a new item 'tick_delta_us' in
struct menu_device to record the tick delta value.  This patch is also a
preparation for optimizing the tick stopping decision in subsequent
patches.

Signed-off-by: Leo Yan 
---
 drivers/cpuidle/governors/menu.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index b972db1..83618ab 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -124,6 +124,7 @@ struct menu_device {
int tick_wakeup;
 
unsigned intnext_timer_us;
+   unsigned inttick_delta_us;
unsigned intpredicted_us;
unsigned intbucket;
unsigned intcorrection_factor[BUCKETS];
@@ -305,6 +306,7 @@ static int menu_select(struct cpuidle_driver *drv, struct 
cpuidle_device *dev,
 
/* determine the expected residency time, round up */
data->next_timer_us = 
ktime_to_us(tick_nohz_get_sleep_length(_next));
+   data->tick_delta_us = ktime_to_us(delta_next);
 
get_iowait_load(_iowaiters, _load);
data->bucket = which_bucket(data->next_timer_us, nr_iowaiters);
@@ -317,7 +319,7 @@ static int menu_select(struct cpuidle_driver *drv, struct 
cpuidle_device *dev,
 * timer event for the idle state selection.
 */
if (tick_nohz_tick_stopped()) {
-   data->predicted_us = ktime_to_us(delta_next);
+   data->predicted_us = data->tick_delta_us;
goto select;
}
 
@@ -400,11 +402,11 @@ static int menu_select(struct cpuidle_driver *drv, struct 
cpuidle_device *dev,
 */
if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
data->predicted_us < TICK_USEC) && !tick_nohz_tick_stopped()) {
-   unsigned int delta_next_us = ktime_to_us(delta_next);
 
*stop_tick = false;
 
-   if (idx > 0 && drv->states[idx].target_residency > 
delta_next_us) {
+   if (idx > 0 &&
+   drv->states[idx].target_residency > data->tick_delta_us) {
/*
 * The tick is not going to be stopped and the target
 * residency of the state to be returned is not within
@@ -417,7 +419,8 @@ static int menu_select(struct cpuidle_driver *drv, struct 
cpuidle_device *dev,
continue;
 
idx = i;
-   if (drv->states[i].target_residency <= 
delta_next_us)
+   if (drv->states[i].target_residency <=
+   data->tick_delta_us)
break;
}
}
-- 
2.7.4



[PATCH v1 4/5] cpuidle: menu: Don't stay in shallow state for a long time

2018-08-12 Thread Leo Yan
To avoid staying in a shallow state for a long time, the menu governor
relies on not stopping the tick when it detects that the prediction is
shorter than the tick period.  This happens to cover most cases, but it
is not completely safe.  For example, if the prediction is 2000us and
TICK_USEC=1000, the condition 'data->predicted_us < TICK_USEC' cannot be
met, which leads to stopping the tick for a shallow state; as a result
the CPU may stay in this shallow state for a very long time.

This patch checks whether the candidate idle state is not the deepest
one and whether the timer will expire after more than 2 times the
maximum target residency.  In that case, although the governor selects a
shallow state according to the prediction, the timer (the most reliable
wakeup source) will come very late, so the CPU may stay in the shallow
state for a long time; the patch doesn't stop the tick in this case,
which avoids the powernightmares issue.

Signed-off-by: Leo Yan 
---
 drivers/cpuidle/governors/menu.c | 21 +
 1 file changed, 21 insertions(+)

diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 4f02207..566c65c 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -284,6 +284,10 @@ static unsigned int get_typical_interval(struct 
menu_device *data)
 static bool menu_decide_stopping_tick(struct cpuidle_driver *drv,
  struct menu_device *data, int idx)
 {
+   int max_target_residency;
+
+   max_target_residency = drv->states[drv->state_count-1].target_residency;
+
/*
 * If the tick has been stopped yet, force to stop it afterwards and
 * don't give chance to set *stop_tick to false.
@@ -302,6 +306,23 @@ static bool menu_decide_stopping_tick(struct 
cpuidle_driver *drv,
if (data->predicted_us < TICK_USEC)
return false;
 
+   /*
+* The candidate idle state isn't deepest one, on the other hand
+* the most reliable wakeup source is timer (compare against to
+* interrupts) says it will come after more than 2 times of maximum
+* target residency, this means the CPU has risk to stay in shallow
+* state for more than 2 times of maximum target residency.
+*
+* It's acceptable to stay in the shallow state at this time but we
+* need to ensure to wake up the CPU by tick to check if has better
+* choice.  Finally it can have choice to select deeper state and
+* avoid the CPU staying in shallow state for very long time and
+* without any wake up event.
+*/
+   if (idx < drv->state_count - 1 &&
+   data->next_timer_us > max_target_residency * 2)
+   return false;
+
/* Otherwise, let's stop the tick at this time. */
return true;
 }
-- 
2.7.4



[PATCH v1 3/5] cpuidle: menu: Provide menu_decide_stopping_tick()

2018-08-12 Thread Leo Yan
This patch is only a code refactoring without functional change.
It introduces a new function menu_decide_stopping_tick(), which
focuses on making the tick stopping decision.  With the tick stopping
decision code moved out, the code below is simplified to handle only
the idle state calibration, which saves one indent level in the loop.

Signed-off-by: Leo Yan 
---
 drivers/cpuidle/governors/menu.c | 76 ++--
 1 file changed, 50 insertions(+), 26 deletions(-)

diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 83618ab..4f02207 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -276,6 +276,37 @@ static unsigned int get_typical_interval(struct 
menu_device *data)
 }
 
 /**
+ * menu_decide_stopping_tick - decides if need to stopping tick
+ * @drv: cpuidle driver containing state data
+ * @data: menu_device structure pointer
+ * @idx: the candidate idle state index
+ */
+static bool menu_decide_stopping_tick(struct cpuidle_driver *drv,
+ struct menu_device *data, int idx)
+{
+   /*
+* If the tick has been stopped yet, force to stop it afterwards and
+* don't give chance to set *stop_tick to false.
+*/
+   if (tick_nohz_tick_stopped())
+   return true;
+
+   /* Don't stop the tick if the selected state is a polling one */
+   if (drv->states[idx].flags & CPUIDLE_FLAG_POLLING)
+   return false;
+
+   /*
+* Don't stop the tick if the prediction is shorter than the
+* tick period length.
+*/
+   if (data->predicted_us < TICK_USEC)
+   return false;
+
+   /* Otherwise, let's stop the tick at this time. */
+   return true;
+}
+
+/**
  * menu_select - selects the next idle state to enter
  * @drv: cpuidle driver containing state data
  * @dev: the CPU
@@ -396,33 +427,26 @@ static int menu_select(struct cpuidle_driver *drv, struct 
cpuidle_device *dev,
if (idx == -1)
idx = 0; /* No states enabled. Must use 0. */
 
-   /*
-* Don't stop the tick if the selected state is a polling one or if the
-* expected idle duration is shorter than the tick period length.
-*/
-   if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
-   data->predicted_us < TICK_USEC) && !tick_nohz_tick_stopped()) {
-
-   *stop_tick = false;
+   *stop_tick = menu_decide_stopping_tick(drv, data, idx);
 
-   if (idx > 0 &&
-   drv->states[idx].target_residency > data->tick_delta_us) {
-   /*
-* The tick is not going to be stopped and the target
-* residency of the state to be returned is not within
-* the time until the next timer event including the
-* tick, so try to correct that.
-*/
-   for (i = idx - 1; i >= 0; i--) {
-   if (drv->states[i].disabled ||
-   dev->states_usage[i].disable)
-   continue;
-
-   idx = i;
-   if (drv->states[i].target_residency <=
-   data->tick_delta_us)
-   break;
-   }
+   /* Calibrate the idle state according to the tick event. */
+   if (!*stop_tick && idx > 0 &&
+   drv->states[idx].target_residency > data->tick_delta_us) {
+   /*
+* The tick is not going to be stopped and the target
+* residency of the state to be returned is not within
+* the time until the next timer event including the
+* tick, so try to correct that.
+*/
+   for (i = idx - 1; i >= 0; i--) {
+   if (drv->states[i].disabled ||
+   dev->states_usage[i].disable)
+   continue;
+
+   idx = i;
+   if (drv->states[i].target_residency <=
+   data->tick_delta_us)
+   break;
}
}
 
-- 
2.7.4



[PATCH v1 0/5] Improvement stopping tick decision making in 'menu' idle governor

2018-08-12 Thread Leo Yan
We found that the CPU cannot stay in the deepest idle state as expected
when running synthetic workloads with the mainline kernel on an Arm
platform (96boards Hikey620 with octa CA53 CPUs).

The main issue is the criterion for deciding whether to stop the tick;
currently it checks whether the expected interval is less than
TICK_USEC, but this doesn't consider that the next tick delta varies
because the CPU randomly enters and exits idle states; furthermore, it
sticks to TICK_USEC as the boundary for the tick stopping decision,
which unfortunately leaves a hole: a shallow state can be selected with
the tick stopped, so the CPU stays in the shallow state for a long time.

This patch series explores a more reasonable way of making the tick
stopping decision; the most important fix is to avoid the
powernightmares issue once these criteria are applied.  Patches
0001 ~ 0003 refactor the variables and structures for more readable
code and provide a function menu_decide_stopping_tick() which
encapsulates the decision logic.  The last two patches are the primary
improvements: patch 0004 'cpuidle: menu: Don't stay in shallow state
for a long time' introduces a new criterion (stricter than before) for
not stopping the tick in shallow state cases; patch 0005 uses the
dynamic tick delta to replace the static value TICK_USEC when deciding
whether the tick expires before or after the prediction, and from this
comparison we can conclude whether the tick needs to be stopped or not.

With a more accurate tick stopping decision, one immediate benefit is
that the CPUs have more chance to stay in the deepest state; it also
avoids running the tick unnecessarily and so avoids a shallower state
being introduced by the tick event.  The testing results in the table
below demonstrate the improvement from the better tick stopping
decision in this patch series.  We ran a workload generated by rt-app
(a single task with period 5ms and duty cycle
1%/3%/5%/10%/20%/30%/40%); the total running time is 60s.  We collected
the duration of every idle state summed over all CPUs; the unit is
seconds (s).  For the cases (dutycycle=1%/3%/5%/10%/20%) the shallow
state C0/C1 durations are reduced and the time has moved to the deepest
state, so the deepest state C2 duration improves by ~9s to ~21s.  For
the cases (dutycycle=30%/40%) the deepest state durations are on par
with and without the patch series, but there is a minor improvement in
the C1 state duration, which takes time from the C0 state.

Some notations are used in the table:

state: C0: WFI; C1: CPU OFF; C2: Cluster OFF

All testing cases have single task with 5ms period:

                Without patches                 With patches                 Difference
            ------------------------  ------------------------  -------------------------
Duty cycle       C0      C1       C2       C0      C1       C2       C0       C1       C2
        1%    2.397  16.528  471.905    0.916   2.688  487.328   -1.481  -13.840  +15.422
        3%    3.957  20.541  464.434    1.510   2.398  485.914   -2.447  -18.143  +21.480
        5%    2.866   8.609  474.777    1.166   2.250  483.983   -1.699   -6.359   +9.205
       10%    2.893  28.753  453.277    1.147  14.134  469.190   -1.745  -14.618  +15.913
       20%    7.620  41.086  431.735    1.595  35.055  442.482   -6.024   -6.030  +10.747
       30%    4.394  38.328  431.442    1.964  40.857  430.973   -2.430   +2.529   -0.468
       40%    7.390  29.415  430.914    1.789  34.832  431.588   -5.600   +5.417   -0.673


P.S. For the testing, Rafael's patch 'cpuidle: menu: Handle stopped
tick more aggressively' [1] was applied to avoid selecting an unexpected
shallow state after the tick has been stopped.

[1] https://lkml.org/lkml/2018/8/10/259

Leo Yan (5):
  cpuidle: menu: Clean up variables usage in menu_select()
  cpuidle: menu: Record tick delta value in struct menu_device
  cpuidle: menu: Provide menu_decide_stopping_tick()
  cpuidle: menu: Don't stay in shallow state for a long time
  cpuidle: menu: Change to compare prediction with tick delta

 drivers/cpuidle/governors/menu.c | 104 ---
 1 file changed, 76 insertions(+), 28 deletions(-)

-- 
2.7.4



[PATCH v1 5/5] cpuidle: menu: Change to compare prediction with tick delta

2018-08-12 Thread Leo Yan
The tick stopping decision is made by comparing the prediction with
TICK_USEC: if the prediction is shorter than TICK_USEC, the CPU is
likely to be woken up before the tick event, so it's pointless to stop
the tick.  In reality, although the tick period is fixed to TICK_USEC,
the CPU enters and exits idle states at random times, so the next tick
delta varies and lies in the range [0, TICK_USEC].  This can result in
a wrong tick stopping decision: e.g. if the prediction is a 3ms idle
duration and we compare it with TICK_USEC=4000 (HZ=250), we wrongly
conclude that the tick event will come later than the predicted
duration, so the governor doesn't stop the tick; but in fact the tick
may expire after only 1ms, so the tick wakes up the CPU early and the
CPU cannot stay idle for 3ms as expected.

Alternatively, 'data->tick_delta_us' holds the tick delta value and is
an accurate estimation of when the tick event will come.  This patch
changes to compare the prediction with the tick delta rather than with
the static tick interval.

Signed-off-by: Leo Yan 
---
 drivers/cpuidle/governors/menu.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 566c65c..06d5942 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -300,10 +300,11 @@ static bool menu_decide_stopping_tick(struct 
cpuidle_driver *drv,
return false;
 
/*
-* Don't stop the tick if the prediction is shorter than the
-* tick period length.
+* If the prediction is shorter than the next tick event, means
+* the CPU will be waken up before the tick event; don't stop
+* the tick.
 */
-   if (data->predicted_us < TICK_USEC)
+   if (data->predicted_us < data->tick_delta_us)
return false;
 
/*
-- 
2.7.4



[PATCH v1 1/5] cpuidle: menu: Clean up variables usage in menu_select()

2018-08-12 Thread Leo Yan
The usage for two variables 'data->predicted_us' and 'expected_interval'
in menu_select() are confused, especially these two variables are
assigned with each other: firstly 'data->predicted_us' is assigned to
the minimum value between 'data->predicted_us' and 'expected_interval',
so it presents the prediction period for taking account different
factors and include consideration for expected interval; but later
'data->predicted_us' is assigned back to 'expected_interval' and from
then on the function uses 'expected_interval' to select idle state; this
results in 'expected_interval' has two different semantics between the
top half and the bottom half of the same function.

This patch is to clean up the usage of these two variables, we always
use 'data->predicted_us' to present the idle duration predictions and
it can be used to compare with idle state target residency or tick
boundary for choosing idle state; we purely use 'expected_interval' to
record the expected interval value, which is mainly for interval
interrupt estimation.

Signed-off-by: Leo Yan 
---
 drivers/cpuidle/governors/menu.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 5eb7d6f..b972db1 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -363,7 +363,6 @@ static int menu_select(struct cpuidle_driver *drv, struct 
cpuidle_device *dev,
latency_req = interactivity_req;
 
 select:
-   expected_interval = data->predicted_us;
/*
 * Find the idle state with the lowest power while satisfying
 * our constraints.
@@ -386,7 +385,7 @@ static int menu_select(struct cpuidle_driver *drv, struct 
cpuidle_device *dev,
 * expected idle duration so that the tick is retained
 * as long as that target residency is low enough.
 */
-   expected_interval = drv->states[idx].target_residency;
+   data->predicted_us = drv->states[idx].target_residency;
break;
}
idx = i;
@@ -400,7 +399,7 @@ static int menu_select(struct cpuidle_driver *drv, struct 
cpuidle_device *dev,
 * expected idle duration is shorter than the tick period length.
 */
if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
-   expected_interval < TICK_USEC) && !tick_nohz_tick_stopped()) {
+   data->predicted_us < TICK_USEC) && !tick_nohz_tick_stopped()) {
unsigned int delta_next_us = ktime_to_us(delta_next);
 
*stop_tick = false;
-- 
2.7.4



[PATCH v1 5/5] cpuidle: menu: Change to compare prediction with tick delta

2018-08-12 Thread Leo Yan
The tick stopping decision is made by comparing the prediction with
TICK_USEC, if the prediction is shorter than TICK_USEC then this means
the CPU is likely waken up before the tick event so it's pointless to
stop tick.  In reality when make the decision, though the tick period is
fixed to TICK_USEC, but the CPU is randomly entering/exiting idle
states so the next tick delta is float and should be in the range
[0, TICK_USEC].  This can result in wrong decision for stopping tick,
e.g. if the prediction is 3ms idle duration and we compare with
TICK_USEC=4000 (HZ=250), this can lead to a wrong conclusion is the tick
event will be later than the prediction duration so the governor doesn't
stop the tick; but in fact the tick is expired for 1ms, so the tick
wakes up the CPU ahead and the CPU cannot stay in idle for 3ms as
expected.

Alternatively, 'data->tick_delta_us' is for the tick delta value and
it's an accurate estimation for tick event coming.  This patch changes to
compare prediction with tick delta rather than comparing with the static
tick interval.

Signed-off-by: Leo Yan 
---
 drivers/cpuidle/governors/menu.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 566c65c..06d5942 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -300,10 +300,11 @@ static bool menu_decide_stopping_tick(struct 
cpuidle_driver *drv,
return false;
 
/*
-* Don't stop the tick if the prediction is shorter than the
-* tick period length.
+* If the prediction is shorter than the next tick event, means
+* the CPU will be waken up before the tick event; don't stop
+* the tick.
 */
-   if (data->predicted_us < TICK_USEC)
+   if (data->predicted_us < data->tick_delta_us)
return false;
 
/*
-- 
2.7.4



[PATCH v1 1/5] cpuidle: menu: Clean up variables usage in menu_select()

2018-08-12 Thread Leo Yan
The usage for two variables 'data->predicted_us' and 'expected_interval'
in menu_select() are confused, especially these two variables are
assigned with each other: firstly 'data->predicted_us' is assigned to
the minimum value between 'data->predicted_us' and 'expected_interval',
so it presents the prediction period for taking account different
factors and include consideration for expected interval; but later
'data->predicted_us' is assigned back to 'expected_interval' and from
then on the function uses 'expected_interval' to select idle state; this
results in 'expected_interval' has two different semantics between the
top half and the bottom half of the same function.

This patch is to clean up the usage of these two variables, we always
use 'data->predicted_us' to present the idle duration predictions and
it can be used to compare with idle state target residency or tick
boundary for choosing idle state; we purely use 'expected_interval' to
record the expected interval value, which is mainly for interval
interrupt estimation.

Signed-off-by: Leo Yan 
---
 drivers/cpuidle/governors/menu.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 5eb7d6f..b972db1 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -363,7 +363,6 @@ static int menu_select(struct cpuidle_driver *drv, struct 
cpuidle_device *dev,
latency_req = interactivity_req;
 
 select:
-   expected_interval = data->predicted_us;
/*
 * Find the idle state with the lowest power while satisfying
 * our constraints.
@@ -386,7 +385,7 @@ static int menu_select(struct cpuidle_driver *drv, struct 
cpuidle_device *dev,
 * expected idle duration so that the tick is retained
 * as long as that target residency is low enough.
 */
-   expected_interval = drv->states[idx].target_residency;
+   data->predicted_us = drv->states[idx].target_residency;
break;
}
idx = i;
@@ -400,7 +399,7 @@ static int menu_select(struct cpuidle_driver *drv, struct 
cpuidle_device *dev,
 * expected idle duration is shorter than the tick period length.
 */
if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
-   expected_interval < TICK_USEC) && !tick_nohz_tick_stopped()) {
+   data->predicted_us < TICK_USEC) && !tick_nohz_tick_stopped()) {
unsigned int delta_next_us = ktime_to_us(delta_next);
 
*stop_tick = false;
-- 
2.7.4



[PATCH v1 0/5] Improvement stopping tick decision making in 'menu' idle governor

2018-08-12 Thread Leo Yan
We found the CPU cannot stay in deepest idle state as expected with
running synthetic workloads with mainline kernel on Arm platform
(96boards Hikey620 with octa CA53 CPUs).

The main issue is the criteria for decision stopping tick; now
the criteria is checking expected interval is less than TICK_USEC, but
this doesn't consider the next tick delta is float due CPU randomly
enters and exits idle states; furthermore, it's stick to checking
TICK_USEC as boundary for decision stopping tick, unfortunately this has
hole to select a shallow state with stopping tick, so the CPU stays in
shallow state for long time.

This patch series is to explore more reasonable making decision for
stopping tick and the most important fixing is to avoid powernightmares
issue after we apply these criterias for making decisions.  Patches
0001 ~ 0003 are used to refactor the variables and structures for more
readable code, it also provides a function menu_decide_stopping_tick()
which can be used to encapsulate the making decision logics.  The last
two patches are primary for improvement, patch 0004 'cpuidle: menu:
Don't stay in shallow state for a long time' introduces a new criteria
(it's a more strict criteria than before) for not stopping tick for
shallow state cases; patch 0005 is use the dynamic tick delta to replace
the static value TICK_USEC for decision if the tick is expired before or
after the prediction, according this comparison we can get conclusion if
need to stop tick or not.

With more accurate decision for stopping tick, one immediate benefit is
the CPUs have more chance to stay in deepest state, it also can avoid to
run tick unnecessarily and so avoid a shallower state introduced by tick
event.  For the testing result in below table, we can see the result
proves the improvement by better stopping tick decision making in this
patch series, we run the workload generated by rt-app (a single task
with period 5ms and duty cycle 1%/3%/5%/10%/20%/30%/40%), the total
running time is 60s.  We do statistics for all CPUs for all idle states
duration, the unit is second (s), for cases (dutycycle=1%/3%/5%/10%/20%)
we can see the shallow state C0/C1 duration are reduced and the time
has been moved to deepest state, so the deepest state C2 duration can
have improvement for ~9s to ~21s.  for cases (dutycycle=30%/40%) though
we can see the deepest state durations are parity between with and
without patch series, but it has a minor improvement for C1 state
duration by stealing C0 state duration.

Some notations are used in the table:

state: C0: WFI; C1: CPU OFF; C2: Cluster OFF

All testing cases have single task with 5ms period:

                  Without patches              With patches                 Difference
             -----------------------    -----------------------    --------------------------
Duty cycle     C0      C1       C2        C0      C1       C2         C0       C1        C2
    1%       2.397  16.528  471.905     0.916   2.688  487.328     -1.481  -13.840   +15.422
    3%       3.957  20.541  464.434     1.510   2.398  485.914     -2.447  -18.143   +21.480
    5%       2.866   8.609  474.777     1.166   2.250  483.983     -1.699   -6.359    +9.205
   10%       2.893  28.753  453.277     1.147  14.134  469.190     -1.745  -14.618   +15.913
   20%       7.620  41.086  431.735     1.595  35.055  442.482     -6.024   -6.030   +10.747
   30%       4.394  38.328  431.442     1.964  40.857  430.973     -2.430   +2.529    -0.468
   40%       7.390  29.415  430.914     1.789  34.832  431.588     -5.600   +5.417    -0.673


P.s. for the testing, applied Rafael's patch 'cpuidle: menu: Handle
stopped tick more aggressively' [1] to avoid select unexpected shallow
state after tick has been stopped.

[1] https://lkml.org/lkml/2018/8/10/259

Leo Yan (5):
  cpuidle: menu: Clean up variables usage in menu_select()
  cpuidle: menu: Record tick delta value in struct menu_device
  cpuidle: menu: Provide menu_decide_stopping_tick()
  cpuidle: menu: Don't stay in shallow state for a long time
  cpuidle: menu: Change to compare prediction with tick delta

 drivers/cpuidle/governors/menu.c | 104 ---
 1 file changed, 76 insertions(+), 28 deletions(-)

-- 
2.7.4



Re: [RESEND PATCH v1 1/2] cpuidle: menu: Correct the criteria for stopping tick

2018-08-12 Thread leo . yan
On Sun, Aug 12, 2018 at 01:12:41PM +0200, Rafael J. Wysocki wrote:
> On Fri, Aug 10, 2018 at 11:03 AM  wrote:
> >
> > On Fri, Aug 10, 2018 at 04:49:06PM +0800, Leo Yan wrote:
> > > On Fri, Aug 10, 2018 at 09:22:10AM +0200, Rafael J. Wysocki wrote:
> > > > On Fri, Aug 10, 2018 at 9:13 AM,   wrote:
> > > > > On Thu, Aug 09, 2018 at 10:47:17PM +0200, Rafael J. Wysocki wrote:
> > > > >> On Thu, Aug 9, 2018 at 7:20 PM, Leo Yan  wrote:
> > > >
> > > > [cut]
> > > >
> > > > >> And that will cause the tick to be stopped unnecessarily in certain
> > > > >> situations, so why is this better?
> > > > >
> > > > > Let's see below two cases, the first one case we configure
> > > > > TICK_USEC=1000 (1ms) and the second case we configure TICK_USEC=4000
> > > > > (4ms).
> > > > >
> > > > > Let's assume we do the testing one the same platform and have two 
> > > > > runs,
> > > > > in the Case 1 we configure HZ=1000 so TICK_USEC=1ms, expected_interval
> > > > > is 3ms and deepest idle state target residency is 2ms, finally the 
> > > > > idle
> > > > > governor will choose the deepest state and skip to calibrate to 
> > > > > shallow
> > > > > state caused by 'expected_interval' > TICK_USEC;
> > > > >
> > > > > > In the Case 2 we configure HZ=250 so TICK_USEC=4ms, expected_interval
> > > > > (3ms) and deepest idle state target residency (2ms) are same with the
> > > > > Case 1; but because expected_interval < TICK_USEC so the idle governor
> > > > > will do calibration to select a shallower state.  If we image on one
> > > > > platform, the deepest idle state's target residency is smaller value,
> > > > > then it has bigger gap with TICK_USEC, the deepest idle state is 
> > > > > harder
> > > > > to be selected due 'expected_interval' can be easily hit the range
> > > > > [Deepest target residency..TICK_USEC).
> > > > >
> > > > > This patch has no any change for Case 1 and it wants to optimize for
> > > > > Case 2 so Case 2 has chance to stay in deepest idle state.  I
> > > > > understand from the performance pespective, we need to avoid to stop
> > > > > tick for shallow states; on the other hand we cannot prevent CPU run
> > > > > into deepest idle state just only we want to keep the tick running,
> > > > > especially the expected interval is longer than the deepest state
> > > > > target residency.
> > > > >
> > > > > Case 1:
> > > > >   Deepest idle state's target residency=2ms
> > > > >  |
> > > > >  V
> > > > > |> time (ms)
> > > > >   ^  ^
> > > > >   |  |
> > > > > TICK_USEC=1ms   expected_interval=3ms
> > > > >
> > > > >
> > > > > Case 2:
> > > > >   Deepest idle state's target residency = 2ms
> > > > >  |
> > > > >  V
> > > > > |> time (ms)
> > > > >  ^  ^
> > > > >  |  |
> > > > >   expected_interval = 3ms   TICK_USEC = 4ms
> > > > >
> > > > >
> > > > >
> > > > >> > unsigned int delta_next_us = 
> > > > >> > ktime_to_us(delta_next);
> > > > >> >
> > > > >> > *stop_tick = false;
> > > > >> > --
> > > >
> > > > Well, I don't quite agree with the approach here, then.
> > > >
> > > > As I said in the previous reply, IMO restarting the stopped tick
> > > > before leaving the loop in do_idle() is pointless overhead.  It is not
> > > > necessary to do that to avoid leaving CPUs in shallow idle states for
> > > > too long (I'll send an alternative patch to fix this issue shortly).
> > > >
> > > > While you may think that pointless overhead is not a problem, I don't
> > > > quite agree with that.
> > >
> > > I disagree this patch will introduce any extra overhead.
> > >
> > > Firstly, the idle loop doesn't support restarting tick even this patch
> > > tells idle loop to restart the tick;
> 
> I'm not talking about restarting the tick, but about stopping it more
> often on average.

Ah, yes, I agree.

> > > secondly this patch is mainly to
> > > resolve issue for the CPU cannot stay in deepest state in Case 2,
> 
> I understand what you are trying to achieve here, but I don't agree with it.

I agree we need find more general method for fixing.

> The condition modified by this patch is not about how much time the
> CPU can potentially be idle, but about when it is expected to wake up.
> The "expected" part is really key here.
> 
> The governor has gone through the effort of making an idle duration
> prediction and it now it has a certain expectation regarding when the
> CPU will wake up.  If the governor's prediction is any good at all and
> this expectation is in the tick range, the CPU will be woken up by
> something close enough to the tick in the majority of cases, so there
> is no need to stop 

Re: [RESEND PATCH v1 1/2] cpuidle: menu: Correct the criteria for stopping tick

2018-08-12 Thread leo . yan
On Sun, Aug 12, 2018 at 01:12:41PM +0200, Rafael J. Wysocki wrote:
> On Fri, Aug 10, 2018 at 11:03 AM  wrote:
> >
> > On Fri, Aug 10, 2018 at 04:49:06PM +0800, Leo Yan wrote:
> > > On Fri, Aug 10, 2018 at 09:22:10AM +0200, Rafael J. Wysocki wrote:
> > > > On Fri, Aug 10, 2018 at 9:13 AM,   wrote:
> > > > > On Thu, Aug 09, 2018 at 10:47:17PM +0200, Rafael J. Wysocki wrote:
> > > > >> On Thu, Aug 9, 2018 at 7:20 PM, Leo Yan  wrote:
> > > >
> > > > [cut]
> > > >
> > > > >> And that will cause the tick to be stopped unnecessarily in certain
> > > > >> situations, so why is this better?
> > > > >
> > > > > Let's see below two cases, the first one case we configure
> > > > > TICK_USEC=1000 (1ms) and the second case we configure TICK_USEC=4000
> > > > > (4ms).
> > > > >
> > > > > Let's assume we do the testing one the same platform and have two 
> > > > > runs,
> > > > > in the Case 1 we configure HZ=1000 so TICK_USEC=1ms, expected_interval
> > > > > is 3ms and deepest idle state target residency is 2ms, finally the 
> > > > > idle
> > > > > governor will choose the deepest state and skip to calibrate to 
> > > > > shallow
> > > > > state caused by 'expected_interval' > TICK_USEC;
> > > > >
> > > > > > In the Case 2 we configure HZ=250 so TICK_USEC=4ms, expected_interval
> > > > > (3ms) and deepest idle state target residency (2ms) are same with the
> > > > > Case 1; but because expected_interval < TICK_USEC so the idle governor
> > > > > will do calibration to select a shallower state.  If we image on one
> > > > > platform, the deepest idle state's target residency is smaller value,
> > > > > then it has bigger gap with TICK_USEC, the deepest idle state is 
> > > > > harder
> > > > > to be selected due 'expected_interval' can be easily hit the range
> > > > > [Deepest target residency..TICK_USEC).
> > > > >
> > > > > This patch has no any change for Case 1 and it wants to optimize for
> > > > > Case 2 so Case 2 has chance to stay in deepest idle state.  I
> > > > > understand from the performance pespective, we need to avoid to stop
> > > > > tick for shallow states; on the other hand we cannot prevent CPU run
> > > > > into deepest idle state just only we want to keep the tick running,
> > > > > especially the expected interval is longer than the deepest state
> > > > > target residency.
> > > > >
> > > > > Case 1:
> > > > >   Deepest idle state's target residency=2ms
> > > > >  |
> > > > >  V
> > > > > |> time (ms)
> > > > >   ^  ^
> > > > >   |  |
> > > > > TICK_USEC=1ms   expected_interval=3ms
> > > > >
> > > > >
> > > > > Case 2:
> > > > >   Deepest idle state's target residency = 2ms
> > > > >  |
> > > > >  V
> > > > > |> time (ms)
> > > > >  ^  ^
> > > > >  |  |
> > > > >   expected_interval = 3ms   TICK_USEC = 4ms
> > > > >
> > > > >
> > > > >
> > > > >> > unsigned int delta_next_us = 
> > > > >> > ktime_to_us(delta_next);
> > > > >> >
> > > > >> > *stop_tick = false;
> > > > >> > --
> > > >
> > > > Well, I don't quite agree with the approach here, then.
> > > >
> > > > As I said in the previous reply, IMO restarting the stopped tick
> > > > before leaving the loop in do_idle() is pointless overhead.  It is not
> > > > necessary to do that to avoid leaving CPUs in shallow idle states for
> > > > too long (I'll send an alternative patch to fix this issue shortly).
> > > >
> > > > While you may think that pointless overhead is not a problem, I don't
> > > > quite agree with that.
> > >
> > > I disagree this patch will introduce any extra overhead.
> > >
> > > Firstly, the idle loop doesn't support restarting tick even this patch
> > > tells idle loop to restart the tick;
> 
> I'm not talking about restarting the tick, but about stopping it more
> often on average.

Ah, yes, I agree.

> > > secondly this patch is mainly to
> > > resolve issue for the CPU cannot stay in deepest state in Case 2,
> 
> I understand what you are trying to achieve here, but I don't agree with it.

I agree we need find more general method for fixing.

> The condition modified by this patch is not about how much time the
> CPU can potentially be idle, but about when it is expected to wake up.
> The "expected" part is really key here.
> 
> The governor has gone through the effort of making an idle duration
> prediction and it now it has a certain expectation regarding when the
> CPU will wake up.  If the governor's prediction is any good at all and
> this expectation is in the tick range, the CPU will be woken up by
> something close enough to the tick in the majority of cases, so there
> is no need to stop 

hallo Schönheit

2018-08-12 Thread Wesley
Es ist mir eine Freude, Sie kennenzulernen. Mein Name ist Wesley, ich komme aus 
dem Vereinigten Staaten von Amerika. Ich bin ledig und nie verheiratet. Ich 
werde mich gerne mit Ihnen bekannt machen, ich entschuldige mich für Ihre 
Privatsphäre. Ich hoffe, Sie werden freundlich sein genug, um mir mehr über 
dich zu erzählen, wenn es dir nichts ausmacht.

Hoffe bald von dir zu hören.

Grüße,

Wesley.


hallo Schönheit

2018-08-12 Thread Wesley
Es ist mir eine Freude, Sie kennenzulernen. Mein Name ist Wesley, ich komme aus 
dem Vereinigten Staaten von Amerika. Ich bin ledig und nie verheiratet. Ich 
werde mich gerne mit Ihnen bekannt machen, ich entschuldige mich für Ihre 
Privatsphäre. Ich hoffe, Sie werden freundlich sein genug, um mir mehr über 
dich zu erzählen, wenn es dir nichts ausmacht.

Hoffe bald von dir zu hören.

Grüße,

Wesley.


Re: i2c:ocores: fixes and polling mechanism

2018-08-12 Thread Wolfram Sang


> sorry to disturb you all but after one month and a half I never received 
> any comment about this patch set and I fear it ended up in a forgotten 
> corner. I would like to know if someone is considering it or not.

Adding Peter to CC using his latest EMail address.

Peter, you said you wanted to update MAINTAINERS with the new address?



Re: i2c:ocores: fixes and polling mechanism

2018-08-12 Thread Wolfram Sang


> sorry to disturb you all but after one month and a half I never received 
> any comment about this patch set and I fear it ended up in a forgotten 
> corner. I would like to know if someone is considering it or not.

Adding Peter to CC using his latest EMail address.

Peter, you said you wanted to update MAINTAINERS with the new address?



Re: [PATCH v3] cpuidle: menu: Handle stopped tick more aggressively

2018-08-12 Thread leo . yan
On Fri, Aug 10, 2018 at 01:15:58PM +0200, Rafael J . Wysocki wrote:
> From: Rafael J. Wysocki 
> 
> Commit 87c9fe6ee495 (cpuidle: menu: Avoid selecting shallow states
> with stopped tick) missed the case when the target residencies of
> deep idle states of CPUs are above the tick boundary which may cause
> the CPU to get stuck in a shallow idle state for a long time.
> 
> Say there are two CPU idle states available: one shallow, with the
> target residency much below the tick boundary and one deep, with
> the target residency significantly above the tick boundary.  In
> that case, if the tick has been stopped already and the expected
> next timer event is relatively far in the future, the governor will
> assume the idle duration to be equal to TICK_USEC and it will select
> the idle state for the CPU accordingly.  However, that will cause the
> shallow state to be selected even though it would have been more
> energy-efficient to select the deep one.
> 
> To address this issue, modify the governor to always assume idle
> duration to be equal to the time till the closest timer event if
> the tick is not running which will cause the selected idle states
> to always match the known CPU wakeup time.
> 
> Also make it always indicate that the tick should be stopped in
> that case for consistency.
> 
> Fixes: 87c9fe6ee495 (cpuidle: menu: Avoid selecting shallow states with 
> stopped tick)
> Reported-by: Leo Yan 
> Signed-off-by: Rafael J. Wysocki 
> ---
> 
> -> v2: Initialize first_idx properly in the stopped tick case.
> 
> -> v3: Compute data->bucket before checking whether or not the tick has been
>stopped already to prevent it from becoming stale.
> 
> ---
>  drivers/cpuidle/governors/menu.c |   55 
> +--
>  1 file changed, 25 insertions(+), 30 deletions(-)
> 
> Index: linux-pm/drivers/cpuidle/governors/menu.c
> ===
> --- linux-pm.orig/drivers/cpuidle/governors/menu.c
> +++ linux-pm/drivers/cpuidle/governors/menu.c
> @@ -285,9 +285,8 @@ static int menu_select(struct cpuidle_dr
>  {
>   struct menu_device *data = this_cpu_ptr(_devices);
>   int latency_req = cpuidle_governor_latency_req(dev->cpu);
> - int i;
> - int first_idx;
> - int idx;
> + int first_idx = 0;
> + int idx, i;
>   unsigned int interactivity_req;
>   unsigned int expected_interval;
>   unsigned long nr_iowaiters, cpu_load;
> @@ -311,6 +310,18 @@ static int menu_select(struct cpuidle_dr
>   data->bucket = which_bucket(data->next_timer_us, nr_iowaiters);
>  
>   /*
> +  * If the tick is already stopped, the cost of possible short idle
> +  * duration misprediction is much higher, because the CPU may be stuck
> +  * in a shallow idle state for a long time as a result of it.  In that
> +  * case say we might mispredict and use the known time till the closest
> +  * timer event for the idle state selection.
> +  */
> + if (tick_nohz_tick_stopped()) {
> + data->predicted_us = ktime_to_us(delta_next);
> + goto select;
> + }

I tried this patch at my side, firstly just clarify this patch is okay
for me, but there have other underlying issues I observed the CPU
staying shallow idle state with tick stopped, so just note at here.

From my understanding, the rationale for this patch is we
only use the timer event as the reliable wake up source; if there have
one short timer event then we can select shallow state, otherwise we
also can select deepest idle state for long expired timer.

This means the idle governor needs to know the reliable info for the
timer event, so far I observe there at least have two issues for timer
event delta value cannot be trusted.

The first one issue is caused by timer cancel, I wrote one case for
CPU_0 starting a hrtimer with pinned mode with short expire time and
when the CPU_0 goes to sleep this short timeout timer can let idle
governor selects a shallow state; at the meantime another CPU_1 will
be used to try to cancel the timer, my purpose is to cheat CPU_0 so can
see the CPU_0 staying in shallow state for long time;  it has low
percentage to cancel the timer successfully, but I do see seldomly the
timer can be canceled successfully so CPU_0 will stay in idle for long
time (I cannot explain why the timer cannot be canceled successfully
for every time, this might be another issue?).  This case is tricky,
but it's possible happen in drivers with timer cancel.

Another issue is caused by spurious interrupts; if we review the
function tick_nohz_get_sleep_length(), it uses 'ts->idle_entrytime' to
calculate tick or timer delta, so every time when exit from interrupt
and before enter idle governor, it needs to update
'ts->idle_entrytime'; but for spurious interrupts, it will not call
irq_enter() and irq_exit() pairs, so it doesn't invoke below flows:

  irq_exit()
`->tick_irq_exit()
 

Re: [PATCH v3] cpuidle: menu: Handle stopped tick more aggressively

2018-08-12 Thread leo . yan
On Fri, Aug 10, 2018 at 01:15:58PM +0200, Rafael J . Wysocki wrote:
> From: Rafael J. Wysocki 
> 
> Commit 87c9fe6ee495 (cpuidle: menu: Avoid selecting shallow states
> with stopped tick) missed the case when the target residencies of
> deep idle states of CPUs are above the tick boundary which may cause
> the CPU to get stuck in a shallow idle state for a long time.
> 
> Say there are two CPU idle states available: one shallow, with the
> target residency much below the tick boundary and one deep, with
> the target residency significantly above the tick boundary.  In
> that case, if the tick has been stopped already and the expected
> next timer event is relatively far in the future, the governor will
> assume the idle duration to be equal to TICK_USEC and it will select
> the idle state for the CPU accordingly.  However, that will cause the
> shallow state to be selected even though it would have been more
> energy-efficient to select the deep one.
> 
> To address this issue, modify the governor to always assume idle
> duration to be equal to the time till the closest timer event if
> the tick is not running which will cause the selected idle states
> to always match the known CPU wakeup time.
> 
> Also make it always indicate that the tick should be stopped in
> that case for consistency.
> 
> Fixes: 87c9fe6ee495 (cpuidle: menu: Avoid selecting shallow states with 
> stopped tick)
> Reported-by: Leo Yan 
> Signed-off-by: Rafael J. Wysocki 
> ---
> 
> -> v2: Initialize first_idx properly in the stopped tick case.
> 
> -> v3: Compute data->bucket before checking whether or not the tick has been
>stopped already to prevent it from becoming stale.
> 
> ---
>  drivers/cpuidle/governors/menu.c |   55 
> +--
>  1 file changed, 25 insertions(+), 30 deletions(-)
> 
> Index: linux-pm/drivers/cpuidle/governors/menu.c
> ===
> --- linux-pm.orig/drivers/cpuidle/governors/menu.c
> +++ linux-pm/drivers/cpuidle/governors/menu.c
> @@ -285,9 +285,8 @@ static int menu_select(struct cpuidle_dr
>  {
>   struct menu_device *data = this_cpu_ptr(_devices);
>   int latency_req = cpuidle_governor_latency_req(dev->cpu);
> - int i;
> - int first_idx;
> - int idx;
> + int first_idx = 0;
> + int idx, i;
>   unsigned int interactivity_req;
>   unsigned int expected_interval;
>   unsigned long nr_iowaiters, cpu_load;
> @@ -311,6 +310,18 @@ static int menu_select(struct cpuidle_dr
>   data->bucket = which_bucket(data->next_timer_us, nr_iowaiters);
>  
>   /*
> +  * If the tick is already stopped, the cost of possible short idle
> +  * duration misprediction is much higher, because the CPU may be stuck
> +  * in a shallow idle state for a long time as a result of it.  In that
> +  * case say we might mispredict and use the known time till the closest
> +  * timer event for the idle state selection.
> +  */
> + if (tick_nohz_tick_stopped()) {
> + data->predicted_us = ktime_to_us(delta_next);
> + goto select;
> + }

I tried this patch at my side, firstly just clarify this patch is okay
for me, but there have other underlying issues I observed the CPU
staying shallow idle state with tick stopped, so just note at here.

From my understanding, the rationale for this patch is we
only use the timer event as the reliable wake up source; if there have
one short timer event then we can select shallow state, otherwise we
also can select deepest idle state for long expired timer.

This means the idle governor needs to know the reliable info for the
timer event, so far I observe there at least have two issues for timer
event delta value cannot be trusted.

The first one issue is caused by timer cancel, I wrote one case for
CPU_0 starting a hrtimer with pinned mode with short expire time and
when the CPU_0 goes to sleep this short timeout timer can let idle
governor selects a shallow state; at the meantime another CPU_1 will
be used to try to cancel the timer, my purpose is to cheat CPU_0 so can
see the CPU_0 staying in shallow state for long time;  it has low
percentage to cancel the timer successfully, but I do see seldomly the
timer can be canceled successfully so CPU_0 will stay in idle for long
time (I cannot explain why the timer cannot be canceled successfully
for every time, this might be another issue?).  This case is tricky,
but it's possible happen in drivers with timer cancel.

Another issue is caused by spurious interrupts.  If we review the
function tick_nohz_get_sleep_length(), it uses 'ts->idle_entrytime' to
calculate the tick or timer delta, so every time we exit from an
interrupt and before we enter the idle governor, 'ts->idle_entrytime'
needs to be updated; but a spurious interrupt does not call the
irq_enter() and irq_exit() pair, so it doesn't invoke the flow below:

  irq_exit()
`->tick_irq_exit()
 

[PATCH v2] pinctrl: qcom: Add sdm660 pinctrl driver

2018-08-12 Thread Craig Tatlor
Add initial pinctrl driver to support pin configuration with
pinctrl framework for sdm660.
Based off CAF implementation.

Signed-off-by: Craig Tatlor 
---

Changes from v1:
  Adds gpio-ranges property to bindings


 .../bindings/pinctrl/qcom,sdm660-pinctrl.txt  |  202 +++
 drivers/pinctrl/qcom/Kconfig  |   10 +
 drivers/pinctrl/qcom/Makefile |1 +
 drivers/pinctrl/qcom/pinctrl-sdm660.c | 1451 +
 4 files changed, 1663 insertions(+)
 create mode 100644 
Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt
 create mode 100644 drivers/pinctrl/qcom/pinctrl-sdm660.c

diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt 
b/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt
new file mode 100644
index ..801960ad2112
--- /dev/null
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt
@@ -0,0 +1,202 @@
+Qualcomm Technologies, Inc. SDM660 TLMM block
+
+This binding describes the Top Level Mode Multiplexer block found in the
+SDM660 platform.
+
+- compatible:
+   Usage: required
+   Value type: 
+   Definition: must be "qcom,sdm660-pinctrl"
+
+- reg:
+   Usage: required
+   Value type: 
+   Definition: the base address and size of the TLMM register space.
+
+- interrupts:
+   Usage: required
+   Value type: 
+   Definition: should specify the TLMM summary IRQ.
+
+- interrupt-controller:
+   Usage: required
+   Value type: 
+   Definition: identifies this node as an interrupt controller
+
+- #interrupt-cells:
+   Usage: required
+   Value type: <u32>
+   Definition: must be 2. Specifying the pin number and flags, as defined
+   in <dt-bindings/interrupt-controller/irq.h>
+
+- gpio-controller:
+   Usage: required
+   Value type: 
+   Definition: identifies this node as a gpio controller
+
+- gpio-ranges:
+   Usage: required
+   Value type: 
+   Definition: Specifies the mapping between gpio controller and
+   pin-controller pins.
+
+- #gpio-cells:
+   Usage: required
+   Value type: <u32>
+   Definition: must be 2. Specifying the pin number and flags, as defined
+   in <dt-bindings/gpio/gpio.h>
+
+Please refer to ../gpio/gpio.txt and ../interrupt-controller/interrupts.txt for
+a general description of GPIO and interrupt bindings.
+
+Please refer to pinctrl-bindings.txt in this directory for details of the
+common pinctrl bindings used by client devices, including the meaning of the
+phrase "pin configuration node".
+
+The pin configuration nodes act as a container for an arbitrary number of
+subnodes. Each of these subnodes represents some desired configuration for a
+pin, a group, or a list of pins or groups. This configuration can include the
+mux function to select on those pin(s)/group(s), and various pin configuration
+parameters, such as pull-up, drive strength, etc.
+
+
+PIN CONFIGURATION NODES:
+
+The name of each subnode is not important; all subnodes should be enumerated
+and processed purely based on their content.
+
+Each subnode only affects those parameters that are explicitly listed. In
+other words, a subnode that lists a mux function but no pin configuration
+parameters implies no information about any pin configuration parameters.
+Similarly, a pin subnode that describes a pullup parameter implies no
+information about e.g. the mux function.
+
+
+The following generic properties as defined in pinctrl-bindings.txt are valid
+to specify in a pin configuration subnode:
+
+- pins:
+   Usage: required
+   Value type: 
+   Definition: List of gpio pins affected by the properties specified in
+   this subnode.  Valid pins are:
+   gpio0-gpio113,
+   Supports mux, bias and drive-strength
+   sdc1_clk, sdc1_cmd, sdc1_data sdc2_clk, sdc2_cmd, sdc2_data 
sdc1_rclk,
+   Supports bias and drive-strength
+
+- function:
+   Usage: required
+   Value type: 
+   Definition: Specify the alternative function to be configured for the
+   specified pins. Functions are only valid for gpio pins.
+   Valid values are:
+
+   blsp_uart1, blsp_spi1, blsp_i2c1, blsp_uim1, atest_tsens,
+   bimc_dte1, dac_calib0, blsp_spi8, blsp_uart8, blsp_uim8,
+   qdss_cti_trig_out_b, bimc_dte0, dac_calib1, 
qdss_cti_trig_in_b,
+   dac_calib2, atest_tsens2, atest_usb1, blsp_spi10, 
blsp_uart10,
+   blsp_uim10, atest_bbrx1, atest_usb13, atest_bbrx0, 
atest_usb12,
+   mdp_vsync, edp_lcd, blsp_i2c10, atest_gpsadc1, atest_usb11,
+   atest_gpsadc0, edp_hot, atest_usb10, m_voc, dac_gpio, 
atest_char,
+   cam_mclk, pll_bypassnl, qdss_stm7, blsp_i2c8, 
qdss_tracedata_b,
+   pll_reset, qdss_stm6, qdss_stm5, qdss_stm4, atest_usb2, 
cci_i2c,
+   

[PATCH v2] pinctrl: qcom: Add sdm660 pinctrl driver

2018-08-12 Thread Craig Tatlor
Add initial pinctrl driver to support pin configuration with
pinctrl framework for sdm660.
Based off CAF implementation.

Signed-off-by: Craig Tatlor 
---

Changes from v1:
  Adds gpio-ranges property to bindings


 .../bindings/pinctrl/qcom,sdm660-pinctrl.txt  |  202 +++
 drivers/pinctrl/qcom/Kconfig  |   10 +
 drivers/pinctrl/qcom/Makefile |1 +
 drivers/pinctrl/qcom/pinctrl-sdm660.c | 1451 +
 4 files changed, 1663 insertions(+)
 create mode 100644 
Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt
 create mode 100644 drivers/pinctrl/qcom/pinctrl-sdm660.c

diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt 
b/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt
new file mode 100644
index ..801960ad2112
--- /dev/null
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt
@@ -0,0 +1,202 @@
+Qualcomm Technologies, Inc. SDM660 TLMM block
+
+This binding describes the Top Level Mode Multiplexer block found in the
+SDM660 platform.
+
+- compatible:
+   Usage: required
+   Value type: 
+   Definition: must be "qcom,sdm660-pinctrl"
+
+- reg:
+   Usage: required
+   Value type: 
+   Definition: the base address and size of the TLMM register space.
+
+- interrupts:
+   Usage: required
+   Value type: 
+   Definition: should specify the TLMM summary IRQ.
+
+- interrupt-controller:
+   Usage: required
+   Value type: 
+   Definition: identifies this node as an interrupt controller
+
+- #interrupt-cells:
+   Usage: required
+   Value type: <u32>
+   Definition: must be 2. Specifying the pin number and flags, as defined
+   in <dt-bindings/interrupt-controller/irq.h>
+
+- gpio-controller:
+   Usage: required
+   Value type: 
+   Definition: identifies this node as a gpio controller
+
+- gpio-ranges:
+   Usage: required
+   Value type: 
+   Definition: Specifies the mapping between gpio controller and
+   pin-controller pins.
+
+- #gpio-cells:
+   Usage: required
+   Value type: <u32>
+   Definition: must be 2. Specifying the pin number and flags, as defined
+   in <dt-bindings/gpio/gpio.h>
+
+Please refer to ../gpio/gpio.txt and ../interrupt-controller/interrupts.txt for
+a general description of GPIO and interrupt bindings.
+
+Please refer to pinctrl-bindings.txt in this directory for details of the
+common pinctrl bindings used by client devices, including the meaning of the
+phrase "pin configuration node".
+
+The pin configuration nodes act as a container for an arbitrary number of
+subnodes. Each of these subnodes represents some desired configuration for a
+pin, a group, or a list of pins or groups. This configuration can include the
+mux function to select on those pin(s)/group(s), and various pin configuration
+parameters, such as pull-up, drive strength, etc.
+
+
+PIN CONFIGURATION NODES:
+
+The name of each subnode is not important; all subnodes should be enumerated
+and processed purely based on their content.
+
+Each subnode only affects those parameters that are explicitly listed. In
+other words, a subnode that lists a mux function but no pin configuration
+parameters implies no information about any pin configuration parameters.
+Similarly, a pin subnode that describes a pullup parameter implies no
+information about e.g. the mux function.
+
+
+The following generic properties as defined in pinctrl-bindings.txt are valid
+to specify in a pin configuration subnode:
+
+- pins:
+   Usage: required
+   Value type: 
+   Definition: List of gpio pins affected by the properties specified in
+   this subnode.  Valid pins are:
+   gpio0-gpio113,
+   Supports mux, bias and drive-strength
+   sdc1_clk, sdc1_cmd, sdc1_data sdc2_clk, sdc2_cmd, sdc2_data 
sdc1_rclk,
+   Supports bias and drive-strength
+
+- function:
+   Usage: required
+   Value type: 
+   Definition: Specify the alternative function to be configured for the
+   specified pins. Functions are only valid for gpio pins.
+   Valid values are:
+
+   blsp_uart1, blsp_spi1, blsp_i2c1, blsp_uim1, atest_tsens,
+   bimc_dte1, dac_calib0, blsp_spi8, blsp_uart8, blsp_uim8,
+   qdss_cti_trig_out_b, bimc_dte0, dac_calib1, 
qdss_cti_trig_in_b,
+   dac_calib2, atest_tsens2, atest_usb1, blsp_spi10, 
blsp_uart10,
+   blsp_uim10, atest_bbrx1, atest_usb13, atest_bbrx0, 
atest_usb12,
+   mdp_vsync, edp_lcd, blsp_i2c10, atest_gpsadc1, atest_usb11,
+   atest_gpsadc0, edp_hot, atest_usb10, m_voc, dac_gpio, 
atest_char,
+   cam_mclk, pll_bypassnl, qdss_stm7, blsp_i2c8, 
qdss_tracedata_b,
+   pll_reset, qdss_stm6, qdss_stm5, qdss_stm4, atest_usb2, 
cci_i2c,
+   

[PATCH V2] arm64: dts: sdm630 SoC and Sony Pioneer (Xperia XA2) support

2018-08-12 Thread Craig Tatlor
Initial device tree support for Qualcomm SDM630 SoC and
Sony Pioneer (Xperia XA2).

SDM630 is based off of the SDM660 SoC and all SDM660 specific drivers are
compatible with it. SDM660 is also based off of MSM8998 so it uses some
of its drivers as well.

The device tree is based on the CAF 4.4 kernel tree.

The device can be booted into the initrd with a shell over UART.

Signed-off-by: Craig Tatlor 
---

Changes from v1:
   Adds gpio-ranges node to allow gpio-hogs to function


 arch/arm64/boot/dts/qcom/Makefile|   1 +
 arch/arm64/boot/dts/qcom/sdm630-pins.dtsi|  17 +
 arch/arm64/boot/dts/qcom/sdm630-pioneer.dts  |  16 +
 arch/arm64/boot/dts/qcom/sdm630-pioneer.dtsi |  22 ++
 arch/arm64/boot/dts/qcom/sdm630.dtsi | 384 +++
 5 files changed, 439 insertions(+)
 create mode 100644 arch/arm64/boot/dts/qcom/sdm630-pins.dtsi
 create mode 100644 arch/arm64/boot/dts/qcom/sdm630-pioneer.dts
 create mode 100644 arch/arm64/boot/dts/qcom/sdm630-pioneer.dtsi
 create mode 100644 arch/arm64/boot/dts/qcom/sdm630.dtsi

diff --git a/arch/arm64/boot/dts/qcom/Makefile 
b/arch/arm64/boot/dts/qcom/Makefile
index 9319e74b8906..80f98bb19998 100644
--- a/arch/arm64/boot/dts/qcom/Makefile
+++ b/arch/arm64/boot/dts/qcom/Makefile
@@ -6,4 +6,5 @@ dtb-$(CONFIG_ARCH_QCOM) += msm8916-mtp.dtb
 dtb-$(CONFIG_ARCH_QCOM)+= msm8992-bullhead-rev-101.dtb
 dtb-$(CONFIG_ARCH_QCOM)+= msm8994-angler-rev-101.dtb
 dtb-$(CONFIG_ARCH_QCOM)+= msm8996-mtp.dtb
+dtb-$(CONFIG_ARCH_QCOM)+= sdm630-pioneer.dtb
 dtb-$(CONFIG_ARCH_QCOM)+= sdm845-mtp.dtb
diff --git a/arch/arm64/boot/dts/qcom/sdm630-pins.dtsi 
b/arch/arm64/boot/dts/qcom/sdm630-pins.dtsi
new file mode 100644
index ..78b79c1076f1
--- /dev/null
+++ b/arch/arm64/boot/dts/qcom/sdm630-pins.dtsi
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Craig Tatlor. */
+
+ {
+   blsp1_uart1_default: blsp1_uart1_default {
+   pinmux {
+   pins = "gpio0", "gpio1", "gpio2", "gpio3";
+   function = "gpio";
+   };
+
+   pinconf {
+   pins = "gpio0", "gpio1", "gpio2", "gpio3";
+   drive-strength = <2>;
+   bias-disable;
+   };
+   };
+};
diff --git a/arch/arm64/boot/dts/qcom/sdm630-pioneer.dts 
b/arch/arm64/boot/dts/qcom/sdm630-pioneer.dts
new file mode 100644
index ..67c7e3b57739
--- /dev/null
+++ b/arch/arm64/boot/dts/qcom/sdm630-pioneer.dts
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Craig Tatlor. */
+
+/dts-v1/;
+
+#include "sdm630-pioneer.dtsi"
+
+/ {
+   model = "Sony Xperia XA2";
+   compatible = "sony,pioneer", "qcom,sdm630";
+
+   /* required for bootloader to select correct board */
+   qcom,board-id = <8 0>;
+   qcom,pmic-id = <0x0001001b 0x0101011a 0x0 0x0>,
+   <0x0001001b 0x0201011a 0x0 0x0>;
+};
diff --git a/arch/arm64/boot/dts/qcom/sdm630-pioneer.dtsi 
b/arch/arm64/boot/dts/qcom/sdm630-pioneer.dtsi
new file mode 100644
index ..512792c23369
--- /dev/null
+++ b/arch/arm64/boot/dts/qcom/sdm630-pioneer.dtsi
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Craig Tatlor. */
+
+#include "sdm630.dtsi"
+
+/ {
+   aliases {
+   serial0 = _uart1;
+   };
+
+   chosen {
+   stdout-path = "serial0:115200n8";
+   };
+};
+
+ {
+   serial@c17 {
+   status = "okay";
+   pinctrl-names = "default";
+   pinctrl-0 = <_uart1_default>;
+   };
+};
diff --git a/arch/arm64/boot/dts/qcom/sdm630.dtsi 
b/arch/arm64/boot/dts/qcom/sdm630.dtsi
new file mode 100644
index ..8a544979b7c0
--- /dev/null
+++ b/arch/arm64/boot/dts/qcom/sdm630.dtsi
@@ -0,0 +1,384 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Craig Tatlor. */
+
+#include 
+#include 
+
+/ {
+   model = "Qualcomm Technologies, Inc. SDM630";
+
+   interrupt-parent = <>;
+
+   qcom,msm-id = <318 0x0>;
+
+   #address-cells = <2>;
+   #size-cells = <2>;
+
+   chosen { };
+
+   memory {
+   device_type = "memory";
+   /* We expect the bootloader to fill in the reg */
+   reg = <0 0 0 0>;
+   };
+
+
+   cpus {
+   #address-cells = <2>;
+   #size-cells = <0>;
+
+   CPU0: cpu@100 {
+   device_type = "cpu";
+   compatible = "arm,armv8";
+   reg = <0x0 0x100>;
+   enable-method = "psci";
+   efficiency = <1126>;
+   next-level-cache = <_1>;
+   L2_1: l2-cache {
+   compatible = "arm,arch-cache";
+   cache-level = <2>;
+   };
+   L1_I_100: 

[PATCH V2] arm64: dts: sdm630 SoC and Sony Pioneer (Xperia XA2) support

2018-08-12 Thread Craig Tatlor
Initial device tree support for Qualcomm SDM630 SoC and
Sony Pioneer (Xperia XA2).

SDM630 is based off of the SDM660 SoC and all SDM660 specific drivers are
compatible with it. SDM660 is also based off of MSM8998 so it uses some
of its drivers as well.

The device tree is based on the CAF 4.4 kernel tree.

The device can be booted into the initrd with a shell over UART.

Signed-off-by: Craig Tatlor 
---

Changes from v1:
   Adds gpio-ranges node to allow gpio-hogs to function


 arch/arm64/boot/dts/qcom/Makefile|   1 +
 arch/arm64/boot/dts/qcom/sdm630-pins.dtsi|  17 +
 arch/arm64/boot/dts/qcom/sdm630-pioneer.dts  |  16 +
 arch/arm64/boot/dts/qcom/sdm630-pioneer.dtsi |  22 ++
 arch/arm64/boot/dts/qcom/sdm630.dtsi | 384 +++
 5 files changed, 439 insertions(+)
 create mode 100644 arch/arm64/boot/dts/qcom/sdm630-pins.dtsi
 create mode 100644 arch/arm64/boot/dts/qcom/sdm630-pioneer.dts
 create mode 100644 arch/arm64/boot/dts/qcom/sdm630-pioneer.dtsi
 create mode 100644 arch/arm64/boot/dts/qcom/sdm630.dtsi

diff --git a/arch/arm64/boot/dts/qcom/Makefile 
b/arch/arm64/boot/dts/qcom/Makefile
index 9319e74b8906..80f98bb19998 100644
--- a/arch/arm64/boot/dts/qcom/Makefile
+++ b/arch/arm64/boot/dts/qcom/Makefile
@@ -6,4 +6,5 @@ dtb-$(CONFIG_ARCH_QCOM) += msm8916-mtp.dtb
 dtb-$(CONFIG_ARCH_QCOM)+= msm8992-bullhead-rev-101.dtb
 dtb-$(CONFIG_ARCH_QCOM)+= msm8994-angler-rev-101.dtb
 dtb-$(CONFIG_ARCH_QCOM)+= msm8996-mtp.dtb
+dtb-$(CONFIG_ARCH_QCOM)+= sdm630-pioneer.dtb
 dtb-$(CONFIG_ARCH_QCOM)+= sdm845-mtp.dtb
diff --git a/arch/arm64/boot/dts/qcom/sdm630-pins.dtsi 
b/arch/arm64/boot/dts/qcom/sdm630-pins.dtsi
new file mode 100644
index ..78b79c1076f1
--- /dev/null
+++ b/arch/arm64/boot/dts/qcom/sdm630-pins.dtsi
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Craig Tatlor. */
+
+ {
+   blsp1_uart1_default: blsp1_uart1_default {
+   pinmux {
+   pins = "gpio0", "gpio1", "gpio2", "gpio3";
+   function = "gpio";
+   };
+
+   pinconf {
+   pins = "gpio0", "gpio1", "gpio2", "gpio3";
+   drive-strength = <2>;
+   bias-disable;
+   };
+   };
+};
diff --git a/arch/arm64/boot/dts/qcom/sdm630-pioneer.dts 
b/arch/arm64/boot/dts/qcom/sdm630-pioneer.dts
new file mode 100644
index ..67c7e3b57739
--- /dev/null
+++ b/arch/arm64/boot/dts/qcom/sdm630-pioneer.dts
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Craig Tatlor. */
+
+/dts-v1/;
+
+#include "sdm630-pioneer.dtsi"
+
+/ {
+   model = "Sony Xperia XA2";
+   compatible = "sony,pioneer", "qcom,sdm630";
+
+   /* required for bootloader to select correct board */
+   qcom,board-id = <8 0>;
+   qcom,pmic-id = <0x0001001b 0x0101011a 0x0 0x0>,
+   <0x0001001b 0x0201011a 0x0 0x0>;
+};
diff --git a/arch/arm64/boot/dts/qcom/sdm630-pioneer.dtsi 
b/arch/arm64/boot/dts/qcom/sdm630-pioneer.dtsi
new file mode 100644
index ..512792c23369
--- /dev/null
+++ b/arch/arm64/boot/dts/qcom/sdm630-pioneer.dtsi
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Craig Tatlor. */
+
+#include "sdm630.dtsi"
+
+/ {
+   aliases {
+   serial0 = _uart1;
+   };
+
+   chosen {
+   stdout-path = "serial0:115200n8";
+   };
+};
+
+ {
+   serial@c17 {
+   status = "okay";
+   pinctrl-names = "default";
+   pinctrl-0 = <_uart1_default>;
+   };
+};
diff --git a/arch/arm64/boot/dts/qcom/sdm630.dtsi 
b/arch/arm64/boot/dts/qcom/sdm630.dtsi
new file mode 100644
index ..8a544979b7c0
--- /dev/null
+++ b/arch/arm64/boot/dts/qcom/sdm630.dtsi
@@ -0,0 +1,384 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Craig Tatlor. */
+
+#include 
+#include 
+
+/ {
+   model = "Qualcomm Technologies, Inc. SDM630";
+
+   interrupt-parent = <>;
+
+   qcom,msm-id = <318 0x0>;
+
+   #address-cells = <2>;
+   #size-cells = <2>;
+
+   chosen { };
+
+   memory {
+   device_type = "memory";
+   /* We expect the bootloader to fill in the reg */
+   reg = <0 0 0 0>;
+   };
+
+
+   cpus {
+   #address-cells = <2>;
+   #size-cells = <0>;
+
+   CPU0: cpu@100 {
+   device_type = "cpu";
+   compatible = "arm,armv8";
+   reg = <0x0 0x100>;
+   enable-method = "psci";
+   efficiency = <1126>;
+   next-level-cache = <_1>;
+   L2_1: l2-cache {
+   compatible = "arm,arch-cache";
+   cache-level = <2>;
+   };
+   L1_I_100: 

Re: [PATCH v2] cpuidle: menu: Handle stopped tick more aggressively

2018-08-12 Thread leo . yan
On Sun, Aug 12, 2018 at 12:07:45PM +0200, Rafael J. Wysocki wrote:

[...]

> > > > > --- linux-pm.orig/drivers/cpuidle/governors/menu.c
> > > > > +++ linux-pm/drivers/cpuidle/governors/menu.c
> > > > > @@ -285,9 +285,8 @@ static int menu_select(struct cpuidle_dr
> > > > >  {
> > > > >   struct menu_device *data = this_cpu_ptr(_devices);
> > > > >   int latency_req = cpuidle_governor_latency_req(dev->cpu);
> > > > > - int i;
> > > > > - int first_idx;
> > > > > - int idx;
> > > > > + int first_idx = 0;
> > > > > + int idx, i;
> > > > >   unsigned int interactivity_req;
> > > > >   unsigned int expected_interval;
> > > > >   unsigned long nr_iowaiters, cpu_load;
> > > > > @@ -307,6 +306,18 @@ static int menu_select(struct cpuidle_dr
> > > > >   /* determine the expected residency time, round up */
> > > > >   data->next_timer_us = 
> > > > > ktime_to_us(tick_nohz_get_sleep_length(_next));
> > > > >
> > > > > + /*
> > > > > +  * If the tick is already stopped, the cost of possible short 
> > > > > idle
> > > > > +  * duration misprediction is much higher, because the CPU may 
> > > > > be stuck
> > > > > +  * in a shallow idle state for a long time as a result of it.  
> > > > > In that
> > > > > +  * case say we might mispredict and use the known time till the 
> > > > > closest
> > > > > +  * timer event for the idle state selection.
> > > > > +  */
> > > > > + if (tick_nohz_tick_stopped()) {
> > > > > + data->predicted_us = ktime_to_us(delta_next);
> > > > > + goto select;
> > > > > + }
> > > > > +
> > > >
> > > > This introduce two potential issues:
> > > >
> > > > - This will totally ignore the typical pattern in idle loop; I
> > > >   observed on the mmc driver can trigger multiple times (> 10 times)
> > > >   with consistent interval;
> > >
> > > I'm not sure what you mean by "ignore".
> >
> > You can see that after moving the code from below to this position, the
> > typical pattern interval will not be accounted for; so if in the middle
> > of idles there is a bunch of interrupts with a fixed pattern, the upper
> > code cannot detect this pattern anymore.
> 
> I'm not really following you here.
> 
> The part of the code skipped for tick_nohz_tick_stopped() doesn't
> update the data at all AFAICS.  It only computes some values that
> would be discarded later anyway, so I'm not sure what the point of
> running that computation is.

Sorry, I didn't explain this clearly, so let me try to rephrase:

With your patch, in the tick-stopped case it directly uses the tick delta
value as the prediction and jumps to the 'select' label.  So it skips the
code pieces below, which give a minor improvement for typical patterns
that can occur in the middle of idles; for example, the mmc driver
triggers 16 interrupts at ~1500us intervals, and these interrupts are all
handled within the idle loop, so the typical-pattern code can detect the
mmc interrupt pattern and help the idle governor select a shallower
idle state, and so avoid breaking the residency.

You mentioned that these computed values would be discarded later; this is
true for most cases, but it isn't always true.  Without your
patch, the governor discards the computed values only when
'data->predicted_us < TICK_USEC'; otherwise the interval pattern is
still applied in the prediction.

expected_interval = get_typical_interval(data);
expected_interval = min(expected_interval, data->next_timer_us);

[...]

/*
 * Use the lowest expected idle interval to pick the idle state.
 */
data->predicted_us = min(data->predicted_us, expected_interval);

> The statistics are updated by menu_update() and that still runs and it
> will take the actual wakeup events into account, won't it?

Yes.

> > [...]
> >
> > > > > - if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
> > > > > - expected_interval < TICK_USEC) {
> > > > > + if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
> > > > > + expected_interval < TICK_USEC) && 
> > > > > !tick_nohz_tick_stopped()) {
> > > >
> > > > I am not sure this logic is right... Why not use below checking, so
> > > > for POLLING state we will never ask to stop the tick?
> > > >
> > > > if (drv->states[idx].flags & CPUIDLE_FLAG_POLLING ||
> > > > (expected_interval < TICK_USEC && 
> > > > !tick_nohz_tick_stopped())) {
> > > >
> > >
> > > The only effect of it would be setting stop_tick to false, but why
> > > would that matter?
> >
> > Please consider below situation, not sure if this case is existed or
> > not:
> >
> >   step1: first time: enter one idle state with stopping tick;
> >   step2: second time: select POLLING state and tick_nohz_tick_stopped()
> >   is true;
> >
> > So in step2, it cannot set stop_tick to false with below sentence.
> >
> > > > >   unsigned int delta_next_us = ktime_to_us(delta_next);
> > > > >
> > > > >   

Re: [PATCH v2] cpuidle: menu: Handle stopped tick more aggressively

2018-08-12 Thread leo . yan
On Sun, Aug 12, 2018 at 12:07:45PM +0200, Rafael J. Wysocki wrote:

[...]

> > > > > --- linux-pm.orig/drivers/cpuidle/governors/menu.c
> > > > > +++ linux-pm/drivers/cpuidle/governors/menu.c
> > > > > @@ -285,9 +285,8 @@ static int menu_select(struct cpuidle_dr
> > > > >  {
> > > > >   struct menu_device *data = this_cpu_ptr(_devices);
> > > > >   int latency_req = cpuidle_governor_latency_req(dev->cpu);
> > > > > - int i;
> > > > > - int first_idx;
> > > > > - int idx;
> > > > > + int first_idx = 0;
> > > > > + int idx, i;
> > > > >   unsigned int interactivity_req;
> > > > >   unsigned int expected_interval;
> > > > >   unsigned long nr_iowaiters, cpu_load;
> > > > > @@ -307,6 +306,18 @@ static int menu_select(struct cpuidle_dr
> > > > >   /* determine the expected residency time, round up */
> > > > >   data->next_timer_us = 
> > > > > ktime_to_us(tick_nohz_get_sleep_length(_next));
> > > > >
> > > > > + /*
> > > > > +  * If the tick is already stopped, the cost of possible short 
> > > > > idle
> > > > > +  * duration misprediction is much higher, because the CPU may 
> > > > > be stuck
> > > > > +  * in a shallow idle state for a long time as a result of it.  
> > > > > In that
> > > > > +  * case say we might mispredict and use the known time till the 
> > > > > closest
> > > > > +  * timer event for the idle state selection.
> > > > > +  */
> > > > > + if (tick_nohz_tick_stopped()) {
> > > > > + data->predicted_us = ktime_to_us(delta_next);
> > > > > + goto select;
> > > > > + }
> > > > > +
> > > >
> > > > This introduce two potential issues:
> > > >
> > > > - This will totally ignore the typical pattern in idle loop; I
> > > >   observed on the mmc driver can trigger multiple times (> 10 times)
> > > >   with consistent interval;
> > >
> > > I'm not sure what you mean by "ignore".
> >
> > You can see that after moving the code from below to this position, the
> > typical pattern interval will not be accounted for; so if in the middle
> > of idles there is a bunch of interrupts with a fixed pattern, the upper
> > code cannot detect this pattern anymore.
> 
> I'm not really following you here.
> 
> The part of the code skipped for tick_nohz_tick_stopped() doesn't
> update the data at all AFAICS.  It only computes some values that
> would be discarded later anyway, so I'm not sure what the point of
> running that computation is.

Sorry, I didn't explain this clearly, so let me try to rephrase:

With your patch, in the tick-stopped case it directly uses the tick delta
value as the prediction and jumps to the 'select' label.  So it skips the
code pieces below, which give a minor improvement for typical patterns
that can occur in the middle of idles; for example, the mmc driver
triggers 16 interrupts at ~1500us intervals, and these interrupts are all
handled within the idle loop, so the typical-pattern code can detect the
mmc interrupt pattern and help the idle governor select a shallower
idle state, and so avoid breaking the residency.

You mentioned that these computed values would be discarded later; this is
true for most cases, but it isn't always true.  Without your
patch, the governor discards the computed values only when
'data->predicted_us < TICK_USEC'; otherwise the interval pattern is
still applied in the prediction.

expected_interval = get_typical_interval(data);
expected_interval = min(expected_interval, data->next_timer_us);

[...]

/*
 * Use the lowest expected idle interval to pick the idle state.
 */
data->predicted_us = min(data->predicted_us, expected_interval);

> The statistics are updated by menu_update() and that still runs and it
> will take the actual wakeup events into account, won't it?

Yes.

> > [...]
> >
> > > > > - if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
> > > > > - expected_interval < TICK_USEC) {
> > > > > + if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
> > > > > + expected_interval < TICK_USEC) && 
> > > > > !tick_nohz_tick_stopped()) {
> > > >
> > > > I am not sure this logic is right... Why not use below checking, so
> > > > for POLLING state we will never ask to stop the tick?
> > > >
> > > > if (drv->states[idx].flags & CPUIDLE_FLAG_POLLING ||
> > > > (expected_interval < TICK_USEC && 
> > > > !tick_nohz_tick_stopped())) {
> > > >
> > >
> > > The only effect of it would be setting stop_tick to false, but why
> > > would that matter?
> >
> > Please consider below situation, not sure if this case is existed or
> > not:
> >
> >   step1: first time: enter one idle state with stopping tick;
> >   step2: second time: select POLLING state and tick_nohz_tick_stopped()
> >   is true;
> >
> > So in step2, it cannot set stop_tick to false with below sentence.
> >
> > > > >   unsigned int delta_next_us = ktime_to_us(delta_next);
> > > > >
> > > > >   

Re: [PATCH 1/4] ARM: dts: exynos: Add missing used PMIC regulators on Exynos5422 Odroid boards

2018-08-12 Thread Krzysztof Kozlowski
On 11 August 2018 at 08:39, Anand Moon  wrote:
> Hi Krzysztof,
>
> These patches should also be ported to u-boot to enable PMIC.
>
> I was just looking into S2MPS11B data sheet
> According to the 1.2 Key Features of  S2MPS11
>
> BUCK1, BUCK2, BUCK3, BUCK4, BUCK6 have (0.65 V to 1.6 V) min /max range.
> BUCK5 have (0.65 V to 2.0 V) min /max range.
> BUCK7 have (1.2 V to 1.5 V) min / max range.
> BUCK8 have (1.8 V to 2.1 V) min / max range.
> BUCK10 have (0.75 V to 1.4 V) min / max range.
>
> BUCKBOST9 (3.0 V to 3.75 V) min / max range.
>
> LDO1, LDO6, LDO11, LDO22, LDO23, LDO27, LDO35 (0.8 V to 2.375 V) min / max 
> range
>
> LDO2, LDO3, LDO5, LDO7, LDO9, LDO10, LDO12, LDO13, LDO14, LDO15,
> LDO16, LDO17, LDO18, LDO19, LDO20, LDO21, LDO24, LDO25, LDO26,
> LDO28, LDO29, LDO30, LDO31, LDO32, LDO33, LDO34, LDO36, LDO37, LDO38
> (0.8 V to 3.95 V) min / max range
>
> I am aware that we have configured the min/max range as per board schematics,
> but each regulator supports dynamic voltage scaling and has different
> turn on/off voltage and current rating.
>
> So why are we not setting the regulator-min-microvolt /
> regulator-max-microvolt as per datasheets.
> Please correct me if I am wrong.

Which regulators are not configured as in datasheet?

Best regards,
Krzysztof


Re: [PATCH 1/4] ARM: dts: exynos: Add missing used PMIC regulators on Exynos5422 Odroid boards

2018-08-12 Thread Krzysztof Kozlowski
On 11 August 2018 at 08:39, Anand Moon  wrote:
> Hi Krzysztof,
>
> These patches should also be ported to u-boot to enable PMIC.
>
> I was just looking into S2MPS11B data sheet
> According to the 1.2 Key Features of  S2MPS11
>
> BUCK1, BUCK2, BUCK3, BUCK4, BUCK6 have (0.65 V to 1.6 V) min /max range.
> BUCK5 have (0.65 V to 2.0 V) min /max range.
> BUCK7 have (1.2 V to 1.5 V) min / max range.
> BUCK8 have (1.8 V to 2.1 V) min / max range.
> BUCK10 have (0.75 V to 1.4 V) min / max range.
>
> BUCKBOST9 (3.0 V to 3.75 V) min / max range.
>
> LDO1, LDO6, LDO11, LDO22, LDO23, LDO27, LDO35 (0.8 V to 2.375 V) min / max 
> range
>
> LDO2, LDO3, LDO5, LDO7, LDO9, LDO10, LDO12, LDO13, LDO14, LDO15,
> LDO16, LDO17, LDO18, LDO19, LDO20, LDO21, LDO24, LDO25, LDO26,
> LDO28, LDO29, LDO30, LDO31, LDO32, LDO33, LDO34, LDO36, LDO37, LDO38
> (0.8 V to 3.95 V) min / max range
>
> I am aware that we have configured the min/max range as per board schematics,
> but each regulator supports dynamic voltage scaling and has different
> turn on/off voltage and current rating.
>
> So why are we not setting the regulator-min-microvolt /
> regulator-max-microvolt as per datasheets.
> Please correct me if I am wrong.

Which regulators are not configured as in datasheet?

Best regards,
Krzysztof


[PATCH] docs: provide more details about security bug reporting

2018-08-12 Thread Willy Tarreau
Hi Linus,

please consider applying the attached patch to improve the doc on
the security reporting process.

Thanks,
Willy
>From a587418b587915bcaa5657909f52dc3995f29dcd Mon Sep 17 00:00:00 2001
From: Willy Tarreau 
Date: Fri, 10 Aug 2018 16:36:04 +0200
Subject: [PATCH] docs: provide more details about security bug reporting

The analysis, disclosure and crediting parts were completed a bit to
add clarification about what types of reports are expected, what the
reporter may expect in terms of disclosure, and how reporters are
credited for their discovery.

Signed-off-by: Willy Tarreau 
Acked-by: Greg Kroah-Hartman 
---
 Documentation/admin-guide/security-bugs.rst | 81 +
 1 file changed, 81 insertions(+)

diff --git a/Documentation/admin-guide/security-bugs.rst 
b/Documentation/admin-guide/security-bugs.rst
index 30491d9..91ecd48 100644
--- a/Documentation/admin-guide/security-bugs.rst
+++ b/Documentation/admin-guide/security-bugs.rst
@@ -26,6 +26,51 @@ information is helpful.  Any exploit code is very helpful 
and will not
 be released without consent from the reporter unless it has already been
 made public.
 
+Analysis
+--------
+
+Sometimes a bug will be very well understood by some of the security
+officers who will propose a patch for you to test.  Please be prepared to
+receive extra questions and to provide answers on a timely basis.
+There is little chance a bug will get fixed if you send an incomplete
+report and disappear for two weeks.  It is also possible that some of
+the officers will conclude that the behaviour you observed is normal
+and expected, that it is bogus but doesn't present an imminent
+security risk and should rather be discussed on public lists, or that
+it does indeed represent a risk, but that the risk of breakage induced
+by fixing it outweighs the risks of the bug being exploited.  In such
+situations, it is possible that you will be requested to post your
+report to another more suitable place.
+
+Analysing a report takes a lot of time, and while sometimes it's
+better to conclude that it was a false alert because there is nothing to fix,
+it is also annoying if it turns out that the reporter could have
+found this out by themselves, because the time lost on this analysis was not
+spent on another one.  Anyone can be wrong about a report from time to time,
+but please be careful not to do this too often or your
+reports may not be taken seriously in the end.
+
+As a rule of thumb, it is recommended not to post messages suggesting
+that a bug may exist somewhere.  Since the security team manages
+imminent and important risks, bugs reported there must be based on
+facts and not on beliefs.  It is fine to report a panic message saying
+"I just got this, I don't know how it happened but it scares me", it is
+not fine to say "I ran my new automated analysis tool which thinks a
+check is missing here, could someone knowledgeable in this area please
+double-check".  The security team's role is not to have opinions on
+your beliefs but to spot the right people to help fix a real problem.
+
+Very often, some maintainers will be brought to the discussion as the
+analysis progresses. Most of the time these people will not have received
+the initial e-mail, and they're discovering the issue late.  So please do
+not get upset if they ask questions that were already addressed or which
+were present in the initial report.
+
+Also, don't consider the bug fixed until the fix is merged.  It can
+happen that a fix proposed by one of the security officers doesn't suit
+a subsystem maintainer and that it has to be reworked differently,
+possibly after a public discussion.
+
 Disclosure
 ----------
 
@@ -44,6 +89,25 @@ timeframe varies from immediate (esp. if it's already 
publicly known bug)
 to a few weeks.  As a basic default policy, we expect report date to
 release date to be on the order of 7 days.
 
+There is no point threatening to make a report public after XX days
+without a response because usually what you will end up with is a fix
+that is merged much earlier than what you possibly expected, for example
+if you promised to someone not to publish it before a certain date.
+Please just understand that the security team's goal is for your bug to
+be fixed as fast as possible and not to sleep on it.
+
+If you report a particularly complex issue that you intend to discuss
+at a conference a few weeks or months later, you cannot really expect
+the security team to find a solution in time and at the same time
+to refrain from disclosing the issue to a broader audience or
+releasing the fix.  So at the very least you will have to make
+arrangements to deal with a disclosure which happens much earlier than
+your public talk about the issue.  Also if you only sent an early
+notification about a forthcoming problem that is not yet fully
+disclosed, you must not expect the security officers to ping you again
+later about the issue; you are responsible for reloading 

[PATCH] docs: provide more details about security bug reporting

2018-08-12 Thread Willy Tarreau
Hi Linus,

please consider applying the attached patch to improve the doc on
the security reporting process.

Thanks,
Willy
>From a587418b587915bcaa5657909f52dc3995f29dcd Mon Sep 17 00:00:00 2001
From: Willy Tarreau 
Date: Fri, 10 Aug 2018 16:36:04 +0200
Subject: [PATCH] docs: provide more details about security bug reporting

The analysis, disclosure and crediting parts were completed a bit to
add clarification about what types of reports are expected, what the
reporter may expect in terms of disclosure, and how reporters are
credited for their discovery.

Signed-off-by: Willy Tarreau 
Acked-by: Greg Kroah-Hartman 
---
 Documentation/admin-guide/security-bugs.rst | 81 +
 1 file changed, 81 insertions(+)

diff --git a/Documentation/admin-guide/security-bugs.rst 
b/Documentation/admin-guide/security-bugs.rst
index 30491d9..91ecd48 100644
--- a/Documentation/admin-guide/security-bugs.rst
+++ b/Documentation/admin-guide/security-bugs.rst
@@ -26,6 +26,51 @@ information is helpful.  Any exploit code is very helpful 
and will not
 be released without consent from the reporter unless it has already been
 made public.
 
+Analysis
+--------
+
+Sometimes a bug will be very well understood by some of the security
+officers who will propose a patch for you to test.  Please be prepared to
+receive extra questions and to provide answers on a timely basis.
+There is little chance a bug will get fixed if you send an incomplete
+report and disappear for two weeks.  It is also possible that some of
+the officers will conclude that the behaviour you observed is normal
+and expected, that it is bogus but doesn't present an imminent
+security risk and should rather be discussed on public lists, or that
+it does indeed represent a risk, but that the risk of breakage induced
+by fixing it outweighs the risks of the bug being exploited.  In such
+situations, it is possible that you will be requested to post your
+report to another more suitable place.
+
+Analysing a report takes a lot of time, and while sometimes it's
+better to conclude that it was a false alert because there is nothing to fix,
+it is also annoying if it turns out that the reporter could have
+found this out by themselves, because the time lost on this analysis was not
+spent on another one.  Anyone can be wrong about a report from time to time,
+but please be careful not to do this too often or your
+reports may not be taken seriously in the end.
+
+As a rule of thumb, it is recommended not to post messages suggesting
+that a bug may exist somewhere.  Since the security team manages
+imminent and important risks, bugs reported there must be based on
+facts and not on beliefs.  It is fine to report a panic message saying
+"I just got this, I don't know how it happened but it scares me", it is
+not fine to say "I ran my new automated analysis tool which thinks a
+check is missing here, could someone knowledgeable in this area please
+double-check".  The security team's role is not to have opinions on
+your beliefs but to spot the right people to help fix a real problem.
+
+Very often, some maintainers will be brought to the discussion as the
+analysis progresses. Most of the time these people will not have received
+the initial e-mail, and they're discovering the issue late.  So please do
+not get upset if they ask questions that were already addressed or which
+were present in the initial report.
+
+Also, don't consider the bug fixed until the fix is merged.  It can
+happen that a fix proposed by one of the security officers doesn't suit
+a subsystem maintainer and that it has to be reworked differently,
+possibly after a public discussion.
+
 Disclosure
 ----------
 
@@ -44,6 +89,25 @@ timeframe varies from immediate (esp. if it's already 
publicly known bug)
 to a few weeks.  As a basic default policy, we expect report date to
 release date to be on the order of 7 days.
 
+There is no point threatening to make a report public after XX days
+without a response because usually what you will end up with is a fix
+that is merged much earlier than what you possibly expected, for example
+if you promised to someone not to publish it before a certain date.
+Please just understand that the security team's goal is for your bug to
+be fixed as fast as possible and not to sleep on it.
+
+If you report a particularly complex issue that you intend to discuss
+at a conference a few weeks or months later, you cannot really expect
+the security team to find a solution in time and at the same time
+to refrain from disclosing the issue to a broader audience or
+releasing the fix.  So at the very least you will have to make
+arrangements to deal with a disclosure which happens much earlier than
+your public talk about the issue.  Also if you only sent an early
+notification about a forthcoming problem that is not yet fully
+disclosed, you must not expect the security officers to ping you again
+later about the issue; you are responsible for reloading 

Re: [PATCH] pinctrl: qcom: Add sdm660 pinctrl driver

2018-08-12 Thread Craig Tatlor



On 12 August 2018 13:42:27 BST, Christian Lamparter  wrote:
>On Sunday, August 12, 2018 9:18:19 AM CEST you wrote:
>> On 11 August 2018 18:27:43 BST, Christian Lamparter
> wrote:
>> >On Saturday, August 11, 2018 6:25:19 PM CEST Craig Tatlor wrote:
>> >> Add initial pinctrl driver to support pin configuration with
>> >> pinctrl framework for sdm660.
>> >> Based off CAF implementation.
>> >> 
>> >> Signed-off-by: Craig Tatlor 
>> >> ---
>> >> 
>> >> diff --git
>> >a/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt
>> >b/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt
>> >> new file mode 100644
>> >> index ..85e6c6c17c04
>> >> --- /dev/null
>> >> +++
>> >b/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt
>> >> @@ -0,0 +1,195 @@
>> >> +Qualcomm Technologies, Inc. SDM660 TLMM block
>> >> +
>> >> +This binding describes the Top Level Mode Multiplexer block found
>in
>> >the
>> >> +SDM660 platform.
>> >> +
>> >> +- compatible:
>> >> + Usage: required
>> >> + Value type: 
>> >> + Definition: must be "qcom,sdm660-pinctrl"
>> >> +
>> >> +- reg:
>> >> + Usage: required
>> >> + Value type: 
>> >> + Definition: the base address and size of the TLMM register
>space.
>> >> +
>> >> +- interrupts:
>> >> + Usage: required
>> >> + Value type: 
>> >> + Definition: should specify the TLMM summary IRQ.
>> >> +
>> >> +- interrupt-controller:
>> >> + Usage: required
>> >> + Value type: 
>> >> + Definition: identifies this node as an interrupt controller
>> >> +
>> >> +- #interrupt-cells:
>> >> + Usage: required
>> >> + Value type: 
>> >> + Definition: must be 2. Specifying the pin number and flags, as
>> >defined
>> >> + in 
>> >> +
>> >> +- gpio-controller:
>> >> + Usage: required
>> >> + Value type: 
>> >> + Definition: identifies this node as a gpio controller
>> >> +
>> >> +- #gpio-cells:
>> >> + Usage: required
>> >> + Value type: 
>> >> + Definition: must be 2. Specifying the pin number and flags, as
>> >defined
>> >> + in 
>> >> +
>> >> +Please refer to ../gpio/gpio.txt and
>> >../interrupt-controller/interrupts.txt for
>> >> +a general description of GPIO and interrupt bindings.
>> >You want to specify gpio-ranges here as well. The property is
>explained
>> >in Section "2.1) gpio- and pin-controller interaction" in
>> >../gpio/gpio.txt
>> >
>> >Without it, the gpio-hogs construct (part of ../gpio/gpio.txt) will
>> >cause
>> >the driver to fail during boot. (try it, ;-) )
>> Would gpio-ranges make sense for this, as the gpio and pinctrl are in
>same block?
>Yes, it's part of the ../gpio/gpio.txt which you link.
>Here's a copy of the relevant section that explains this
>gpio- and pin-controller interaction.
>
>
>|2.1) gpio- and pin-controller interaction
>|-
>|
>|Some or all of the GPIOs provided by a GPIO controller may be routed
>to pins
>|on the package via a pin controller. This allows muxing those pins
>between
>|GPIO and other functions.
>|It is useful to represent which GPIOs correspond to which pins on
>which pin
>|controllers. The gpio-ranges property described below represents this,
>and
>|contains information structures as follows:
>|
>|   gpio-range-list ::=  [gpio-range-list]
>|   single-gpio-range ::=  | 
>|   numeric-gpio-range ::=
>| 
>
>|   named-gpio-range ::=   '<0 0>'
>|   pinctrl-phandle : phandle to pin controller node
>|   gpio-base : Base GPIO ID in the GPIO controller
>|   pinctrl-base : Base pinctrl pin ID in the pin controller
>|   count : The number of GPIOs/pins in this range
>|
>|The "pin controller node" mentioned above must conform to the bindings
>|described in ../pinctrl/pinctrl-bindings.txt.
>|...
>
>As for the reason why gpio-ranges is what it is, please look at the ML
>discussion from the "pinctrl: msm: fix gpio-hog related boot issues"
>thread
>on  and the posts by 
>Linus Walleij: 
>and
>Stephen Boyd: .
>(It's quite a bit to take in)
Thanks for the links, makes sense now, I'll add in v2.
>
>> Seems no other qcom pinctrl drivers have it and I'm able to boot
>without it.
>Ok, let's run an experiment. Please remove the gpio-ranges property and
>try
>adding a test gpio-hog to your device's DTS:
>
>something like (I randomly selected GPIO5, but it shouldn't
>matter which gpio you select here. If you know a unused/NC
>pin/gpio, then you can use it instead):
>
> {
>   test-hog {
>   gpio-hog;
>   gpios = <5 0>;
>   output-low;
>   line-name = "test hog";
>   };
>};
>
>compile it and then watch the kernel on the next boot:
>
>without the gpio-ranges present, it will spew out something along the
>lines of:
>
>| requesting hog GPIO test hog (chip 300.pinctrl, offset 5) failed,
>-517
>| 

Re: [PATCH] pinctrl: qcom: Add sdm660 pinctrl driver

2018-08-12 Thread Craig Tatlor



On 12 August 2018 13:42:27 BST, Christian Lamparter  wrote:
>On Sunday, August 12, 2018 9:18:19 AM CEST you wrote:
>> On 11 August 2018 18:27:43 BST, Christian Lamparter
> wrote:
>> >On Saturday, August 11, 2018 6:25:19 PM CEST Craig Tatlor wrote:
>> >> Add initial pinctrl driver to support pin configuration with
>> >> pinctrl framework for sdm660.
>> >> Based off CAF implementation.
>> >> 
>> >> Signed-off-by: Craig Tatlor 
>> >> ---
>> >> 
>> >> diff --git
>> >a/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt
>> >b/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt
>> >> new file mode 100644
>> >> index ..85e6c6c17c04
>> >> --- /dev/null
>> >> +++
>> >b/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt
>> >> @@ -0,0 +1,195 @@
>> >> +Qualcomm Technologies, Inc. SDM660 TLMM block
>> >> +
>> >> +This binding describes the Top Level Mode Multiplexer block found
>in
>> >the
>> >> +SDM660 platform.
>> >> +
>> >> +- compatible:
>> >> + Usage: required
>> >> + Value type: 
>> >> + Definition: must be "qcom,sdm660-pinctrl"
>> >> +
>> >> +- reg:
>> >> + Usage: required
>> >> + Value type: 
>> >> + Definition: the base address and size of the TLMM register
>space.
>> >> +
>> >> +- interrupts:
>> >> + Usage: required
>> >> + Value type: 
>> >> + Definition: should specify the TLMM summary IRQ.
>> >> +
>> >> +- interrupt-controller:
>> >> + Usage: required
>> >> + Value type: 
>> >> + Definition: identifies this node as an interrupt controller
>> >> +
>> >> +- #interrupt-cells:
>> >> + Usage: required
>> >> + Value type: 
>> >> + Definition: must be 2. Specifying the pin number and flags, as
>> >defined
>> >> + in 
>> >> +
>> >> +- gpio-controller:
>> >> + Usage: required
>> >> + Value type: 
>> >> + Definition: identifies this node as a gpio controller
>> >> +
>> >> +- #gpio-cells:
>> >> + Usage: required
>> >> + Value type: 
>> >> + Definition: must be 2. Specifying the pin number and flags, as
>> >defined
>> >> + in 
>> >> +
>> >> +Please refer to ../gpio/gpio.txt and
>> >../interrupt-controller/interrupts.txt for
>> >> +a general description of GPIO and interrupt bindings.
>> >You want to specify gpio-ranges here as well. The property is
>explained
>> >in Section "2.1) gpio- and pin-controller interaction" in
>> >../gpio/gpio.txt
>> >
>> >Without it, the gpio-hogs construct (part of ../gpio/gpio.txt) will
>> >cause
>> >the driver to fail during boot. (try it, ;-) )
>> Would gpio-ranges make sense for this, as the gpio and pinctrl are in
>same block?
>Yes, it's part of the ../gpio/gpio.txt which you link.
>Here's a copy of the relevant section that explains this
>gpio- and pin-controller interaction.
>
>
>|2.1) gpio- and pin-controller interaction
>|-
>|
>|Some or all of the GPIOs provided by a GPIO controller may be routed
>to pins
>|on the package via a pin controller. This allows muxing those pins
>between
>|GPIO and other functions.
>|It is useful to represent which GPIOs correspond to which pins on
>which pin
>|controllers. The gpio-ranges property described below represents this,
>and
>|contains information structures as follows:
>|
>|   gpio-range-list ::=  [gpio-range-list]
>|   single-gpio-range ::=  | 
>|   numeric-gpio-range ::=
>| 
>
>|   named-gpio-range ::=   '<0 0>'
>|   pinctrl-phandle : phandle to pin controller node
>|   gpio-base : Base GPIO ID in the GPIO controller
>|   pinctrl-base : Base pinctrl pin ID in the pin controller
>|   count : The number of GPIOs/pins in this range
>|
>|The "pin controller node" mentioned above must conform to the bindings
>|described in ../pinctrl/pinctrl-bindings.txt.
>|...
>
>As for the reason why gpio-ranges is what it is, please look at the ML
>discussion from the "pinctrl: msm: fix gpio-hog related boot issues"
>thread
>on  and the posts by 
>Linus Walleij: 
>and
>Stephen Boyd: .
>(It's quite a bit to take in)
Thanks for the links, makes sense now, I'll add in v2.
>
>> Seems no other qcom pinctrl drivers have it and I'm able to boot
>without it.
>Ok, let's run an experiment. Please remove the gpio-ranges property and
>try
>adding a test gpio-hog to your device's DTS:
>
>something like (I randomly selected GPIO5, but it shouldn't
>matter which gpio you select here. If you know a unused/NC
>pin/gpio, then you can use it instead):
>
> {
>   test-hog {
>   gpio-hog;
>   gpios = <5 0>;
>   output-low;
>   line-name = "test hog";
>   };
>};
>
>compile it and then watch the kernel on the next boot:
>
>without the gpio-ranges present, it will spew out something along the
>lines of:
>
>| requesting hog GPIO test hog (chip 300.pinctrl, offset 5) failed,
>-517
>| 

Re: [PATCH] pinctrl: qcom: Add sdm660 pinctrl driver

2018-08-12 Thread Christian Lamparter
On Sunday, August 12, 2018 9:18:19 AM CEST you wrote:
> On 11 August 2018 18:27:43 BST, Christian Lamparter  
> wrote:
> >On Saturday, August 11, 2018 6:25:19 PM CEST Craig Tatlor wrote:
> >> Add initial pinctrl driver to support pin configuration with
> >> pinctrl framework for sdm660.
> >> Based off CAF implementation.
> >> 
> >> Signed-off-by: Craig Tatlor 
> >> ---
> >> 
> >> diff --git
> >a/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt
> >b/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt
> >> new file mode 100644
> >> index ..85e6c6c17c04
> >> --- /dev/null
> >> +++
> >b/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt
> >> @@ -0,0 +1,195 @@
> >> +Qualcomm Technologies, Inc. SDM660 TLMM block
> >> +
> >> +This binding describes the Top Level Mode Multiplexer block found in
> >the
> >> +SDM660 platform.
> >> +
> >> +- compatible:
> >> +  Usage: required
> >> +  Value type: 
> >> +  Definition: must be "qcom,sdm660-pinctrl"
> >> +
> >> +- reg:
> >> +  Usage: required
> >> +  Value type: 
> >> +  Definition: the base address and size of the TLMM register space.
> >> +
> >> +- interrupts:
> >> +  Usage: required
> >> +  Value type: 
> >> +  Definition: should specify the TLMM summary IRQ.
> >> +
> >> +- interrupt-controller:
> >> +  Usage: required
> >> +  Value type: 
> >> +  Definition: identifies this node as an interrupt controller
> >> +
> >> +- #interrupt-cells:
> >> +  Usage: required
> >> +  Value type: 
> >> +  Definition: must be 2. Specifying the pin number and flags, as
> >defined
> >> +  in 
> >> +
> >> +- gpio-controller:
> >> +  Usage: required
> >> +  Value type: 
> >> +  Definition: identifies this node as a gpio controller
> >> +
> >> +- #gpio-cells:
> >> +  Usage: required
> >> +  Value type: 
> >> +  Definition: must be 2. Specifying the pin number and flags, as
> >defined
> >> +  in 
> >> +
> >> +Please refer to ../gpio/gpio.txt and
> >../interrupt-controller/interrupts.txt for
> >> +a general description of GPIO and interrupt bindings.
> >You want to specify gpio-ranges here as well. The property is explained
> >in Section "2.1) gpio- and pin-controller interaction" in
> >../gpio/gpio.txt
> >
> >Without it, the gpio-hogs construct (part of ../gpio/gpio.txt) will
> >cause
> >the driver to fail during boot. (try it, ;-) )
> Would gpio-ranges make sense for this, as the gpio and pinctrl are in same 
> block?
Yes, it's part of the ../gpio/gpio.txt which you link.
Here's a copy of the relevant section that explains this
gpio- and pin-controller interaction.


|2.1) gpio- and pin-controller interaction
|-
|
|Some or all of the GPIOs provided by a GPIO controller may be routed to pins
|on the package via a pin controller. This allows muxing those pins between
|GPIO and other functions.
|It is useful to represent which GPIOs correspond to which pins on which pin
|controllers. The gpio-ranges property described below represents this, and
|contains information structures as follows:
|
|   gpio-range-list ::=  [gpio-range-list]
|   single-gpio-range ::=  | 
|   numeric-gpio-range ::=
|  
|   named-gpio-range ::=   '<0 0>'
|   pinctrl-phandle : phandle to pin controller node
|   gpio-base : Base GPIO ID in the GPIO controller
|   pinctrl-base : Base pinctrl pin ID in the pin controller
|   count : The number of GPIOs/pins in this range
|
|The "pin controller node" mentioned above must conform to the bindings
|described in ../pinctrl/pinctrl-bindings.txt.
|...

As for the reason why gpio-ranges is what it is, please look at the ML
discussion from the "pinctrl: msm: fix gpio-hog related boot issues" thread
on  and the posts by 
Linus Walleij:  and
Stephen Boyd: .
(It's quite a bit to take in)

> Seems no other qcom pinctrl drivers have it and I'm able to boot without it.
Ok, let's run an experiment. Please remove the gpio-ranges property and try
adding a test gpio-hog to your device's DTS:

something like (I randomly selected GPIO5, but it shouldn't
matter which gpio you select here. If you know an unused/NC
pin/gpio, then you can use it instead):

 {
test-hog {
gpio-hog;
gpios = <5 0>;
output-low;
line-name = "test hog";
};
};

compile it and then watch the kernel on the next boot:

without the gpio-ranges present, it will spew out something along the
lines of:

| requesting hog GPIO test hog (chip 300.pinctrl, offset 5) failed, -517
| gpiochip_add_data: GPIOs 0..114 (300.pinctrl) failed to register
| sdm660-pinctrl 300.pinctrl: Failed register gpiochip

The single gpio-hog causes havoc and takes down the sdm660-pinctrl with it.
And every driver that depends on the 

Re: [PATCH] pinctrl: qcom: Add sdm660 pinctrl driver

2018-08-12 Thread Christian Lamparter
On Sunday, August 12, 2018 9:18:19 AM CEST you wrote:
> On 11 August 2018 18:27:43 BST, Christian Lamparter  
> wrote:
> >On Saturday, August 11, 2018 6:25:19 PM CEST Craig Tatlor wrote:
> >> Add initial pinctrl driver to support pin configuration with
> >> pinctrl framework for sdm660.
> >> Based off CAF implementation.
> >> 
> >> Signed-off-by: Craig Tatlor 
> >> ---
> >> 
> >> diff --git
> >a/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt
> >b/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt
> >> new file mode 100644
> >> index ..85e6c6c17c04
> >> --- /dev/null
> >> +++
> >b/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt
> >> @@ -0,0 +1,195 @@
> >> +Qualcomm Technologies, Inc. SDM660 TLMM block
> >> +
> >> +This binding describes the Top Level Mode Multiplexer block found in
> >the
> >> +SDM660 platform.
> >> +
> >> +- compatible:
> >> +  Usage: required
> >> +  Value type: 
> >> +  Definition: must be "qcom,sdm660-pinctrl"
> >> +
> >> +- reg:
> >> +  Usage: required
> >> +  Value type: 
> >> +  Definition: the base address and size of the TLMM register space.
> >> +
> >> +- interrupts:
> >> +  Usage: required
> >> +  Value type: 
> >> +  Definition: should specify the TLMM summary IRQ.
> >> +
> >> +- interrupt-controller:
> >> +  Usage: required
> >> +  Value type: 
> >> +  Definition: identifies this node as an interrupt controller
> >> +
> >> +- #interrupt-cells:
> >> +  Usage: required
> >> +  Value type: 
> >> +  Definition: must be 2. Specifying the pin number and flags, as
> >defined
> >> +  in 
> >> +
> >> +- gpio-controller:
> >> +  Usage: required
> >> +  Value type: 
> >> +  Definition: identifies this node as a gpio controller
> >> +
> >> +- #gpio-cells:
> >> +  Usage: required
> >> +  Value type: 
> >> +  Definition: must be 2. Specifying the pin number and flags, as
> >defined
> >> +  in 
> >> +
> >> +Please refer to ../gpio/gpio.txt and
> >../interrupt-controller/interrupts.txt for
> >> +a general description of GPIO and interrupt bindings.
> >You want to specify gpio-ranges here as well. The property is explained
> >in Section "2.1) gpio- and pin-controller interaction" in
> >../gpio/gpio.txt
> >
> >Without it, the gpio-hogs construct (part of ../gpio/gpio.txt) will
> >cause
> >the driver to fail during boot. (try it, ;-) )
> Would gpio-ranges make sense for this, as the gpio and pinctrl are in same 
> block?
Yes, it's part of the ../gpio/gpio.txt which you link.
Here's a copy of the relevant section that explains this
gpio- and pin-controller interaction.


|2.1) gpio- and pin-controller interaction
|-
|
|Some or all of the GPIOs provided by a GPIO controller may be routed to pins
|on the package via a pin controller. This allows muxing those pins between
|GPIO and other functions.
|It is useful to represent which GPIOs correspond to which pins on which pin
|controllers. The gpio-ranges property described below represents this, and
|contains information structures as follows:
|
|   gpio-range-list ::=  [gpio-range-list]
|   single-gpio-range ::=  | 
|   numeric-gpio-range ::=
|  
|   named-gpio-range ::=   '<0 0>'
|   pinctrl-phandle : phandle to pin controller node
|   gpio-base : Base GPIO ID in the GPIO controller
|   pinctrl-base : Base pinctrl pin ID in the pin controller
|   count : The number of GPIOs/pins in this range
|
|The "pin controller node" mentioned above must conform to the bindings
|described in ../pinctrl/pinctrl-bindings.txt.
|...

As for the reason why gpio-ranges is what it is, please look at the ML
discussion from the "pinctrl: msm: fix gpio-hog related boot issues" thread
on  and the posts by 
Linus Walleij:  and
Stephen Boyd: .
(It's quite a bit to take in)

> Seems no other qcom pinctrl drivers have it and I'm able to boot without it.
Ok, let's run an experiment. Please remove the gpio-ranges property and try
adding a test gpio-hog to your device's DTS:

something like (I randomly selected GPIO5, but it shouldn't
matter which gpio you select here. If you know an unused/NC
pin/gpio, then you can use it instead):

 {
test-hog {
gpio-hog;
gpios = <5 0>;
output-low;
line-name = "test hog";
};
};

compile it and then watch the kernel on the next boot:

without the gpio-ranges present, it will spew out something along the
lines of:

| requesting hog GPIO test hog (chip 300.pinctrl, offset 5) failed, -517
| gpiochip_add_data: GPIOs 0..114 (300.pinctrl) failed to register
| sdm660-pinctrl 300.pinctrl: Failed register gpiochip

The single gpio-hog causes havoc and takes down the sdm660-pinctrl with it.
And every driver that depends on the 

[PATCH v2 3/3] arm64: dts: actions: Add sirq node for Actions Semi S700

2018-08-12 Thread Parthiban Nallathambi
Add sirq node for the Actions Semi S700 SoC with support for 3 SIRQ pins,
to which external interrupt controllers can be connected.

Example:
atc260x: atc2603c@65 {
interrupt-parent = <>;
interrupts = <2 IRQ_TYPE_LEVEL_HIGH>;
};

Signed-off-by: Parthiban Nallathambi 
Signed-off-by: Saravanan Sekar 
---
 arch/arm64/boot/dts/actions/s700.dtsi | 9 +
 1 file changed, 9 insertions(+)

diff --git a/arch/arm64/boot/dts/actions/s700.dtsi 
b/arch/arm64/boot/dts/actions/s700.dtsi
index 66dd5309f0a2..c5aef5ac7f46 100644
--- a/arch/arm64/boot/dts/actions/s700.dtsi
+++ b/arch/arm64/boot/dts/actions/s700.dtsi
@@ -153,6 +153,15 @@
status = "disabled";
};
 
+   sirq: interrupt-controller@e01b {
+   compatible = "actions,owl-sirq";
+   reg = <0 0xe01b 0 0x1000>;
+   interrupt-controller;
+   #interrupt-cells = <2>;
+   actions,sirq-shared-reg;
+   actions,sirq-offset = <0x200 0x200 0x200>;
+   };
+
sps: power-controller@e01b0100 {
compatible = "actions,s700-sps";
reg = <0x0 0xe01b0100 0x0 0x100>;
-- 
2.14.4



[PATCH v2 2/3] drivers/irqchip: Add Actions external interrupts support

2018-08-12 Thread Parthiban Nallathambi
Actions Semi Owl family SoCs S500, S700 and S900 provide support
for 3 external interrupt controllers through SIRQ pins.

Each line can be independently configured as an interrupt that triggers
on either of the edges (rising or falling) or either of the levels
(high or low). Each line can also be masked independently.

Signed-off-by: Parthiban Nallathambi 
Signed-off-by: Saravanan Sekar 
---
 drivers/irqchip/Makefile   |   1 +
 drivers/irqchip/irq-owl-sirq.c | 305 +
 2 files changed, 306 insertions(+)
 create mode 100644 drivers/irqchip/irq-owl-sirq.c

diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index 15f268f646bf..072c4409e7c4 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_ATH79) += irq-ath79-misc.o
 obj-$(CONFIG_ARCH_BCM2835) += irq-bcm2835.o
 obj-$(CONFIG_ARCH_BCM2835) += irq-bcm2836.o
 obj-$(CONFIG_ARCH_EXYNOS)  += exynos-combiner.o
+obj-$(CONFIG_ARCH_ACTIONS) += irq-owl-sirq.o
 obj-$(CONFIG_FARADAY_FTINTC010)+= irq-ftintc010.o
 obj-$(CONFIG_ARCH_HIP04)   += irq-hip04.o
 obj-$(CONFIG_ARCH_LPC32XX) += irq-lpc32xx.o
diff --git a/drivers/irqchip/irq-owl-sirq.c b/drivers/irqchip/irq-owl-sirq.c
new file mode 100644
index ..b69301388300
--- /dev/null
+++ b/drivers/irqchip/irq-owl-sirq.c
@@ -0,0 +1,305 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ *
+ * Actions Semi Owl SoCs SIRQ interrupt controller driver
+ *
+ * Copyright (C) 2014 Actions Semi Inc.
+ * David Liu 
+ *
+ * Author: Parthiban Nallathambi 
+ * Author: Saravanan Sekar 
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#define INTC_GIC_INTERRUPT_PIN		13
+
+/* Bits of one SIRQ line's external interrupt control (EXTCTL) value */
+#define INTC_EXTCTL_PENDING		BIT(0)
+#define INTC_EXTCTL_CLK_SEL		BIT(4)
+#define INTC_EXTCTL_EN			BIT(5)
+/* GENMASK() takes (high, low): the trigger type occupies bits 7:6 */
+#define INTC_EXTCTL_TYPE_MASK		GENMASK(7, 6)
+#define INTC_EXTCTL_TYPE_HIGH		0
+#define INTC_EXTCTL_TYPE_LOW		BIT(6)
+#define INTC_EXTCTL_TYPE_RISING		BIT(7)
+#define INTC_EXTCTL_TYPE_FALLING	(BIT(6) | BIT(7))
+
+/*
+ * Register offset of SIRQ line x. Expands to a reference to a variable
+ * named chip_data, which must be in scope wherever the macro is used.
+ */
+#define get_sirq_offset(x)	(chip_data->sirq[(x)].offset)
+
+/* Per SIRQ data: state kept for one SIRQ line.
+ * The driver keeps an array of these and indexes it by hwirq
+ * (chip_data->sirq[hwirq]).
+ */
+struct owl_sirq {
+   u16 offset; /* added to chip_data->base to address this line's register */
+   /* software is responsible for clearing the interrupt pending bit when
+    * the type is edge triggered. This flag is kept per SIRQ line and is
+    * checked in the ack path before the pending bit is written.
+    */
+   bool type_edge;
+};
+
+struct owl_sirq_chip_data {
+   void __iomem *base; /* MMIO base; registers are accessed at base + per-line offset */
+   raw_spinlock_t lock; /* presumably the lock taken by ack/mask around register updates - confirm */
+   /* some SoCs share one register between all SIRQ lines, so record here
+    * whether the register is shared or not. This value is from DT.
+    * When set, the read/write helpers treat each line's control value as
+    * an 8-bit field of that register at bit position (2 - hwirq) * 8.
+    */
+   bool shared_reg;
+   struct owl_sirq *sirq; /* per-line data, indexed by hwirq */
+};
+
+static struct owl_sirq_chip_data *sirq_data; /* file-scope pointer to the chip data */
+
+/*
+ * Read the EXTCTL control value of the SIRQ line described by @data.
+ *
+ * The register at base + the line's offset is read. If the register is
+ * shared between lines, only this line's byte (bit position
+ * (2 - hwirq) * 8) is returned; otherwise the raw register value is.
+ *
+ * The local must be called chip_data: get_sirq_offset() expands to it.
+ */
+static unsigned int sirq_read_extctl(struct irq_data *data)
+{
+	struct owl_sirq_chip_data *chip_data = data->chip_data;
+	unsigned int extctl;
+
+	extctl = readl_relaxed(chip_data->base + get_sirq_offset(data->hwirq));
+	if (!chip_data->shared_reg)
+		return extctl;
+
+	return (extctl >> ((2 - data->hwirq) * 8)) & 0xff;
+}
+
+/*
+ * Write the EXTCTL control value for the SIRQ line described by @data.
+ *
+ * With a private register, @extctl is written as is. With a shared
+ * register only the low 8 bits of @extctl are used: they replace this
+ * line's byte at bit position (2 - hwirq) * 8, and the remaining bits
+ * are written back as they were just read.
+ *
+ * The local must be called chip_data: get_sirq_offset() expands to it.
+ */
+static void sirq_write_extctl(struct irq_data *data, unsigned int extctl)
+{
+	struct owl_sirq_chip_data *chip_data = data->chip_data;
+	void __iomem *reg = chip_data->base + get_sirq_offset(data->hwirq);
+
+	if (chip_data->shared_reg) {
+		unsigned long shift = (2 - data->hwirq) * 8;
+		unsigned int others = readl_relaxed(reg) & ~(0xff << shift);
+
+		extctl = ((extctl & 0xff) << shift) | others;
+	}
+
+	writel_relaxed(extctl, reg);
+}
+
+/*
+ * Acknowledge the SIRQ interrupt described by @data.
+ *
+ * For an edge-triggered line software must clear the external interrupt
+ * pending state itself: with the chip lock held, the line's EXTCTL value
+ * is read, INTC_EXTCTL_PENDING is set in it and it is written back.
+ * The ack is then always forwarded to the parent chip.
+ */
+static void owl_sirq_ack(struct irq_data *data)
+{
+	struct owl_sirq_chip_data *chip_data = data->chip_data;
+	unsigned int extctl;
+	unsigned long flags;
+
+	/* pending is cleared per SIRQ line, and only for edge-triggered lines */
+	if (chip_data->sirq[data->hwirq].type_edge) {
+		/*
+		 * The read-modify-write below must not interleave with
+		 * another update of the same register.
+		 * NOTE(review): lock argument reconstructed as
+		 * &chip_data->lock - confirm against the original patch.
+		 */
+		raw_spin_lock_irqsave(&chip_data->lock, flags);
+
+		extctl = sirq_read_extctl(data);
+		extctl |= INTC_EXTCTL_PENDING;
+		sirq_write_extctl(data, extctl);
+
+		raw_spin_unlock_irqrestore(&chip_data->lock, flags);
+	}
+	irq_chip_ack_parent(data);
+}
+
+/*
+ * Mask the SIRQ interrupt described by @data.
+ *
+ * With the chip lock held, INTC_EXTCTL_EN is cleared in the line's EXTCTL
+ * value; afterwards the mask request is forwarded to the parent chip.
+ */
+static void owl_sirq_mask(struct irq_data *data)
+{
+	struct owl_sirq_chip_data *chip_data = data->chip_data;
+	unsigned int extctl;
+	unsigned long flags;
+
+	/*
+	 * Keep the read-modify-write of the control register atomic.
+	 * NOTE(review): lock argument reconstructed as &chip_data->lock -
+	 * confirm against the original patch.
+	 */
+	raw_spin_lock_irqsave(&chip_data->lock, flags);
+
+	extctl = sirq_read_extctl(data);
+	extctl &= ~(INTC_EXTCTL_EN);
+	sirq_write_extctl(data, extctl);
+
+	raw_spin_unlock_irqrestore(&chip_data->lock, flags);
+	irq_chip_mask_parent(data);
+}
+

[PATCH v2 0/3] Add Actions Semi Owl family sirq support

2018-08-12 Thread Parthiban Nallathambi
This patch series adds support for the external interrupt controller
in the Actions Semi Owl family of SoCs (S500, S700 and S900). Actions
provides support for external interrupt controllers to be connected
to its SoCs using 3 SIRQ pins.

Each line can be configured independently, i.e. 3 independent external
interrupt controllers can be connected and managed in parallel.

The device tree node is created only for S700, after testing it on Cubieboard7.

Changelog in v2:
- Added SIRQ as hierarchical chip
GIC <> SIRQ <> External interrupt controller/Child devices
- Device binding updates with vendor prefix
- Register sharing handled globally and common init sequence/data for all
actions SoC family

Thanks,
Parthiban
Saravanan

Parthiban Nallathambi (3):
  dt-bindings: interrupt-controller: Actions external interrupt
controller
  drivers/irqchip: Add Actions external interrupts support
  arm64: dts: actions: Add sirq node for Actions Semi S700

 .../interrupt-controller/actions,owl-sirq.txt  |  46 
 arch/arm64/boot/dts/actions/s700.dtsi  |   9 +
 drivers/irqchip/Makefile   |   1 +
 drivers/irqchip/irq-owl-sirq.c | 305 +
 4 files changed, 361 insertions(+)
 create mode 100644 
Documentation/devicetree/bindings/interrupt-controller/actions,owl-sirq.txt
 create mode 100644 drivers/irqchip/irq-owl-sirq.c

-- 
2.14.4



[PATCH v2 2/3] drivers/irqchip: Add Actions external interrupts support

2018-08-12 Thread Parthiban Nallathambi
Actions Semi Owl family SoCs S500, S700 and S900 provide support
for 3 external interrupt controllers through SIRQ pins.

Each line can be independently configured as interrupt and triggers
on either of the edges (rising or falling) or either of the levels
(high or low). Each line can also be masked independently.

Signed-off-by: Parthiban Nallathambi 
Signed-off-by: Saravanan Sekar 
---
 drivers/irqchip/Makefile   |   1 +
 drivers/irqchip/irq-owl-sirq.c | 305 +
 2 files changed, 306 insertions(+)
 create mode 100644 drivers/irqchip/irq-owl-sirq.c

diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index 15f268f646bf..072c4409e7c4 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_ATH79) += irq-ath79-misc.o
 obj-$(CONFIG_ARCH_BCM2835) += irq-bcm2835.o
 obj-$(CONFIG_ARCH_BCM2835) += irq-bcm2836.o
 obj-$(CONFIG_ARCH_EXYNOS)  += exynos-combiner.o
+obj-$(CONFIG_ARCH_ACTIONS) += irq-owl-sirq.o
 obj-$(CONFIG_FARADAY_FTINTC010)+= irq-ftintc010.o
 obj-$(CONFIG_ARCH_HIP04)   += irq-hip04.o
 obj-$(CONFIG_ARCH_LPC32XX) += irq-lpc32xx.o
diff --git a/drivers/irqchip/irq-owl-sirq.c b/drivers/irqchip/irq-owl-sirq.c
new file mode 100644
index 000000000000..b69301388300
--- /dev/null
+++ b/drivers/irqchip/irq-owl-sirq.c
@@ -0,0 +1,305 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ *
+ * Actions Semi Owl SoCs SIRQ interrupt controller driver
+ *
+ * Copyright (C) 2014 Actions Semi Inc.
+ * David Liu 
+ *
+ * Author: Parthiban Nallathambi 
+ * Author: Saravanan Sekar 
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#define INTC_GIC_INTERRUPT_PIN		13
+
+/* Bits of one SIRQ line's external interrupt control (EXTCTL) value */
+#define INTC_EXTCTL_PENDING		BIT(0)
+#define INTC_EXTCTL_CLK_SEL		BIT(4)
+#define INTC_EXTCTL_EN			BIT(5)
+/* GENMASK() takes (high, low): the trigger type occupies bits 7:6 */
+#define INTC_EXTCTL_TYPE_MASK		GENMASK(7, 6)
+#define INTC_EXTCTL_TYPE_HIGH		0
+#define INTC_EXTCTL_TYPE_LOW		BIT(6)
+#define INTC_EXTCTL_TYPE_RISING		BIT(7)
+#define INTC_EXTCTL_TYPE_FALLING	(BIT(6) | BIT(7))
+
+/*
+ * Register offset of SIRQ line x. Expands to a reference to a variable
+ * named chip_data, which must be in scope wherever the macro is used.
+ */
+#define get_sirq_offset(x)	(chip_data->sirq[(x)].offset)
+
+/* Per SIRQ data: state kept for one SIRQ line.
+ * The driver keeps an array of these and indexes it by hwirq
+ * (chip_data->sirq[hwirq]).
+ */
+struct owl_sirq {
+   u16 offset; /* added to chip_data->base to address this line's register */
+   /* software is responsible for clearing the interrupt pending bit when
+    * the type is edge triggered. This flag is kept per SIRQ line and is
+    * checked in the ack path before the pending bit is written.
+    */
+   bool type_edge;
+};
+
+struct owl_sirq_chip_data {
+   void __iomem *base; /* MMIO base; registers are accessed at base + per-line offset */
+   raw_spinlock_t lock; /* presumably the lock taken by ack/mask around register updates - confirm */
+   /* some SoCs share one register between all SIRQ lines, so record here
+    * whether the register is shared or not. This value is from DT.
+    * When set, the read/write helpers treat each line's control value as
+    * an 8-bit field of that register at bit position (2 - hwirq) * 8.
+    */
+   bool shared_reg;
+   struct owl_sirq *sirq; /* per-line data, indexed by hwirq */
+};
+
+static struct owl_sirq_chip_data *sirq_data; /* file-scope pointer to the chip data */
+
+/*
+ * Read the EXTCTL control value of the SIRQ line described by @data.
+ *
+ * The register at base + the line's offset is read. If the register is
+ * shared between lines, only this line's byte (bit position
+ * (2 - hwirq) * 8) is returned; otherwise the raw register value is.
+ *
+ * The local must be called chip_data: get_sirq_offset() expands to it.
+ */
+static unsigned int sirq_read_extctl(struct irq_data *data)
+{
+	struct owl_sirq_chip_data *chip_data = data->chip_data;
+	unsigned int extctl;
+
+	extctl = readl_relaxed(chip_data->base + get_sirq_offset(data->hwirq));
+	if (!chip_data->shared_reg)
+		return extctl;
+
+	return (extctl >> ((2 - data->hwirq) * 8)) & 0xff;
+}
+
+/*
+ * Write the EXTCTL control value for the SIRQ line described by @data.
+ *
+ * With a private register, @extctl is written as is. With a shared
+ * register only the low 8 bits of @extctl are used: they replace this
+ * line's byte at bit position (2 - hwirq) * 8, and the remaining bits
+ * are written back as they were just read.
+ *
+ * The local must be called chip_data: get_sirq_offset() expands to it.
+ */
+static void sirq_write_extctl(struct irq_data *data, unsigned int extctl)
+{
+	struct owl_sirq_chip_data *chip_data = data->chip_data;
+	void __iomem *reg = chip_data->base + get_sirq_offset(data->hwirq);
+
+	if (chip_data->shared_reg) {
+		unsigned long shift = (2 - data->hwirq) * 8;
+		unsigned int others = readl_relaxed(reg) & ~(0xff << shift);
+
+		extctl = ((extctl & 0xff) << shift) | others;
+	}
+
+	writel_relaxed(extctl, reg);
+}
+
+/*
+ * Acknowledge the SIRQ interrupt described by @data.
+ *
+ * For an edge-triggered line software must clear the external interrupt
+ * pending state itself: with the chip lock held, the line's EXTCTL value
+ * is read, INTC_EXTCTL_PENDING is set in it and it is written back.
+ * The ack is then always forwarded to the parent chip.
+ */
+static void owl_sirq_ack(struct irq_data *data)
+{
+	struct owl_sirq_chip_data *chip_data = data->chip_data;
+	unsigned int extctl;
+	unsigned long flags;
+
+	/* pending is cleared per SIRQ line, and only for edge-triggered lines */
+	if (chip_data->sirq[data->hwirq].type_edge) {
+		/*
+		 * The read-modify-write below must not interleave with
+		 * another update of the same register.
+		 * NOTE(review): lock argument reconstructed as
+		 * &chip_data->lock - confirm against the original patch.
+		 */
+		raw_spin_lock_irqsave(&chip_data->lock, flags);
+
+		extctl = sirq_read_extctl(data);
+		extctl |= INTC_EXTCTL_PENDING;
+		sirq_write_extctl(data, extctl);
+
+		raw_spin_unlock_irqrestore(&chip_data->lock, flags);
+	}
+	irq_chip_ack_parent(data);
+}
+
+/*
+ * Mask the SIRQ interrupt described by @data.
+ *
+ * With the chip lock held, INTC_EXTCTL_EN is cleared in the line's EXTCTL
+ * value; afterwards the mask request is forwarded to the parent chip.
+ */
+static void owl_sirq_mask(struct irq_data *data)
+{
+	struct owl_sirq_chip_data *chip_data = data->chip_data;
+	unsigned int extctl;
+	unsigned long flags;
+
+	/*
+	 * Keep the read-modify-write of the control register atomic.
+	 * NOTE(review): lock argument reconstructed as &chip_data->lock -
+	 * confirm against the original patch.
+	 */
+	raw_spin_lock_irqsave(&chip_data->lock, flags);
+
+	extctl = sirq_read_extctl(data);
+	extctl &= ~(INTC_EXTCTL_EN);
+	sirq_write_extctl(data, extctl);
+
+	raw_spin_unlock_irqrestore(&chip_data->lock, flags);
+	irq_chip_mask_parent(data);
+}
+

  1   2   >