Re: [PATCH] powerpc: Move a dereference below a NULL test

2018-09-26 Thread zhong jiang
On 2018/9/26 22:22, Michal Suchánek wrote:
> On Wed, 26 Sep 2018 19:46:08 +0800
> zhong jiang  wrote:
>
>> It is safe to move the dereference below the NULL test.
>>
>> Signed-off-by: zhong jiang 
>> ---
>>  arch/powerpc/kernel/cacheinfo.c | 5 ++---
>>  1 file changed, 2 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
>> index a8f20e5..7f19714 100644
>> --- a/arch/powerpc/kernel/cacheinfo.c
>> +++ b/arch/powerpc/kernel/cacheinfo.c
>> @@ -401,14 +401,13 @@ static struct cache *cache_lookup_or_instantiate(struct device_node *node,
>>  	struct cache *cache;
>>  cache = cache_lookup_by_node(node);
>> +if (!cache)
>> +cache = cache_do_one_devnode(node, level);
>>  
>>  WARN_ONCE(cache && cache->level != level,
> This also has a NULL test, so cache should be dereferenced only when
> non-NULL here.
:-[ , you're right. I forgot the WARN_ONCE. Please ignore the patch.

Sincerely,
zhong jiang
> Thanks
>
> Michal
>
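For reference, WARN_ONCE() only evaluates its format arguments when the
condition is true, so the "cache &&" term in the condition already guards
the cache->level in the arguments. A simplified sketch of the macro's
shape (the real one lives in include/asm-generic/bug.h and differs in
detail):

#define WARN_ONCE(condition, format...) ({			\
	static bool __warned;					\
	int __ret_warn_once = !!(condition);			\
								\
	if (unlikely(__ret_warn_once && !__warned)) {		\
		__warned = true;				\
		/* format args are evaluated only here, i.e.	\
		 * only when the condition already held */	\
		WARN(1, format);				\
	}							\
	unlikely(__ret_warn_once);				\
})

So the original code had no NULL dereference to fix; the reordering only
changed which caches the warning covers, as pointed out above.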




Re: [PATCH] powerpc: Move a dereference below a NULL test

2018-09-26 Thread zhong jiang
On 2018/9/26 21:58, Christophe LEROY wrote:
>
>
> On 26/09/2018 at 13:46, zhong jiang wrote:
>> It is safe to move the dereference below the NULL test.
>>
>> Signed-off-by: zhong jiang 
>> ---
>>   arch/powerpc/kernel/cacheinfo.c | 5 ++---
>>   1 file changed, 2 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/powerpc/kernel/cacheinfo.c 
>> b/arch/powerpc/kernel/cacheinfo.c
>> index a8f20e5..7f19714 100644
>> --- a/arch/powerpc/kernel/cacheinfo.c
>> +++ b/arch/powerpc/kernel/cacheinfo.c
>> @@ -401,14 +401,13 @@ static struct cache 
>> *cache_lookup_or_instantiate(struct device_node *node,
>>   struct cache *cache;
>> cache = cache_lookup_by_node(node);
>> +if (!cache)
>> +cache = cache_do_one_devnode(node, level);
>
> But by doing this, you change the meaning of the following warning. Is that
> what you want? In that case the text of the WARN_ONCE() should be changed,
> because the mismatch is not only on lookup now.
>
Yep, I forgot the WARN_ONCE. I think we should just remove it. Thoughts?

Thanks,
zhong jiang
> Christophe
>
>> WARN_ONCE(cache && cache->level != level,
>> "cache level mismatch on lookup (got %d, expected %d)\n",
>> cache->level, level);
>>   -if (!cache)
>> -cache = cache_do_one_devnode(node, level);
>> -
>>   return cache;
>>   }
>>  
>
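If the reordering were kept, the warning would have to cover the
instantiation path as well. A sketch of what that could look like (an
illustration of Christophe's point, not a patch from this thread):

	cache = cache_lookup_by_node(node);
	if (!cache)
		cache = cache_do_one_devnode(node, level);

	/* The mismatch can now come from lookup or instantiation,
	 * so drop "on lookup" from the message. */
	WARN_ONCE(cache && cache->level != level,
		  "cache level mismatch (got %d, expected %d)\n",
		  cache->level, level);

	return cache;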




[PATCH] powerpc: Move a dereference below a NULL test

2018-09-26 Thread zhong jiang
It is safe to move the dereference below the NULL test.

Signed-off-by: zhong jiang 
---
 arch/powerpc/kernel/cacheinfo.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
index a8f20e5..7f19714 100644
--- a/arch/powerpc/kernel/cacheinfo.c
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -401,14 +401,13 @@ static struct cache *cache_lookup_or_instantiate(struct 
device_node *node,
struct cache *cache;
 
cache = cache_lookup_by_node(node);
+   if (!cache)
+   cache = cache_do_one_devnode(node, level);
 
WARN_ONCE(cache && cache->level != level,
  "cache level mismatch on lookup (got %d, expected %d)\n",
  cache->level, level);
 
-   if (!cache)
-   cache = cache_do_one_devnode(node, level);
-
return cache;
 }
 
-- 
1.7.12.4



[PATCH] powerpc: xive: Move a dereference below a NULL test

2018-09-26 Thread zhong jiang
It is safe to move the dereference below the NULL test.

Signed-off-by: zhong jiang 
---
 arch/powerpc/sysdev/xive/common.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 959a2a6..9824074 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -1010,12 +1010,13 @@ static void xive_ipi_eoi(struct irq_data *d)
 {
struct xive_cpu *xc = __this_cpu_read(xive_cpu);
 
-   DBG_VERBOSE("IPI eoi: irq=%d [0x%lx] (HW IRQ 0x%x) pending=%02x\n",
-   d->irq, irqd_to_hwirq(d), xc->hw_ipi, xc->pending_prio);
-
/* Handle possible race with unplug and drop stale IPIs */
if (!xc)
return;
+
+   DBG_VERBOSE("IPI eoi: irq=%d [0x%lx] (HW IRQ 0x%x) pending=%02x\n",
+   d->irq, irqd_to_hwirq(d), xc->hw_ipi, xc->pending_prio);
+
	xive_do_source_eoi(xc->hw_ipi, &xc->ipi_data);
xive_do_queue_eoi(xc);
 }
-- 
1.7.12.4



Re: [PATCH 2/2] powerpc: Use ARRAY_SIZE to replace its implementation

2018-08-14 Thread zhong jiang
On 2018/8/14 17:28, Michael Ellerman wrote:
> zhong jiang  writes:
>> Use ARRAY_SIZE instead of dividing the size of the array by the size of
>> an element.
>>
>> Signed-off-by: zhong jiang 
>> ---
>>  arch/powerpc/xmon/ppc-opc.c | 12 
>>  1 file changed, 4 insertions(+), 8 deletions(-)
> This code is copied from binutils and we don't want to needlessly cause
> it to diverge from the binutils copy.
>
> So thanks but no thanks.
Thank you for the clarification.

Sincerely,
zhong jiang
> cheers
>
>> diff --git a/arch/powerpc/xmon/ppc-opc.c b/arch/powerpc/xmon/ppc-opc.c
>> index ac2b55b..f3f57a1 100644
>> --- a/arch/powerpc/xmon/ppc-opc.c
>> +++ b/arch/powerpc/xmon/ppc-opc.c
>> @@ -966,8 +966,7 @@
>>{ 0xff, 11, NULL, NULL, PPC_OPERAND_SIGNOPT },
>>  };
>>  
>> -const unsigned int num_powerpc_operands = (sizeof (powerpc_operands)
>> -   / sizeof (powerpc_operands[0]));
>> +const unsigned int num_powerpc_operands = ARRAY_SIZE(powerpc_operands);
>>  
>>  /* The functions used to insert and extract complicated operands.  */
>>  
>> @@ -6980,8 +6979,7 @@
>>  {"fcfidu.", XRC(63,974,1),  XRA_MASK, POWER7|PPCA2, PPCVLE, {FRT, 
>> FRB}},
>>  };
>>  
>> -const int powerpc_num_opcodes =
>> -  sizeof (powerpc_opcodes) / sizeof (powerpc_opcodes[0]);
>> +const int powerpc_num_opcodes = ARRAY_SIZE(powerpc_opcodes);
>>  
>>  /* The VLE opcode table.
>>  
>> @@ -7219,8 +7217,7 @@
>>  {"se_bl",   BD8(58,0,1),BD8_MASK,   PPCVLE, 0,  {B8}},
>>  };
>>  
>> -const int vle_num_opcodes =
>> -  sizeof (vle_opcodes) / sizeof (vle_opcodes[0]);
>> +const int vle_num_opcodes = ARRAY_SIZE(vle_opcodes);
>>  
>>  /* The macro table.  This is only used by the assembler.  */
>>  
>> @@ -7288,5 +7285,4 @@
>>  {"e_clrlslwi",4, PPCVLE, "e_rlwinm %0,%1,%3,(%2)-(%3),31-(%3)"},
>>  };
>>  
>> -const int powerpc_num_macros =
>> -  sizeof (powerpc_macros) / sizeof (powerpc_macros[0]);
>> +const int powerpc_num_macros = ARRAY_SIZE(powerpc_macros);
>> -- 
>> 1.7.12.4




Re: [PATCH 1/2] ia64: Use ARRAY_SIZE to replace its implementation

2018-08-13 Thread zhong jiang
On 2018/8/14 12:45, Joe Perches wrote:
> On Tue, 2018-08-14 at 10:46 +0800, zhong jiang wrote:
>> We prefer ARRAY_SIZE rather than duplicating its implementation.
> []
>> diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
> []
>> @@ -4645,7 +4645,7 @@ static char *pfmfs_dname(struct dentry *dentry, char 
>> *buffer, int buflen)
>>  /* 32 */PFM_CMD(pfm_write_ibrs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, 
>> pfarg_dbreg_t, NULL),
>>  /* 33 */PFM_CMD(pfm_write_dbrs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, 
>> pfarg_dbreg_t, NULL)
>>  };
>> -#define PFM_CMD_COUNT   (sizeof(pfm_cmd_tab)/sizeof(pfm_cmd_desc_t))
>> +#define PFM_CMD_COUNT   ARRAY_SIZE(pfm_cmd_tab)
> Better would be to remove the #define altogether and change
> the one place where it's used to ARRAY_SIZE(...)
> ---
>  arch/ia64/kernel/perfmon.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
>
> diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
> index a9d4dc6c0427..08ece2c7b6e1 100644
> --- a/arch/ia64/kernel/perfmon.c
> +++ b/arch/ia64/kernel/perfmon.c
> @@ -4645,7 +4645,6 @@ static pfm_cmd_desc_t pfm_cmd_tab[]={
>  /* 32 */PFM_CMD(pfm_write_ibrs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, 
> pfarg_dbreg_t, NULL),
>  /* 33 */PFM_CMD(pfm_write_dbrs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, 
> pfarg_dbreg_t, NULL)
>  };
> -#define PFM_CMD_COUNT	(sizeof(pfm_cmd_tab)/sizeof(pfm_cmd_desc_t))
>  
>  static int
>  pfm_check_task_state(pfm_context_t *ctx, int cmd, unsigned long flags)
> @@ -4770,7 +4769,7 @@ sys_perfmonctl (int fd, int cmd, void __user *arg, int 
> count)
>*/
>   if (unlikely(pmu_conf == NULL)) return -ENOSYS;
>  
> - if (unlikely(cmd < 0 || cmd >= PFM_CMD_COUNT)) {
> + if (unlikely(cmd < 0 || cmd >= ARRAY_SIZE(pfm_cmd_tab))) {
>   DPRINT(("invalid cmd=%d\n", cmd));
>   return -EINVAL;
>   }
>
>
Thank you for the suggestion. That's indeed better if just one place uses
it. I will repost in v2.

 Sincerely,
zhong jiang



[PATCH 2/2] powerpc: Use ARRAY_SIZE to replace its implementation

2018-08-13 Thread zhong jiang
Use ARRAY_SIZE instead of dividing the size of the array by the size of
an element.

Signed-off-by: zhong jiang 
---
 arch/powerpc/xmon/ppc-opc.c | 12 
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/xmon/ppc-opc.c b/arch/powerpc/xmon/ppc-opc.c
index ac2b55b..f3f57a1 100644
--- a/arch/powerpc/xmon/ppc-opc.c
+++ b/arch/powerpc/xmon/ppc-opc.c
@@ -966,8 +966,7 @@
   { 0xff, 11, NULL, NULL, PPC_OPERAND_SIGNOPT },
 };
 
-const unsigned int num_powerpc_operands = (sizeof (powerpc_operands)
-  / sizeof (powerpc_operands[0]));
+const unsigned int num_powerpc_operands = ARRAY_SIZE(powerpc_operands);
 
 /* The functions used to insert and extract complicated operands.  */
 
@@ -6980,8 +6979,7 @@
 {"fcfidu.",XRC(63,974,1),  XRA_MASK, POWER7|PPCA2, PPCVLE, {FRT, 
FRB}},
 };
 
-const int powerpc_num_opcodes =
-  sizeof (powerpc_opcodes) / sizeof (powerpc_opcodes[0]);
+const int powerpc_num_opcodes = ARRAY_SIZE(powerpc_opcodes);
 
 /* The VLE opcode table.
 
@@ -7219,8 +7217,7 @@
 {"se_bl",  BD8(58,0,1),BD8_MASK,   PPCVLE, 0,  {B8}},
 };
 
-const int vle_num_opcodes =
-  sizeof (vle_opcodes) / sizeof (vle_opcodes[0]);
+const int vle_num_opcodes = ARRAY_SIZE(vle_opcodes);
 
 /* The macro table.  This is only used by the assembler.  */
 
@@ -7288,5 +7285,4 @@
 {"e_clrlslwi",4, PPCVLE, "e_rlwinm %0,%1,%3,(%2)-(%3),31-(%3)"},
 };
 
-const int powerpc_num_macros =
-  sizeof (powerpc_macros) / sizeof (powerpc_macros[0]);
+const int powerpc_num_macros = ARRAY_SIZE(powerpc_macros);
-- 
1.7.12.4
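For context, the kernel's ARRAY_SIZE() is not just shorter than the
open-coded division; it also rejects pointers at build time. Roughly
(simplified from include/linux/kernel.h):

#include <linux/kernel.h>

/*
 * Approximately:
 *	#define ARRAY_SIZE(arr) \
 *		(sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
 *
 * __must_be_array() turns into a build error when 'arr' is a pointer,
 * whereas sizeof(ptr)/sizeof(ptr[0]) silently compiles to nonsense.
 */
static const int primes[] = { 2, 3, 5, 7, 11 };
static const unsigned int num_primes = ARRAY_SIZE(primes);	/* 5 */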



[PATCH 1/2] ia64: Use ARRAY_SIZE to replace its implementation

2018-08-13 Thread zhong jiang
We prefer ARRAY_SIZE rather than duplicating its implementation.

Signed-off-by: zhong jiang 
---
 arch/ia64/kernel/perfmon.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index a9d4dc6..6cbe6e0 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -4645,7 +4645,7 @@ static char *pfmfs_dname(struct dentry *dentry, char 
*buffer, int buflen)
 /* 32 */PFM_CMD(pfm_write_ibrs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, 
pfarg_dbreg_t, NULL),
 /* 33 */PFM_CMD(pfm_write_dbrs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, 
pfarg_dbreg_t, NULL)
 };
-#define PFM_CMD_COUNT  (sizeof(pfm_cmd_tab)/sizeof(pfm_cmd_desc_t))
+#define PFM_CMD_COUNT  ARRAY_SIZE(pfm_cmd_tab)
 
 static int
 pfm_check_task_state(pfm_context_t *ctx, int cmd, unsigned long flags)
-- 
1.7.12.4



[PATCH 0/2] Use ARRAY_SIZE to replace its implementation

2018-08-13 Thread zhong jiang
The issue is detected with the help of Coccinelle.

zhong jiang (2):
  ia64: Use ARRAY_SIZE to replace its implementation
  powerpc: Use ARRAY_SIZE to replace its implementation

 arch/ia64/kernel/perfmon.c  |  2 +-
 arch/powerpc/xmon/ppc-opc.c | 12 
 2 files changed, 5 insertions(+), 9 deletions(-)

-- 
1.7.12.4



[PATCH] powerpc/feature: of_node_put is not needed after iterator

2018-08-04 Thread zhong jiang
for_each_node_by_name() iterators only exit normally when the loop cursor
is NULL, so there is no point in calling of_node_put() afterwards.

Signed-off-by: zhong jiang 
---
 arch/powerpc/platforms/powermac/feature.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/powerpc/platforms/powermac/feature.c 
b/arch/powerpc/platforms/powermac/feature.c
index 3f82cb2..4eb8cb3 100644
--- a/arch/powerpc/platforms/powermac/feature.c
+++ b/arch/powerpc/platforms/powermac/feature.c
@@ -2889,10 +2889,8 @@ static void __init probe_one_macio(const char *name, 
const char *compat, int typ
/* On all machines, switch modem & serial ports off */
for_each_node_by_name(np, "ch-a")
initial_serial_shutdown(np);
-   of_node_put(np);
for_each_node_by_name(np, "ch-b")
initial_serial_shutdown(np);
-   of_node_put(np);
 }
 
 void __init
-- 
1.7.12.4
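For context, the iterator expands roughly as follows (simplified from
include/linux/of.h), which is why the trailing of_node_put() calls were
redundant:

#define for_each_node_by_name(dn, name) \
	for (dn = of_find_node_by_name(NULL, name); dn; \
	     dn = of_find_node_by_name(dn, name))

/*
 * of_find_node_by_name(from, name) drops the reference on 'from'
 * internally, so each iteration puts the previous cursor, and a normal
 * exit leaves dn == NULL; of_node_put(NULL) is then a harmless no-op,
 * as the patch above notes.
 */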



Re: [PATCH v11 19/26] mm: provide speculative fault infrastructure

2018-07-25 Thread zhong jiang
On 2018/7/25 18:44, Laurent Dufour wrote:
>
> On 25/07/2018 11:04, zhong jiang wrote:
>> On 2018/7/25 0:10, Laurent Dufour wrote:
>>> On 24/07/2018 16:26, zhong jiang wrote:
>>>> On 2018/5/17 19:06, Laurent Dufour wrote:
>>>>> From: Peter Zijlstra 
>>>>>
>>>>> Provide infrastructure to do a speculative fault (not holding
>>>>> mmap_sem).
>>>>>
>>>>> The not holding of mmap_sem means we can race against VMA
>>>>> change/removal and page-table destruction. We use the SRCU VMA freeing
>>>>> to keep the VMA around. We use the VMA seqcount to detect change
>>>>> (including umapping / page-table deletion) and we use gup_fast() style
>>>>> page-table walking to deal with page-table races.
>>>>>
>>>>> Once we've obtained the page and are ready to update the PTE, we
>>>>> validate if the state we started the fault with is still valid, if
>>>>> not, we'll fail the fault with VM_FAULT_RETRY, otherwise we update the
>>>>> PTE and we're done.
>>>>>
>>>>> Signed-off-by: Peter Zijlstra (Intel) 
>>>>>
>>>>> [Manage the newly introduced pte_spinlock() for speculative page
>>>>>  fault to fail if the VMA is touched in our back]
>>>>> [Rename vma_is_dead() to vma_has_changed() and declare it here]
>>>>> [Fetch p4d and pud]
>>>>> [Set vmd.sequence in __handle_mm_fault()]
>>>>> [Abort speculative path when handle_userfault() has to be called]
>>>>> [Add additional VMA's flags checks in handle_speculative_fault()]
>>>>> [Clear FAULT_FLAG_ALLOW_RETRY in handle_speculative_fault()]
>>>>> [Don't set vmf->pte and vmf->ptl if pte_map_lock() failed]
>>>>> [Remove warning comment about waiting for !seq&1 since we don't want
>>>>>  to wait]
>>>>> [Remove warning about no huge page support, mention it explicitly]
>>>>> [Don't call do_fault() in the speculative path as __do_fault() calls
>>>>>  vma->vm_ops->fault() which may want to release mmap_sem]
>>>>> [Only vm_fault pointer argument for vma_has_changed()]
>>>>> [Fix check against huge page, calling pmd_trans_huge()]
>>>>> [Use READ_ONCE() when reading VMA's fields in the speculative path]
>>>>> [Explicitly check for __HAVE_ARCH_PTE_SPECIAL as we can't support for
>>>>>  processing done in vm_normal_page()]
>>>>> [Check that vma->anon_vma is already set when starting the speculative
>>>>>  path]
>>>>> [Check for memory policy as we can't support MPOL_INTERLEAVE case due to
>>>>>  the processing done in mpol_misplaced()]
>>>>> [Don't support VMA growing up or down]
>>>>> [Move check on vm_sequence just before calling handle_pte_fault()]
>>>>> [Don't build SPF services if !CONFIG_SPECULATIVE_PAGE_FAULT]
>>>>> [Add mem cgroup oom check]
>>>>> [Use READ_ONCE to access p*d entries]
>>>>> [Replace deprecated ACCESS_ONCE() by READ_ONCE() in vma_has_changed()]
>>>>> [Don't fetch pte again in handle_pte_fault() when running the speculative
>>>>>  path]
>>>>> [Check PMD against concurrent collapsing operation]
>>>>> [Try spin lock the pte during the speculative path to avoid deadlock with
>>>>>  other CPU's invalidating the TLB and requiring this CPU to catch the
>>>>>  inter processor's interrupt]
>>>>> [Move define of FAULT_FLAG_SPECULATIVE here]
>>>>> [Introduce __handle_speculative_fault() and add a check against
>>>>>  mm->mm_users in handle_speculative_fault() defined in mm.h]
>>>>> Signed-off-by: Laurent Dufour 
>>>>> ---
>>>>>  include/linux/hugetlb_inline.h |   2 +-
>>>>>  include/linux/mm.h |  30 
>>>>>  include/linux/pagemap.h|   4 +-
>>>>>  mm/internal.h  |  16 +-
>>>>>  mm/memory.c| 340 
>>>>> -
>>>>>  5 files changed, 385 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/hugetlb_inline.h 
>>>>> b/include/linux/hugetlb_inline.h
>>>>> index 0660a03d37d9..9e25283d6fc9 100644
>>>>> --- a/include/linux/hugetlb_inline.h
>>>>> +++ b/include/linux/hugetlb_inline.h
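The validation scheme described in the quoted commit message boils down
to a seqcount check bracketing the speculative work. A minimal sketch,
assuming the vm_sequence seqcount this series adds to struct
vm_area_struct (a hypothetical helper, not the series' actual code):

static int speculative_fault_sketch(struct vm_area_struct *vma,
				    unsigned long address)
{
	unsigned int seq;

	/* Odd count: a writer is mid-update, fall back immediately. */
	seq = raw_read_seqcount(&vma->vm_sequence);
	if (seq & 1)
		return VM_FAULT_RETRY;

	/* ... gup_fast()-style page-table walk, prepare the new PTE ... */

	/* Revalidate before committing: if the VMA changed under us,
	 * retry the fault the classic way, with mmap_sem held. */
	if (read_seqcount_retry(&vma->vm_sequence, seq))
		return VM_FAULT_RETRY;

	/* ... install the PTE ... */
	return 0;
}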

Re: [PATCH v11 19/26] mm: provide speculative fault infrastructure

2018-07-25 Thread zhong jiang
On 2018/7/25 0:10, Laurent Dufour wrote:
>
> On 24/07/2018 16:26, zhong jiang wrote:
>> On 2018/5/17 19:06, Laurent Dufour wrote:
>>> From: Peter Zijlstra 
>>>
>>> Provide infrastructure to do a speculative fault (not holding
>>> mmap_sem).
>>>
>>> The not holding of mmap_sem means we can race against VMA
>>> change/removal and page-table destruction. We use the SRCU VMA freeing
>>> to keep the VMA around. We use the VMA seqcount to detect change
>>> (including umapping / page-table deletion) and we use gup_fast() style
>>> page-table walking to deal with page-table races.
>>>
>>> Once we've obtained the page and are ready to update the PTE, we
>>> validate if the state we started the fault with is still valid, if
>>> not, we'll fail the fault with VM_FAULT_RETRY, otherwise we update the
>>> PTE and we're done.
>>>
>>> Signed-off-by: Peter Zijlstra (Intel) 
>>>
>>> [Manage the newly introduced pte_spinlock() for speculative page
>>>  fault to fail if the VMA is touched in our back]
>>> [Rename vma_is_dead() to vma_has_changed() and declare it here]
>>> [Fetch p4d and pud]
>>> [Set vmd.sequence in __handle_mm_fault()]
>>> [Abort speculative path when handle_userfault() has to be called]
>>> [Add additional VMA's flags checks in handle_speculative_fault()]
>>> [Clear FAULT_FLAG_ALLOW_RETRY in handle_speculative_fault()]
>>> [Don't set vmf->pte and vmf->ptl if pte_map_lock() failed]
>>> [Remove warning comment about waiting for !seq&1 since we don't want
>>>  to wait]
>>> [Remove warning about no huge page support, mention it explicitly]
>>> [Don't call do_fault() in the speculative path as __do_fault() calls
>>>  vma->vm_ops->fault() which may want to release mmap_sem]
>>> [Only vm_fault pointer argument for vma_has_changed()]
>>> [Fix check against huge page, calling pmd_trans_huge()]
>>> [Use READ_ONCE() when reading VMA's fields in the speculative path]
>>> [Explicitly check for __HAVE_ARCH_PTE_SPECIAL as we can't support for
>>>  processing done in vm_normal_page()]
>>> [Check that vma->anon_vma is already set when starting the speculative
>>>  path]
>>> [Check for memory policy as we can't support MPOL_INTERLEAVE case due to
>>>  the processing done in mpol_misplaced()]
>>> [Don't support VMA growing up or down]
>>> [Move check on vm_sequence just before calling handle_pte_fault()]
>>> [Don't build SPF services if !CONFIG_SPECULATIVE_PAGE_FAULT]
>>> [Add mem cgroup oom check]
>>> [Use READ_ONCE to access p*d entries]
>>> [Replace deprecated ACCESS_ONCE() by READ_ONCE() in vma_has_changed()]
>>> [Don't fetch pte again in handle_pte_fault() when running the speculative
>>>  path]
>>> [Check PMD against concurrent collapsing operation]
>>> [Try spin lock the pte during the speculative path to avoid deadlock with
>>>  other CPU's invalidating the TLB and requiring this CPU to catch the
>>>  inter processor's interrupt]
>>> [Move define of FAULT_FLAG_SPECULATIVE here]
>>> [Introduce __handle_speculative_fault() and add a check against
>>>  mm->mm_users in handle_speculative_fault() defined in mm.h]
>>> Signed-off-by: Laurent Dufour 
>>> ---
>>>  include/linux/hugetlb_inline.h |   2 +-
>>>  include/linux/mm.h |  30 
>>>  include/linux/pagemap.h|   4 +-
>>>  mm/internal.h  |  16 +-
>>>  mm/memory.c| 340 
>>> -
>>>  5 files changed, 385 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
>>> index 0660a03d37d9..9e25283d6fc9 100644
>>> --- a/include/linux/hugetlb_inline.h
>>> +++ b/include/linux/hugetlb_inline.h
>>> @@ -8,7 +8,7 @@
>>>  
>>>  static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
>>>  {
>>> -   return !!(vma->vm_flags & VM_HUGETLB);
>>> +   return !!(READ_ONCE(vma->vm_flags) & VM_HUGETLB);
>>>  }
>>>  
>>>  #else
>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>> index 05cbba70104b..31acf98a7d92 100644
>>> --- a/include/linux/mm.h
>>> +++ b/include/linux/mm.h
>>> @@ -315,6 +315,7 @@ extern pgprot_t protection_map[16];
>>>  #define FAULT_FLAG_USER	0x40	/* The fault originated in userspace */

Re: [PATCH v11 19/26] mm: provide speculative fault infrastructure

2018-07-24 Thread zhong jiang
> 	if ((vmf->flags & FAULT_FLAG_SPECULATIVE) && userfaultfd_missing(vma)) {
>   pte_unmap_unlock(vmf->pte, vmf->ptl);
>   mem_cgroup_cancel_charge(page, memcg, false);
>   put_page(page);
> @@ -3994,13 +4116,22 @@ static int handle_pte_fault(struct vm_fault *vmf)
>  
>   if (unlikely(pmd_none(*vmf->pmd))) {
>   /*
> +  * In the case of the speculative page fault handler we abort
> +  * the speculative path immediately as the pmd is probably
> +  * in the way to be converted in a huge one. We will try
> +  * again holding the mmap_sem (which implies that the collapse
> +  * operation is done).
> +  */
> + if (vmf->flags & FAULT_FLAG_SPECULATIVE)
> + return VM_FAULT_RETRY;
> + /*
>* Leave __pte_alloc() until later: because vm_ops->fault may
>* want to allocate huge page, and if we expose page table
>* for an instant, it will be difficult to retract from
>* concurrent faults and from rmap lookups.
>*/
>   vmf->pte = NULL;
> - } else {
> + } else if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
>   /* See comment in pte_alloc_one_map() */
>   if (pmd_devmap_trans_unstable(vmf->pmd))
>   return 0;
> @@ -4009,6 +4140,9 @@ static int handle_pte_fault(struct vm_fault *vmf)
>* pmd from under us anymore at this point because we hold the
>* mmap_sem read mode and khugepaged takes it in write mode.
>* So now it's safe to run pte_offset_map().
> +  * This is not applicable to the speculative page fault handler
> +  * but in that case, the pte is fetched earlier in
> +  * handle_speculative_fault().
>*/
>   vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
>   vmf->orig_pte = *vmf->pte;
> @@ -4031,6 +4165,8 @@ static int handle_pte_fault(struct vm_fault *vmf)
>   if (!vmf->pte) {
>   if (vma_is_anonymous(vmf->vma))
>   return do_anonymous_page(vmf);
> + else if (vmf->flags & FAULT_FLAG_SPECULATIVE)
> + return VM_FAULT_RETRY;
>   else
>   return do_fault(vmf);
>   }
> @@ -4128,6 +4264,9 @@ static int __handle_mm_fault(struct vm_area_struct 
> *vma, unsigned long address,
>   vmf.pmd = pmd_alloc(mm, vmf.pud, address);
>   if (!vmf.pmd)
>   return VM_FAULT_OOM;
> +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
> + vmf.sequence = raw_read_seqcount(&vma->vm_sequence);
> +#endif
>   if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
>   ret = create_huge_pmd(&vmf);
>   if (!(ret & VM_FAULT_FALLBACK))
> @@ -4161,6 +4300,201 @@ static int __handle_mm_fault(struct vm_area_struct 
> *vma, unsigned long address,
>   return handle_pte_fault(&vmf);
>  }
>  
> +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
> +/*
> + * Tries to handle the page fault in a speculative way, without grabbing the
> + * mmap_sem.
> + */
> +int __handle_speculative_fault(struct mm_struct *mm, unsigned long address,
> +unsigned int flags)
> +{
> + struct vm_fault vmf = {
> + .address = address,
> + };
> + pgd_t *pgd, pgdval;
> + p4d_t *p4d, p4dval;
> + pud_t pudval;
> + int seq, ret = VM_FAULT_RETRY;
> + struct vm_area_struct *vma;
> +#ifdef CONFIG_NUMA
> + struct mempolicy *pol;
> +#endif
> +
> + /* Clear flags that may lead to release the mmap_sem to retry */
> + flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE);
> + flags |= FAULT_FLAG_SPECULATIVE;
> +
> + vma = get_vma(mm, address);
> + if (!vma)
> + return ret;
> +
> + seq = raw_read_seqcount(&vma->vm_sequence); /* rmb <-> seqlock, vma_rb_erase() */
> + if (seq & 1)
> + goto out_put;
> +
> + /*
> +  * Can't call vm_ops services as we don't know what they would do
> +  * with the VMA.
> +  * This includes huge pages from hugetlbfs.
> +  */
> + if (vma->vm_ops)
> + goto out_put;
> +
Hi Laurent,

I think that most page faults will leave here. Is there any case that
needs to be skipped?
I have tested the following patch; it works well.

diff --git a/mm/memory.c b/mm/memory.c
index 936128b..9bc1545 100644
 @@ -3893,8 +3898,6 @@ static int handle_pte_fault(struct fault_env *fe)
if (!fe->pte) {