Re: [RFC PATCH 5/5] mm, page_alloc: Introduce ZONELIST_FALLBACK_SAME_TYPE fallback list

2019-04-24 Thread Xishi Qiu
Hi Fan Du,

I think we should change the print in mminit_verify_zonelist too.

This patch changes the order of ZONELIST_FALLBACK, so the default numa policy 
can
alloc DRAM first, then PMEM, right?

Thanks,
Xishi Qiu
> On system with heterogeneous memory, reasonable fallback lists would be:
> a. No fall back, stick to current running node.
> b. Fall back to other nodes of the same type or different type
>    e.g. DRAM node 0 -> DRAM node 1 -> PMEM node 2 -> PMEM node 3
> c. Fall back to other nodes of the same type only.
>    e.g. DRAM node 0 -> DRAM node 1
> 
> a. is already in place, previous patch implement b. providing way to
> satisfy memory request as best effort by default. And this patch of
> writing build c. to fallback to the same node type when user specify
> GFP_SAME_NODE_TYPE only.
> 
> Signed-off-by: Fan Du 
> ---
>  include/linux/gfp.h|  7 +++
>  include/linux/mmzone.h |  1 +
>  mm/page_alloc.c| 15 +++
>  3 files changed, 23 insertions(+)
> 
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index fdab7de..ca5fdfc 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -44,6 +44,8 @@
>  #else
>  #define ___GFP_NOLOCKDEP 0
>  #endif
> +#define ___GFP_SAME_NODE_TYPE 0x100u
> +
>  /* If the above are modified, __GFP_BITS_SHIFT may need updating */
>  
>  /*
> @@ -215,6 +217,7 @@
>  
>  /* Disable lockdep for GFP context tracking */
>  #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
> +#define __GFP_SAME_NODE_TYPE ((__force gfp_t)___GFP_SAME_NODE_TYPE)
>  
>  /* Room for N __GFP_FOO bits */
>  #define __GFP_BITS_SHIFT (23 + IS_ENABLED(CONFIG_LOCKDEP))
> @@ -301,6 +304,8 @@
>  __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
>  #define GFP_TRANSHUGE (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)
>  
> +#define GFP_SAME_NODE_TYPE (__GFP_SAME_NODE_TYPE)
> +
>  /* Convert GFP flags to their corresponding migrate type */
>  #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
>  #define GFP_MOVABLE_SHIFT 3
> @@ -438,6 +443,8 @@ static inline int gfp_zonelist(gfp_t flags)
>  #ifdef CONFIG_NUMA
>   if (unlikely(flags & __GFP_THISNODE))
>    return ZONELIST_NOFALLBACK;
> + if (unlikely(flags & __GFP_SAME_NODE_TYPE))
> +  return ZONELIST_FALLBACK_SAME_TYPE;
>  #endif
>   return ZONELIST_FALLBACK;
>  }
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 8c37e1c..2f8603e 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -583,6 +583,7 @@ static inline bool zone_intersects(struct zone *zone,
>  
>  enum {
>   ZONELIST_FALLBACK, /* zonelist with fallback */
> + ZONELIST_FALLBACK_SAME_TYPE, /* zonelist with fallback to the same type 
> node */
>  #ifdef CONFIG_NUMA
>   /*
>    * The NUMA zonelists are doubled because we need zonelists that
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index a408a91..de797921 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5448,6 +5448,21 @@ static void 
> build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
>   }
>   zonerefs->zone = NULL;
>   zonerefs->zone_idx = 0;
> +
> + zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK_SAME_TYPE]._zonerefs;
> +
> + for (i = 0; i < nr_nodes; i++) {
> +  int nr_zones;
> +
> +  pg_data_t *node = NODE_DATA(node_order[i]);
> +
> +  if (!is_node_same_type(node->node_id, pgdat->node_id))
> +   continue;
> +  nr_zones = build_zonerefs_node(node, zonerefs);
> +  zonerefs += nr_zones;
> + }
> + zonerefs->zone = NULL;
> + zonerefs->zone_idx = 0;
>  }
>  
>  /*
> -- 
> 1.8.3.1
> 
> 


Re: [PATCH] mm:vmalloc add vm_struct for vm_map_ram

2018-11-11 Thread Xishi Qiu



On 2018/11/8 19:14, Zhaoyang Huang wrote:
> From: Zhaoyang Huang 
> 
> There is no caller and pages information etc for the area which is
> created by vm_map_ram as well as the page count > VMAP_MAX_ALLOC.
> Add them on in this commit.
> 
> Signed-off-by: Zhaoyang Huang 
> ---
>  mm/vmalloc.c | 30 --
>  1 file changed, 20 insertions(+), 10 deletions(-)
> 
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index cfea25b..819b690 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -45,7 +45,8 @@ struct vfree_deferred {
>  static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
>  
>  static void __vunmap(const void *, int);
> -
> +static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
> +   unsigned long flags, const void *caller);
>  static void free_work(struct work_struct *w)
>  {
>   struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
> @@ -1138,6 +1139,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
>   BUG_ON(!va);
>   debug_check_no_locks_freed((void *)va->va_start,
>   (va->va_end - va->va_start));
> + kfree(va->vm);
>   free_unmap_vmap_area(va);
>  }
>  EXPORT_SYMBOL(vm_unmap_ram);
> @@ -1170,6 +1172,8 @@ void *vm_map_ram(struct page **pages, unsigned int 
> count, int node, pgprot_t pro
>   addr = (unsigned long)mem;
>   } else {
>   struct vmap_area *va;
> + struct vm_struct *area;
> +
>   va = alloc_vmap_area(size, PAGE_SIZE,
>   VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
>   if (IS_ERR(va))
> @@ -1177,11 +1181,17 @@ void *vm_map_ram(struct page **pages, unsigned int 
> count, int node, pgprot_t pro
>  
>   addr = va->va_start;
>   mem = (void *)addr;
> + area = kzalloc_node(sizeof(*area), GFP_KERNEL, node);
> + if (likely(area)) {
> + setup_vmalloc_vm(area, va, 0, 
> __builtin_return_address(0));
> + va->flags &= ~VM_VM_AREA;
> + }
Hi Zhaoyang,

I think if we set the flag VM_VM_AREA, that means we have some info,
so how about not clearing the flag after setup_vmalloc_vm, and just
updating the print in s_show?

...
if (v->flags & VM_ALLOC)
seq_puts(m, " vmalloc");
+   if (v->flags & VM_MAP_RAM)  // add a new flag for vm_map_ram?
+   seq_puts(m, " vm_map_ram");
...

Thanks,
Xishi Qiu
>   }
>   if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
>   vm_unmap_ram(mem, count);
>   return NULL;
>   }
> +
>   return mem;
>  }
>  EXPORT_SYMBOL(vm_map_ram);
> @@ -2688,19 +2698,19 @@ static int s_show(struct seq_file *m, void *p)
>* s_show can encounter race with remove_vm_area, !VM_VM_AREA on
>* behalf of vmap area is being tear down or vm_map_ram allocation.
>*/
> - if (!(va->flags & VM_VM_AREA)) {
> - seq_printf(m, "0x%pK-0x%pK %7ld %s\n",
> - (void *)va->va_start, (void *)va->va_end,
> - va->va_end - va->va_start,
> - va->flags & VM_LAZY_FREE ? "unpurged vm_area" : 
> "vm_map_ram");
> -
> + if (!(va->flags & VM_VM_AREA) && !va->vm)
>   return 0;
> - }
>  
>   v = va->vm;
>  
> - seq_printf(m, "0x%pK-0x%pK %7ld",
> - v->addr, v->addr + v->size, v->size);
> + if (!(va->flags & VM_VM_AREA))
> + seq_printf(m, "0x%pK-0x%pK %7ld %s\n",
> + (void *)va->va_start, (void *)va->va_end,
> + va->va_end - va->va_start,
> + va->flags & VM_LAZY_FREE ? "unpurged vm_area" : 
> "vm_map_ram");
> + else
> + seq_printf(m, "0x%pK-0x%pK %7ld",
> + v->addr, v->addr + v->size, v->size);
>  
>   if (v->caller)
>   seq_printf(m, " %pS", v->caller);
> 


Re: [PATCH] mm:vmalloc add vm_struct for vm_map_ram

2018-11-11 Thread Xishi Qiu



On 2018/11/8 19:14, Zhaoyang Huang wrote:
> From: Zhaoyang Huang 
> 
> There is no caller and pages information etc for the area which is
> created by vm_map_ram as well as the page count > VMAP_MAX_ALLOC.
> Add them on in this commit.
> 
> Signed-off-by: Zhaoyang Huang 
> ---
>  mm/vmalloc.c | 30 --
>  1 file changed, 20 insertions(+), 10 deletions(-)
> 
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index cfea25b..819b690 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -45,7 +45,8 @@ struct vfree_deferred {
>  static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
>  
>  static void __vunmap(const void *, int);
> -
> +static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
> +   unsigned long flags, const void *caller);
>  static void free_work(struct work_struct *w)
>  {
>   struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
> @@ -1138,6 +1139,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
>   BUG_ON(!va);
>   debug_check_no_locks_freed((void *)va->va_start,
>   (va->va_end - va->va_start));
> + kfree(va->vm);
>   free_unmap_vmap_area(va);
>  }
>  EXPORT_SYMBOL(vm_unmap_ram);
> @@ -1170,6 +1172,8 @@ void *vm_map_ram(struct page **pages, unsigned int 
> count, int node, pgprot_t pro
>   addr = (unsigned long)mem;
>   } else {
>   struct vmap_area *va;
> + struct vm_struct *area;
> +
>   va = alloc_vmap_area(size, PAGE_SIZE,
>   VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
>   if (IS_ERR(va))
> @@ -1177,11 +1181,17 @@ void *vm_map_ram(struct page **pages, unsigned int 
> count, int node, pgprot_t pro
>  
>   addr = va->va_start;
>   mem = (void *)addr;
> + area = kzalloc_node(sizeof(*area), GFP_KERNEL, node);
> + if (likely(area)) {
> + setup_vmalloc_vm(area, va, 0, 
> __builtin_return_address(0));
> + va->flags &= ~VM_VM_AREA;
> + }
Hi Zhaoyang,

I think if we set the flag VM_VM_AREA, that means we have some info,
so how about not clearing the flag after setup_vmalloc_vm, and just
updating the print in s_show?

...
if (v->flags & VM_ALLOC)
seq_puts(m, " vmalloc");
+   if (v->flags & VM_MAP_RAM)  // add a new flag for vm_map_ram?
+   seq_puts(m, " vm_map_ram");
...

Thanks,
Xishi Qiu
>   }
>   if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
>   vm_unmap_ram(mem, count);
>   return NULL;
>   }
> +
>   return mem;
>  }
>  EXPORT_SYMBOL(vm_map_ram);
> @@ -2688,19 +2698,19 @@ static int s_show(struct seq_file *m, void *p)
>* s_show can encounter race with remove_vm_area, !VM_VM_AREA on
>* behalf of vmap area is being tear down or vm_map_ram allocation.
>*/
> - if (!(va->flags & VM_VM_AREA)) {
> - seq_printf(m, "0x%pK-0x%pK %7ld %s\n",
> - (void *)va->va_start, (void *)va->va_end,
> - va->va_end - va->va_start,
> - va->flags & VM_LAZY_FREE ? "unpurged vm_area" : 
> "vm_map_ram");
> -
> + if (!(va->flags & VM_VM_AREA) && !va->vm)
>   return 0;
> - }
>  
>   v = va->vm;
>  
> - seq_printf(m, "0x%pK-0x%pK %7ld",
> - v->addr, v->addr + v->size, v->size);
> + if (!(va->flags & VM_VM_AREA))
> + seq_printf(m, "0x%pK-0x%pK %7ld %s\n",
> + (void *)va->va_start, (void *)va->va_end,
> + va->va_end - va->va_start,
> + va->flags & VM_LAZY_FREE ? "unpurged vm_area" : 
> "vm_map_ram");
> + else
> + seq_printf(m, "0x%pK-0x%pK %7ld",
> + v->addr, v->addr + v->size, v->size);
>  
>   if (v->caller)
>   seq_printf(m, " %pS", v->caller);
> 


[RFC] mm: why active file + inactive file is larger than cached + buffers?

2018-05-25 Thread Xishi Qiu
Hi, I find the active file + inactive file is larger than cached + buffers, 
about 5G,
and can not free it by "echo 3 > /proc/sys/vm/drop_caches"

The meminfo shows that Mapped is also very small, so maybe something is holding a
reference to the pages? (e.g. get_user_pages())
Then the NR_FILE_PAGES count is decremented when the page is deleted from the page
cache, but because of the elevated reference count,
the page cannot be removed from the LRU, leaving active file + inactive file so large,
right?

MemTotal:   61528660 kB
MemFree:48678936 kB
MemAvailable:   53657348 kB
Buffers:   10832 kB
Cached:   640340 kB
SwapCached:0 kB
Active:  5933664 kB
Inactive: 588968 kB
Active(anon):1022200 kB
Inactive(anon):   533248 kB
Active(file):4911464 kB
Inactive(file):55720 kB
Unevictable:   0 kB
Mlocked:   0 kB
SwapTotal: 0 kB
SwapFree:  0 kB
Dirty: 10952 kB
Writeback: 0 kB
AnonPages:980572 kB
Mapped:85464 kB
Shmem:581580 kB
Slab: 343480 kB
SReclaimable: 264632 kB
SUnreclaim:78848 kB
KernelStack:8368 kB
PageTables:18012 kB
NFS_Unstable:  0 kB
Bounce:0 kB
WritebackTmp:  0 kB
CommitLimit:28667176 kB
Committed_AS:4144064 kB
VmallocTotal:   34359738367 kB
VmallocUsed:  265848 kB
VmallocChunk:   34359417096 kB
HardwareCorrupted: 0 kB
AnonHugePages:688128 kB
HugePages_Total:   4
HugePages_Free:0
HugePages_Rsvd:0
HugePages_Surp:0
Hugepagesize:1048576 kB
DirectMap4k:  182112 kB
DirectMap2M:11294720 kB
DirectMap1G:51380224 kB



[RFC] mm: why active file + inactive file is larger than cached + buffers?

2018-05-25 Thread Xishi Qiu
Hi, I find the active file + inactive file is larger than cached + buffers, 
about 5G,
and can not free it by "echo 3 > /proc/sys/vm/drop_caches"

The meminfo shows that Mapped is also very small, so maybe something is holding a
reference to the pages? (e.g. get_user_pages())
Then the NR_FILE_PAGES count is decremented when the page is deleted from the page
cache, but because of the elevated reference count,
the page cannot be removed from the LRU, leaving active file + inactive file so large,
right?

MemTotal:   61528660 kB
MemFree:48678936 kB
MemAvailable:   53657348 kB
Buffers:   10832 kB
Cached:   640340 kB
SwapCached:0 kB
Active:  5933664 kB
Inactive: 588968 kB
Active(anon):1022200 kB
Inactive(anon):   533248 kB
Active(file):4911464 kB
Inactive(file):55720 kB
Unevictable:   0 kB
Mlocked:   0 kB
SwapTotal: 0 kB
SwapFree:  0 kB
Dirty: 10952 kB
Writeback: 0 kB
AnonPages:980572 kB
Mapped:85464 kB
Shmem:581580 kB
Slab: 343480 kB
SReclaimable: 264632 kB
SUnreclaim:78848 kB
KernelStack:8368 kB
PageTables:18012 kB
NFS_Unstable:  0 kB
Bounce:0 kB
WritebackTmp:  0 kB
CommitLimit:28667176 kB
Committed_AS:4144064 kB
VmallocTotal:   34359738367 kB
VmallocUsed:  265848 kB
VmallocChunk:   34359417096 kB
HardwareCorrupted: 0 kB
AnonHugePages:688128 kB
HugePages_Total:   4
HugePages_Free:0
HugePages_Rsvd:0
HugePages_Surp:0
Hugepagesize:1048576 kB
DirectMap4k:  182112 kB
DirectMap2M:11294720 kB
DirectMap1G:51380224 kB



Re: [RFC] should BIOS change the efi type when we set CONFIG_X86_RESERVE_LOW ?

2018-04-23 Thread Xishi Qiu
On 2018/4/12 9:49, Xishi Qiu wrote:

> Hi, I find CONFIG_X86_RESERVE_LOW=64 in my system, so trim_low_memory_range()
> will reserve low 64kb memory. But efi_free_boot_services() will free it to
> buddy system again later because BIOS set the type to EFI_BOOT_SERVICES_CODE.
> 
> Here is the log:
> ...
> efi: mem03: type=3, attr=0xf, range=[0xe000-0x0001) 
> (0MB
> ...
> 
> 

When calling memblock_is_region_reserved(), it will set md->num_pages = 0 if the
memblock region is reserved. But trim_low_memory_range() reserves the region
after efi, so this breaks the logic, and efi_free_boot_services() will free
the pages (efi code/data). That means trim_low_memory_range() has not reserved
the low memory range.

...
efi_reserve_boot_services()
...
trim_low_memory_range()
...
efi_free_boot_services()
...

Shall we move trim_low_memory_range() before efi_reserve_boot_services()?

Thanks,
Xishi Qiu

> .
> 





Re: [RFC] should BIOS change the efi type when we set CONFIG_X86_RESERVE_LOW ?

2018-04-23 Thread Xishi Qiu
On 2018/4/12 9:49, Xishi Qiu wrote:

> Hi, I find CONFIG_X86_RESERVE_LOW=64 in my system, so trim_low_memory_range()
> will reserve low 64kb memory. But efi_free_boot_services() will free it to
> buddy system again later because BIOS set the type to EFI_BOOT_SERVICES_CODE.
> 
> Here is the log:
> ...
> efi: mem03: type=3, attr=0xf, range=[0xe000-0x0001) 
> (0MB
> ...
> 
> 

When calling memblock_is_region_reserved(), it will set md->num_pages = 0 if the
memblock region is reserved. But trim_low_memory_range() reserves the region
after efi, so this breaks the logic, and efi_free_boot_services() will free
the pages (efi code/data). That means trim_low_memory_range() has not reserved
the low memory range.

...
efi_reserve_boot_services()
...
trim_low_memory_range()
...
efi_free_boot_services()
...

Shall we move trim_low_memory_range() before efi_reserve_boot_services()?

Thanks,
Xishi Qiu

> .
> 





[RFC] should BIOS change the efi type when we set CONFIG_X86_RESERVE_LOW ?

2018-04-11 Thread Xishi Qiu
Hi, I find CONFIG_X86_RESERVE_LOW=64 in my system, so trim_low_memory_range()
will reserve low 64kb memory. But efi_free_boot_services() will free it to
buddy system again later because BIOS set the type to EFI_BOOT_SERVICES_CODE.

Here is the log:
...
efi: mem03: type=3, attr=0xf, range=[0xe000-0x0001) (0MB
...



[RFC] should BIOS change the efi type when we set CONFIG_X86_RESERVE_LOW ?

2018-04-11 Thread Xishi Qiu
Hi, I find CONFIG_X86_RESERVE_LOW=64 in my system, so trim_low_memory_range()
will reserve low 64kb memory. But efi_free_boot_services() will free it to
buddy system again later because BIOS set the type to EFI_BOOT_SERVICES_CODE.

Here is the log:
...
efi: mem03: type=3, attr=0xf, range=[0xe000-0x0001) (0MB
...



Re: [RFC] mm: why vfree() do not free page table memory?

2018-01-17 Thread Xishi Qiu
On 2018/1/17 17:16, Vlastimil Babka wrote:

> On 12/29/2017 09:58 AM, Xishi Qiu wrote:
>> When calling vfree(), it calls unmap_vmap_area() to clear page table,
>> but do not free the memory of page table, why? just for performance?
> 
> I guess it's expected that the free virtual range and associated page
> tables it might be reused later.
> 

Hi Vlastimil,

If we use vmalloc/vfree with different sizes, then there will be some holes from
VMALLOC_START to VMALLOC_END, and these holes take page table memory, right?

>> If a driver use vmalloc() and vfree() frequently, we will lost much
>> page table memory, maybe oom later.
> 
> If it's reused, then not really.
> 
> Did you notice an actual issue, or is this just theoretical concern.
> 

Yes, we have this problem on our production line.
I find the page table memory takes 200-300M.

Thanks,
Xishi Qiu

>> Thanks,
>> Xishi Qiu
>>
> 
> 
> .
> 





Re: [RFC] mm: why vfree() do not free page table memory?

2018-01-17 Thread Xishi Qiu
On 2018/1/17 17:16, Vlastimil Babka wrote:

> On 12/29/2017 09:58 AM, Xishi Qiu wrote:
>> When calling vfree(), it calls unmap_vmap_area() to clear page table,
>> but do not free the memory of page table, why? just for performance?
> 
> I guess it's expected that the free virtual range and associated page
> tables it might be reused later.
> 

Hi Vlastimil,

If we use vmalloc/vfree with different sizes, then there will be some holes from
VMALLOC_START to VMALLOC_END, and these holes take page table memory, right?

>> If a driver use vmalloc() and vfree() frequently, we will lost much
>> page table memory, maybe oom later.
> 
> If it's reused, then not really.
> 
> Did you notice an actual issue, or is this just theoretical concern.
> 

Yes, we have this problem on our production line.
I find the page table memory takes 200-300M.

Thanks,
Xishi Qiu

>> Thanks,
>> Xishi Qiu
>>
> 
> 
> .
> 





Re: [RFC] mm: why vfree() do not free page table memory?

2018-01-10 Thread Xishi Qiu
On 2017/12/29 16:58, Xishi Qiu wrote:

> When calling vfree(), it calls unmap_vmap_area() to clear page table,
> but do not free the memory of page table, why? just for performance?
> 
> If a driver use vmalloc() and vfree() frequently, we will lost much
> page table memory, maybe oom later.
> 
> Thanks,
> Xishi Qiu
> 

ping

> 
> .
> 





Re: [RFC] mm: why vfree() do not free page table memory?

2018-01-10 Thread Xishi Qiu
On 2017/12/29 16:58, Xishi Qiu wrote:

> When calling vfree(), it calls unmap_vmap_area() to clear page table,
> but do not free the memory of page table, why? just for performance?
> 
> If a driver use vmalloc() and vfree() frequently, we will lost much
> page table memory, maybe oom later.
> 
> Thanks,
> Xishi Qiu
> 

ping

> 
> .
> 





Re: [RFC] boot failed when enable KAISER/KPTI

2018-01-05 Thread Xishi Qiu
On 2018/1/6 2:33, Jiri Kosina wrote:

> On Fri, 5 Jan 2018, Xishi Qiu wrote:
> 
>> I run the latest RHEL 7.2 with the KAISER/KPTI patch, and boot failed.
>>
>> ...
>> [0.00] PM: Registered nosave memory: [mem 
>> 0x810-0x8ff]
>> [0.00] PM: Registered nosave memory: [mem 
>> 0x910-0xfff]
>> [0.00] PM: Registered nosave memory: [mem 
>> 0x1010-0x10ff]
>> [0.00] PM: Registered nosave memory: [mem 
>> 0x1110-0x17ff]
>> [0.00] PM: Registered nosave memory: [mem 
>> 0x1810-0x18ff]
>> [0.00] e820: [mem 0x9000-0xfed1bfff] available for PCI devices
>> [0.00] Booting paravirtualized kernel on bare hardware
>> [0.00] setup_percpu: NR_CPUS:5120 nr_cpumask_bits:1536 
>> nr_cpu_ids:1536 nr_node_ids:8
>> [0.00] PERCPU: max_distance=0x180ffe24 too large for vmalloc 
>> space 0x1fff
>> [0.00] setup_percpu: auto allocator failed (-22), falling back to 
>> page size
>> [0.00] PERCPU: 32 4K pages/cpu @c900 s107200 r8192 d15680
>> [0.00] Built 8 zonelists in Zone order, mobility grouping on.  Total 
>> pages: 132001804
>> [0.00] Policy zone: Normal
>> iosdevname=0 8250.nr_uarts=8 efi=old_map rdloaddriver=usb_storage 
>> rdloaddriver=sd_mod udev.event-timeout=600 softlockup_panic=0 
>> rcupdate.rcu_cpu_stall_timeout=300
>> [0.00] Intel-IOMMU: enabled
>> [0.00] PID hash table entries: 4096 (order: 3, 32768 bytes)
>> [0.00] x86/fpu: xstate_offset[2]: 0240, xstate_sizes[2]: 0100
>> [0.00] xsave: enabled xstate_bv 0x7, cntxt size 0x340
>> [0.00] AGP: Checking aperture...
>> [0.00] AGP: No AGP bridge found
>> [0.00] Memory: 526901612k/26910638080k available (6528k kernel code, 
>> 26374249692k absent, 9486776k reserved, 4302k data, 1676k init)
>> [0.00] SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=1536, Nodes=8
>> [0.00] x86/pti: Unmapping kernel while in userspace
>> [0.00] Hierarchical RCU implementation.
>> [0.00]   RCU restricting CPUs from NR_CPUS=5120 to 
>> nr_cpu_ids=1536.
>> [0.00]   Offload RCU callbacks from all CPUs
>> [0.00]   Offload RCU callbacks from CPUs: 0-1535.
>> [0.00] NR_IRQS:327936 nr_irqs:15976 0
>> [0.00] Console: colour dummy device 80x25
>> [0.00] console [tty0] enabled
>> [0.00] console [ttyS0] enabled
>> [0.00] allocated 2145910784 bytes of page_cgroup
>> [0.00] please try 'cgroup_disable=memory' option if you don't want 
>> memory cgroups
>> [0.00] Enabling automatic NUMA balancing. Configure with 
>> numa_balancing= or the kernel.numa_balancing sysctl
>> [0.00] tsc: Fast TSC calibration using PIT
>> [0.00] tsc: Detected 2799.999 MHz processor
>> [0.001803] Calibrating delay loop (skipped), value calculated using 
>> timer frequency.. 5599.99 BogoMIPS (lpj=279)
>> [0.012408] pid_max: default: 1572864 minimum: 12288
>> [0.017987] init_memory_mapping: [mem 0x5947f000-0x5b47efff]
>> [0.023701] init_memory_mapping: [mem 0x5b47f000-0x5b87efff]
>> [0.029369] init_memory_mapping: [mem 0x6d368000-0x6d3edfff]
>> [0.039130] BUG: unable to handle kernel paging request at 
>> 5b835f90
>> [0.046101] IP: [<5b835f90>] 0x5b835f8f
>> [0.050637] PGD 81f61067 PUD 190ffefff067 PMD 190ffeffd067 PTE 
>> 5b835063
>> [0.057989] Oops: 0011 [#1] SMP 
>> [0.061241] Modules linked in:
>> [0.064304] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 
>> 3.10.0-327.59.59.46.h42.x86_64 #1
>> [0.072280] Hardware name: Huawei FusionServer9032/IT91SMUB, BIOS 
>> BLXSV316 11/14/2017
>> [0.080082] task: 8196e440 ti: 81958000 task.ti: 
>> 81958000
>> [0.087539] RIP: 0010:[<5b835f90>]  [<5b835f90>] 
>> 0x5b835f8f
>> [0.094494] RSP: :8195be28  EFLAGS: 00010046
>> [0.099788] RAX: 80050033 RBX: 910fbc802000 RCX: 
>> 02d0
>> [0.106897] RDX: 0030 RSI: 02d0 RDI: 
>> 5b835f90
>> [0.114006] RBP: 8195bf38 R08: 0001 R09: 
>> 090fbc802000
>> [0.121116] R10: 88ffbcc07340 R11: 0001 R12: 
>> 0001
>> [0.128225] R13: 090fbc802000 R14: 02d0 R15: 
>> 0

Re: [RFC] boot failed when enable KAISER/KPTI

2018-01-05 Thread Xishi Qiu
On 2018/1/6 2:33, Jiri Kosina wrote:

> On Fri, 5 Jan 2018, Xishi Qiu wrote:
> 
>> I run the latest RHEL 7.2 with the KAISER/KPTI patch, and boot failed.
>>
>> ...
>> [0.00] PM: Registered nosave memory: [mem 
>> 0x810-0x8ff]
>> [0.00] PM: Registered nosave memory: [mem 
>> 0x910-0xfff]
>> [0.00] PM: Registered nosave memory: [mem 
>> 0x1010-0x10ff]
>> [0.00] PM: Registered nosave memory: [mem 
>> 0x1110-0x17ff]
>> [0.00] PM: Registered nosave memory: [mem 
>> 0x1810-0x18ff]
>> [0.00] e820: [mem 0x9000-0xfed1bfff] available for PCI devices
>> [0.00] Booting paravirtualized kernel on bare hardware
>> [0.00] setup_percpu: NR_CPUS:5120 nr_cpumask_bits:1536 
>> nr_cpu_ids:1536 nr_node_ids:8
>> [0.00] PERCPU: max_distance=0x180ffe24 too large for vmalloc 
>> space 0x1fff
>> [0.00] setup_percpu: auto allocator failed (-22), falling back to 
>> page size
>> [0.00] PERCPU: 32 4K pages/cpu @c900 s107200 r8192 d15680
>> [0.00] Built 8 zonelists in Zone order, mobility grouping on.  Total 
>> pages: 132001804
>> [0.00] Policy zone: Normal
>> iosdevname=0 8250.nr_uarts=8 efi=old_map rdloaddriver=usb_storage 
>> rdloaddriver=sd_mod udev.event-timeout=600 softlockup_panic=0 
>> rcupdate.rcu_cpu_stall_timeout=300
>> [0.00] Intel-IOMMU: enabled
>> [0.00] PID hash table entries: 4096 (order: 3, 32768 bytes)
>> [0.00] x86/fpu: xstate_offset[2]: 0240, xstate_sizes[2]: 0100
>> [0.00] xsave: enabled xstate_bv 0x7, cntxt size 0x340
>> [0.00] AGP: Checking aperture...
>> [0.00] AGP: No AGP bridge found
>> [0.00] Memory: 526901612k/26910638080k available (6528k kernel code, 
>> 26374249692k absent, 9486776k reserved, 4302k data, 1676k init)
>> [0.00] SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=1536, Nodes=8
>> [0.00] x86/pti: Unmapping kernel while in userspace
>> [0.00] Hierarchical RCU implementation.
>> [0.00]   RCU restricting CPUs from NR_CPUS=5120 to 
>> nr_cpu_ids=1536.
>> [0.00]   Offload RCU callbacks from all CPUs
>> [0.00]   Offload RCU callbacks from CPUs: 0-1535.
>> [0.00] NR_IRQS:327936 nr_irqs:15976 0
>> [0.00] Console: colour dummy device 80x25
>> [0.00] console [tty0] enabled
>> [0.00] console [ttyS0] enabled
>> [0.00] allocated 2145910784 bytes of page_cgroup
>> [0.00] please try 'cgroup_disable=memory' option if you don't want 
>> memory cgroups
>> [0.00] Enabling automatic NUMA balancing. Configure with 
>> numa_balancing= or the kernel.numa_balancing sysctl
>> [0.00] tsc: Fast TSC calibration using PIT
>> [0.00] tsc: Detected 2799.999 MHz processor
>> [0.001803] Calibrating delay loop (skipped), value calculated using 
>> timer frequency.. 5599.99 BogoMIPS (lpj=279)
>> [0.012408] pid_max: default: 1572864 minimum: 12288
>> [0.017987] init_memory_mapping: [mem 0x5947f000-0x5b47efff]
>> [0.023701] init_memory_mapping: [mem 0x5b47f000-0x5b87efff]
>> [0.029369] init_memory_mapping: [mem 0x6d368000-0x6d3edfff]
>> [0.039130] BUG: unable to handle kernel paging request at 
>> 5b835f90
>> [0.046101] IP: [<5b835f90>] 0x5b835f8f
>> [0.050637] PGD 81f61067 PUD 190ffefff067 PMD 190ffeffd067 PTE 
>> 5b835063
>> [0.057989] Oops: 0011 [#1] SMP 
>> [0.061241] Modules linked in:
>> [0.064304] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 
>> 3.10.0-327.59.59.46.h42.x86_64 #1
>> [0.072280] Hardware name: Huawei FusionServer9032/IT91SMUB, BIOS 
>> BLXSV316 11/14/2017
>> [0.080082] task: 8196e440 ti: 81958000 task.ti: 
>> 81958000
>> [0.087539] RIP: 0010:[<5b835f90>]  [<5b835f90>] 
>> 0x5b835f8f
>> [0.094494] RSP: :8195be28  EFLAGS: 00010046
>> [0.099788] RAX: 80050033 RBX: 910fbc802000 RCX: 
>> 02d0
>> [0.106897] RDX: 0030 RSI: 02d0 RDI: 
>> 5b835f90
>> [0.114006] RBP: 8195bf38 R08: 0001 R09: 
>> 090fbc802000
>> [0.121116] R10: 88ffbcc07340 R11: 0001 R12: 
>> 0001
>> [0.128225] R13: 090fbc802000 R14: 02d0 R15: 
>> 0

[RFC] boot failed when enable KAISER/KPTI

2018-01-04 Thread Xishi Qiu
I run the latest RHEL 7.2 with the KAISER/KPTI patch, and boot failed.

...
[0.00] PM: Registered nosave memory: [mem 0x810-0x8ff]
[0.00] PM: Registered nosave memory: [mem 0x910-0xfff]
[0.00] PM: Registered nosave memory: [mem 0x1010-0x10ff]
[0.00] PM: Registered nosave memory: [mem 0x1110-0x17ff]
[0.00] PM: Registered nosave memory: [mem 0x1810-0x18ff]
[0.00] e820: [mem 0x9000-0xfed1bfff] available for PCI devices
[0.00] Booting paravirtualized kernel on bare hardware
[0.00] setup_percpu: NR_CPUS:5120 nr_cpumask_bits:1536 nr_cpu_ids:1536 
nr_node_ids:8
[0.00] PERCPU: max_distance=0x180ffe24 too large for vmalloc space 
0x1fff
[0.00] setup_percpu: auto allocator failed (-22), falling back to page 
size
[0.00] PERCPU: 32 4K pages/cpu @c900 s107200 r8192 d15680
[0.00] Built 8 zonelists in Zone order, mobility grouping on.  Total 
pages: 132001804
[0.00] Policy zone: Normal
iosdevname=0 8250.nr_uarts=8 efi=old_map rdloaddriver=usb_storage 
rdloaddriver=sd_mod udev.event-timeout=600 softlockup_panic=0 
rcupdate.rcu_cpu_stall_timeout=300
[0.00] Intel-IOMMU: enabled
[0.00] PID hash table entries: 4096 (order: 3, 32768 bytes)
[0.00] x86/fpu: xstate_offset[2]: 0240, xstate_sizes[2]: 0100
[0.00] xsave: enabled xstate_bv 0x7, cntxt size 0x340
[0.00] AGP: Checking aperture...
[0.00] AGP: No AGP bridge found
[0.00] Memory: 526901612k/26910638080k available (6528k kernel code, 
26374249692k absent, 9486776k reserved, 4302k data, 1676k init)
[0.00] SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=1536, Nodes=8
[0.00] x86/pti: Unmapping kernel while in userspace
[0.00] Hierarchical RCU implementation.
[0.00]  RCU restricting CPUs from NR_CPUS=5120 to nr_cpu_ids=1536.
[0.00]  Offload RCU callbacks from all CPUs
[0.00]  Offload RCU callbacks from CPUs: 0-1535.
[0.00] NR_IRQS:327936 nr_irqs:15976 0
[0.00] Console: colour dummy device 80x25
[0.00] console [tty0] enabled
[0.00] console [ttyS0] enabled
[0.00] allocated 2145910784 bytes of page_cgroup
[0.00] please try 'cgroup_disable=memory' option if you don't want 
memory cgroups
[0.00] Enabling automatic NUMA balancing. Configure with 
numa_balancing= or the kernel.numa_balancing sysctl
[0.00] tsc: Fast TSC calibration using PIT
[0.00] tsc: Detected 2799.999 MHz processor
[0.001803] Calibrating delay loop (skipped), value calculated using timer 
frequency.. 5599.99 BogoMIPS (lpj=279)
[0.012408] pid_max: default: 1572864 minimum: 12288
[0.017987] init_memory_mapping: [mem 0x5947f000-0x5b47efff]
[0.023701] init_memory_mapping: [mem 0x5b47f000-0x5b87efff]
[0.029369] init_memory_mapping: [mem 0x6d368000-0x6d3edfff]
[0.039130] BUG: unable to handle kernel paging request at 5b835f90
[0.046101] IP: [<5b835f90>] 0x5b835f8f
[0.050637] PGD 81f61067 PUD 190ffefff067 PMD 190ffeffd067 PTE 
5b835063
[0.057989] Oops: 0011 [#1] SMP 
[0.061241] Modules linked in:
[0.064304] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 
3.10.0-327.59.59.46.h42.x86_64 #1
[0.072280] Hardware name: Huawei FusionServer9032/IT91SMUB, BIOS BLXSV316 
11/14/2017
[0.080082] task: 8196e440 ti: 81958000 task.ti: 
81958000
[0.087539] RIP: 0010:[<5b835f90>]  [<5b835f90>] 0x5b835f8f
[0.094494] RSP: :8195be28  EFLAGS: 00010046
[0.099788] RAX: 80050033 RBX: 910fbc802000 RCX: 02d0
[0.106897] RDX: 0030 RSI: 02d0 RDI: 5b835f90
[0.114006] RBP: 8195bf38 R08: 0001 R09: 090fbc802000
[0.121116] R10: 88ffbcc07340 R11: 0001 R12: 0001
[0.128225] R13: 090fbc802000 R14: 02d0 R15: 0001
[0.135336] FS:  () GS:c900() 
knlGS:
[0.143398] CS:  0010 DS:  ES:  CR0: 80050033
[0.149124] CR2: 5b835f90 CR3: 01966000 CR4: 000606b0
[0.156234] DR0:  DR1:  DR2: 
[0.163344] DR3:  DR6: fffe0ff0 DR7: 0400
[0.170454] Call Trace:
[0.172899]  [] ? efi_call4+0x6c/0xf0
[0.178108]  [] ? native_flush_tlb_global+0x8e/0xc0
[0.184527]  [] ? set_memory_x+0x43/0x50
[0.189997]  [] ? efi_enter_virtual_mode+0x3bc/0x538
[0.196505]  [] start_kernel+0x39f/0x44f
[0.201972]  [] ? repair_env_string+0x5c/0x5c
[0.207872]  [] ? early_idt_handlers+0x120/0x120
[0.214030]  [] x86_64_start_reservations+0x2a/0x2c
[0.220449]  [] x86_64_start_kernel+0x152/0x175
[0.226521] Code:  Bad 

[RFC] boot failed when enable KAISER/KPTI

2018-01-04 Thread Xishi Qiu
I run the latest RHEL 7.2 with the KAISER/KPTI patch, and boot failed.

...
[0.00] PM: Registered nosave memory: [mem 0x810-0x8ff]
[0.00] PM: Registered nosave memory: [mem 0x910-0xfff]
[0.00] PM: Registered nosave memory: [mem 0x1010-0x10ff]
[0.00] PM: Registered nosave memory: [mem 0x1110-0x17ff]
[0.00] PM: Registered nosave memory: [mem 0x1810-0x18ff]
[0.00] e820: [mem 0x9000-0xfed1bfff] available for PCI devices
[0.00] Booting paravirtualized kernel on bare hardware
[0.00] setup_percpu: NR_CPUS:5120 nr_cpumask_bits:1536 nr_cpu_ids:1536 
nr_node_ids:8
[0.00] PERCPU: max_distance=0x180ffe24 too large for vmalloc space 
0x1fff
[0.00] setup_percpu: auto allocator failed (-22), falling back to page 
size
[0.00] PERCPU: 32 4K pages/cpu @c900 s107200 r8192 d15680
[0.00] Built 8 zonelists in Zone order, mobility grouping on.  Total 
pages: 132001804
[0.00] Policy zone: Normal
iosdevname=0 8250.nr_uarts=8 efi=old_map rdloaddriver=usb_storage 
rdloaddriver=sd_mod udev.event-timeout=600 softlockup_panic=0 
rcupdate.rcu_cpu_stall_timeout=300
[0.00] Intel-IOMMU: enabled
[0.00] PID hash table entries: 4096 (order: 3, 32768 bytes)
[0.00] x86/fpu: xstate_offset[2]: 0240, xstate_sizes[2]: 0100
[0.00] xsave: enabled xstate_bv 0x7, cntxt size 0x340
[0.00] AGP: Checking aperture...
[0.00] AGP: No AGP bridge found
[0.00] Memory: 526901612k/26910638080k available (6528k kernel code, 
26374249692k absent, 9486776k reserved, 4302k data, 1676k init)
[0.00] SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=1536, Nodes=8
[0.00] x86/pti: Unmapping kernel while in userspace
[0.00] Hierarchical RCU implementation.
[0.00]  RCU restricting CPUs from NR_CPUS=5120 to nr_cpu_ids=1536.
[0.00]  Offload RCU callbacks from all CPUs
[0.00]  Offload RCU callbacks from CPUs: 0-1535.
[0.00] NR_IRQS:327936 nr_irqs:15976 0
[0.00] Console: colour dummy device 80x25
[0.00] console [tty0] enabled
[0.00] console [ttyS0] enabled
[0.00] allocated 2145910784 bytes of page_cgroup
[0.00] please try 'cgroup_disable=memory' option if you don't want 
memory cgroups
[0.00] Enabling automatic NUMA balancing. Configure with 
numa_balancing= or the kernel.numa_balancing sysctl
[0.00] tsc: Fast TSC calibration using PIT
[0.00] tsc: Detected 2799.999 MHz processor
[0.001803] Calibrating delay loop (skipped), value calculated using timer 
frequency.. 5599.99 BogoMIPS (lpj=279)
[0.012408] pid_max: default: 1572864 minimum: 12288
[0.017987] init_memory_mapping: [mem 0x5947f000-0x5b47efff]
[0.023701] init_memory_mapping: [mem 0x5b47f000-0x5b87efff]
[0.029369] init_memory_mapping: [mem 0x6d368000-0x6d3edfff]
[0.039130] BUG: unable to handle kernel paging request at 5b835f90
[0.046101] IP: [<5b835f90>] 0x5b835f8f
[0.050637] PGD 81f61067 PUD 190ffefff067 PMD 190ffeffd067 PTE 
5b835063
[0.057989] Oops: 0011 [#1] SMP 
[0.061241] Modules linked in:
[0.064304] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 
3.10.0-327.59.59.46.h42.x86_64 #1
[0.072280] Hardware name: Huawei FusionServer9032/IT91SMUB, BIOS BLXSV316 
11/14/2017
[0.080082] task: 8196e440 ti: 81958000 task.ti: 
81958000
[0.087539] RIP: 0010:[<5b835f90>]  [<5b835f90>] 0x5b835f8f
[0.094494] RSP: :8195be28  EFLAGS: 00010046
[0.099788] RAX: 80050033 RBX: 910fbc802000 RCX: 02d0
[0.106897] RDX: 0030 RSI: 02d0 RDI: 5b835f90
[0.114006] RBP: 8195bf38 R08: 0001 R09: 090fbc802000
[0.121116] R10: 88ffbcc07340 R11: 0001 R12: 0001
[0.128225] R13: 090fbc802000 R14: 02d0 R15: 0001
[0.135336] FS:  () GS:c900() 
knlGS:
[0.143398] CS:  0010 DS:  ES:  CR0: 80050033
[0.149124] CR2: 5b835f90 CR3: 01966000 CR4: 000606b0
[0.156234] DR0:  DR1:  DR2: 
[0.163344] DR3:  DR6: fffe0ff0 DR7: 0400
[0.170454] Call Trace:
[0.172899]  [] ? efi_call4+0x6c/0xf0
[0.178108]  [] ? native_flush_tlb_global+0x8e/0xc0
[0.184527]  [] ? set_memory_x+0x43/0x50
[0.189997]  [] ? efi_enter_virtual_mode+0x3bc/0x538
[0.196505]  [] start_kernel+0x39f/0x44f
[0.201972]  [] ? repair_env_string+0x5c/0x5c
[0.207872]  [] ? early_idt_handlers+0x120/0x120
[0.214030]  [] x86_64_start_reservations+0x2a/0x2c
[0.220449]  [] x86_64_start_kernel+0x152/0x175
[0.226521] Code:  Bad 

[RFC] mm: why vfree() do not free page table memory?

2017-12-29 Thread Xishi Qiu
When calling vfree(), it calls unmap_vmap_area() to clear page table,
but do not free the memory of page table, why? just for performance?

If a driver uses vmalloc() and vfree() frequently, we will lose much
page table memory, maybe oom later.

Thanks,
Xishi Qiu



[RFC] mm: why vfree() do not free page table memory?

2017-12-29 Thread Xishi Qiu
When calling vfree(), it calls unmap_vmap_area() to clear page table,
but do not free the memory of page table, why? just for performance?

If a driver uses vmalloc() and vfree() frequently, we will lose much
page table memory, maybe oom later.

Thanks,
Xishi Qiu



Re: [RFC] does ioremap() cause memory leak?

2017-12-22 Thread Xishi Qiu
On 2017/12/21 16:55, Xishi Qiu wrote:

> When we use iounmap() to free the mapping, it calls unmap_vmap_area() to 
> clear page table,
> but do not free the memory of page table, right?
> 
> So when use ioremap() to mapping another area(include the area before), it 
> may use
> large mapping(e.g. ioremap_pmd_enabled()), so the original page table 
> memory(e.g. pte memory)
> will be lost, it cause memory leak, right?
> 
> Thanks,
> Xishi Qiu
> 
> 
> .
> 

Hi, here is another question from lious.li...@hisilicon.com


As ARM-ARM said

“The architecture permits the caching of any translation table entry that has 
been returned from memory without a

fault, provided that the entry does not, itself, cause a Translation fault, an 
Address size fault, or an Access Flag fault.

This means that the entries that can be cached include:

• Entries in translation tables that point to subsequent tables to be used in 
that stage of translation.

• Stage 2 translation table entries used as part of a stage 1 translation table 
walk

• Stage 2 translation table entries used to translate the output address of the 
stage 1 translation.”

 

this means pgd, pud, pmd, pte all can be cached in TLB if itself have not a 
fault.

 

the scenario want page walk from:

4K:pgd0 --> pud0 --> pmd0 --> pte0 (4K)

To

2M:   pgd0 --> pud0 --> pte1(2M)

 

--> is connect next pagetable

-X-> is disconnect next pagetable

 

I have seen the ioremap and iounmap software flow for ARM64 in Kernel version 
4.14.

When I use ioremap to get a valid virtual address for a device address, Kernel 
would use ioremap_page_range to config the pagetable.

In ioremap_page_range function, if there is no pud, pmd or pte, Kernel would 
alloc one page for it. And then Kernel write the valid value into the address.

When I use iounmap to release this area, Kernel would write zero into the last 
level pagetable, then execute tlbi vaae1is to flush the tlb. But I haven`t seen 
Kernel would free the used page for pud, pmd or pte.

 

So there is a scene, I config Kernel to use 4K pagetable, and enable 
CONFIG_HAVE_ARCH_HUGE_VMAP. Then when I use ioremap, Kernel would config 1G, 2M 
or 4K pagetable according to the size.

First I use ioremap to ask for 4K size. Kernel returns a virtual address VA1. 
Then I use iounmap to free this area. Kernel would write zero into the VA1`s 
level3 pagetable. Then when Kernel wants to get VA1 back, Kernel would send a 
tlbi vaae1is.

 

the page become follow:

1. 4K:   pgd0 --> pud0 --> pmd0 --> pte0 (4K)

2. pte0 write 0

3. 4K:   pgd0 --> pud0 --> pmd0(still valid) -X-> pte0 (4K,not valid)

4. tlbi vaae1is

 

Second I use ioremap to ask for 2M size. Kernel would config a 2M page, then 
return the virtual address. And Kernel just allocates the same virtual address 
VA1 for me. But I see in the ioremap_page_range software flow, Kernel just 
write the valid value into the level2 pagetable address, and doesn`t release 
the allocated page for the previous level3 pagetable. And when Kernel modifies 
the level2 pagetable, it also doesn`t follow the ARM break-before-make flow.

 

the page change as follow:

1.pgd0 --> pud0 --> pmd0(still valid) -X-> pte0 (4K,not valid)

2.write pmd0(still valid) to block for 2M.

3.expect pgd0 --> pud0 --> pte1(2M)

 

but because pmd0(4K pmd, still valid) before becoming to pte(2M pte), maybe 
have a speculative access between 1 and 3.

the pgd0, pud0, pmd0 have no fault will be cached in TLB, the pte0 have fault 
so can't be cached, this speculative access will be dropped (no exception).

and the page change as:

1.pgd0 --> pud0 --> pmd0(still valid) -X-> pte0 (4K,not valid)

2.speculative access the same VA(pgd0 --> pud0 --> pmd0(still valid) -X-> pte0 
(4K,not valid)). cache the pgd0, pud0, pmd0.

3.write pmd0 from pmd to block(pte) for 2M.

4.the page walker maybe pgd0 --> pud0 --> pmd0(cached in TLB) --> 0x0 
(translation fault)

 

So I have two questions for this scene.

1. When the same virtual address allocated from ioremap, first is 4K size, 
second is 2M size, if Kernel would leak memory.

2. Kernel modifies the old invalid 4K pagetable to 2M, but doesn`t follow the 
ARM break-before-make flow, CPU maybe get the old invalid 4K pagetable 
information, then Kernel would panic.

 





Re: [RFC] does ioremap() cause memory leak?

2017-12-22 Thread Xishi Qiu
On 2017/12/21 16:55, Xishi Qiu wrote:

> When we use iounmap() to free the mapping, it calls unmap_vmap_area() to 
> clear page table,
> but do not free the memory of page table, right?
> 
> So when use ioremap() to mapping another area(include the area before), it 
> may use
> large mapping(e.g. ioremap_pmd_enabled()), so the original page table 
> memory(e.g. pte memory)
> will be lost, it cause memory leak, right?
> 
> Thanks,
> Xishi Qiu
> 
> 
> .
> 

Hi, here is another question from lious.li...@hisilicon.com


As ARM-ARM said

“The architecture permits the caching of any translation table entry that has 
been returned from memory without a

fault, provided that the entry does not, itself, cause a Translation fault, an 
Address size fault, or an Access Flag fault.

This means that the entries that can be cached include:

• Entries in translation tables that point to subsequent tables to be used in 
that stage of translation.

• Stage 2 translation table entries used as part of a stage 1 translation table 
walk

• Stage 2 translation table entries used to translate the output address of the 
stage 1 translation.”

 

this means pgd, pud, pmd, pte all can be cached in TLB if itself have not a 
fault.

 

the scenario want page walk from:

4K:pgd0 --> pud0 --> pmd0 --> pte0 (4K)

To

2M:   pgd0 --> pud0 --> pte1(2M)

 

--> is connect next pagetable

-X-> is disconnect next pagetable

 

I have seen the ioremap and iounmap software flow for ARM64 in Kernel version 
4.14.

When I use ioremap to get a valid virtual address for a device address, Kernel 
would use ioremap_page_range to config the pagetable.

In ioremap_page_range function, if there is no pud, pmd or pte, Kernel would 
alloc one page for it. And then Kernel write the valid value into the address.

When I use iounmap to release this area, Kernel would write zero into the last 
level pagetable, then execute tlbi vaae1is to flush the tlb. But I haven`t seen 
Kernel would free the used page for pud, pmd or pte.

 

So there is a scene, I config Kernel to use 4K pagetable, and enable 
CONFIG_HAVE_ARCH_HUGE_VMAP. Then when I use ioremap, Kernel would config 1G, 2M 
or 4K pagetable according to the size.

First I use ioremap to ask for 4K size. Kernel returns a virtual address VA1. 
Then I use iounmap to free this area. Kernel would write zero into the VA1`s 
level3 pagetable. Then when Kernel wants to get VA1 back, Kernel would send a 
tlbi vaae1is.

 

the page become follow:

1. 4K:   pgd0 --> pud0 --> pmd0 --> pte0 (4K)

2. pte0 write 0

3. 4K:   pgd0 --> pud0 --> pmd0(still valid) -X-> pte0 (4K,not valid)

4. tlbi vaae1is

 

Second I use ioremap to ask for 2M size. Kernel would config a 2M page, then 
return the virtual address. And Kernel just allocates the same virtual address 
VA1 for me. But I see in the ioremap_page_range software flow, Kernel just 
write the valid value into the level2 pagetable address, and doesn`t release 
the allocated page for the previous level3 pagetable. And when Kernel modifies 
the level2 pagetable, it also doesn`t follow the ARM break-before-make flow.

 

the page change as follow:

1.pgd0 --> pud0 --> pmd0(still valid) -X-> pte0 (4K,not valid)

2.write pmd0(still valid) to block for 2M.

3.expect pgd0 --> pud0 --> pte1(2M)

 

but because pmd0(4K pmd, still valid) before becoming to pte(2M pte), maybe 
have a speculative access between 1 and 3.

the pgd0, pud0, pmd0 have no fault will be cached in TLB, the pte0 have fault 
so can't be cached, this speculative access will be dropped (no exception).

and the page change as:

1.pgd0 --> pud0 --> pmd0(still valid) -X-> pte0 (4K,not valid)

2.speculative access the same VA(pgd0 --> pud0 --> pmd0(still valid) -X-> pte0 
(4K,not valid)). cache the pgd0, pud0, pmd0.

3.write pmd0 from pmd to block(pte) for 2M.

4.the page walker maybe pgd0 --> pud0 --> pmd0(cached in TLB) --> 0x0 
(translation fault)

 

So I have two questions for this scene.

1. When the same virtual address allocated from ioremap, first is 4K size, 
second is 2M size, if Kernel would leak memory.

2. Kernel modifies the old invalid 4K pagetable to 2M, but doesn`t follow the 
ARM break-before-make flow, CPU maybe get the old invalid 4K pagetable 
information, then Kernel would panic.

 





[RFC] does ioremap() cause memory leak?

2017-12-21 Thread Xishi Qiu
When we use iounmap() to free the mapping, it calls unmap_vmap_area() to clear 
page table,
but do not free the memory of page table, right?

So when use ioremap() to mapping another area(include the area before), it may 
use
large mapping(e.g. ioremap_pmd_enabled()), so the original page table 
memory(e.g. pte memory)
will be lost, it cause memory leak, right?

Thanks,
Xishi Qiu



[RFC] does ioremap() cause memory leak?

2017-12-21 Thread Xishi Qiu
When we use iounmap() to free the mapping, it calls unmap_vmap_area() to clear 
page table,
but do not free the memory of page table, right?

So when use ioremap() to mapping another area(include the area before), it may 
use
large mapping(e.g. ioremap_pmd_enabled()), so the original page table 
memory(e.g. pte memory)
will be lost, it cause memory leak, right?

Thanks,
Xishi Qiu



Re: [PATCH 1/2] mm: drop migrate type checks from has_unmovable_pages

2017-10-20 Thread Xishi Qiu
On 2017/10/20 10:13, Joonsoo Kim wrote:

> On Thu, Oct 19, 2017 at 02:21:18PM +0200, Michal Hocko wrote:
>> On Thu 19-10-17 10:20:41, Michal Hocko wrote:
>>> On Thu 19-10-17 16:33:56, Joonsoo Kim wrote:
>>>> On Thu, Oct 19, 2017 at 09:15:03AM +0200, Michal Hocko wrote:
>>>>> On Thu 19-10-17 11:51:11, Joonsoo Kim wrote:
>>> [...]
>>>>>> Hello,
>>>>>>
>>>>>> This patch will break the CMA user. As you mentioned, CMA allocation
>>>>>> itself isn't migrateable. So, after a single page is allocated through
>>>>>> CMA allocation, has_unmovable_pages() will return true for this
>>>>>> pageblock. Then, further CMA allocation request to this pageblock will
>>>>>> fail because it requires isolating the pageblock.
>>>>>
>>>>> Hmm, does this mean that the CMA allocation path depends on
>>>>> has_unmovable_pages to return false here even though the memory is not
>>>>> movable? This sounds really strange to me and kind of abuse of this
>>>>
>>>> Your understanding is correct. Perhaps, abuse or wrong function name.
>>>>
>>>>> function. Which path is that? Can we do the migrate type test theres?
>>>>
>>>> alloc_contig_range() -> start_isolate_page_range() ->
>>>> set_migratetype_isolate() -> has_unmovable_pages()
>>>
>>> I see. It seems that the CMA and memory hotplug have a very different
>>> view on what should happen during isolation.
>>>  
>>>> We can add one argument, 'XXX' to set_migratetype_isolate() and change
>>>> it to check migrate type rather than has_unmovable_pages() if 'XXX' is
>>>> specified.
>>>
>>> Can we use the migratetype argument and do the special thing for
>>> MIGRATE_CMA? Like the following diff?
>>
>> And with the full changelog.
>> ---
>> >From 8cbd811d741f5dd93d1b21bb3ef94482a4d0bd32 Mon Sep 17 00:00:00 2001
>> From: Michal Hocko <mho...@suse.com>
>> Date: Thu, 19 Oct 2017 14:14:02 +0200
>> Subject: [PATCH] mm: distinguish CMA and MOVABLE isolation in
>>  has_unmovable_pages
>>
>> Joonsoo has noticed that "mm: drop migrate type checks from
>> has_unmovable_pages" would break CMA allocator because it relies on
>> has_unmovable_pages returning false even for CMA pageblocks which in
>> fact don't have to be movable:
>> alloc_contig_range
>>   start_isolate_page_range
>> set_migratetype_isolate
>>   has_unmovable_pages
>>
>> This is a result of the code sharing between CMA and memory hotplug
>> while each one has a different idea of what has_unmovable_pages should
>> return. This is unfortunate but fixing it properly would require a lot
>> of code duplication.
>>
>> Fix the issue by introducing the requested migrate type argument
>> and special case MIGRATE_CMA case where CMA page blocks are handled
>> properly. This will work for memory hotplug because it requires
>> MIGRATE_MOVABLE.
> 
> Unfortunately, alloc_contig_range() can be called with
> MIGRATE_MOVABLE so this patch cannot perfectly fix the problem.
> 
> I did a more thinking and found that it's strange to check if there is
> unmovable page in the pageblock during the set_migratetype_isolate().
> set_migratetype_isolate() should be just for setting the migratetype
> of the pageblock. Checking other things should be done by another
> place, for example, before calling the start_isolate_page_range() in
> __offline_pages().
> 
> Thanks.
> 

Hi Joonsoo,

How about add a flag to skip or not has_unmovable_pages() in 
set_migratetype_isolate()?
Something like the skip_hwpoisoned_pages.

Thanks,
Xishi Qiu

> 
> .
> 





Re: [PATCH 1/2] mm: drop migrate type checks from has_unmovable_pages

2017-10-20 Thread Xishi Qiu
On 2017/10/20 10:13, Joonsoo Kim wrote:

> On Thu, Oct 19, 2017 at 02:21:18PM +0200, Michal Hocko wrote:
>> On Thu 19-10-17 10:20:41, Michal Hocko wrote:
>>> On Thu 19-10-17 16:33:56, Joonsoo Kim wrote:
>>>> On Thu, Oct 19, 2017 at 09:15:03AM +0200, Michal Hocko wrote:
>>>>> On Thu 19-10-17 11:51:11, Joonsoo Kim wrote:
>>> [...]
>>>>>> Hello,
>>>>>>
>>>>>> This patch will break the CMA user. As you mentioned, CMA allocation
>>>>>> itself isn't migrateable. So, after a single page is allocated through
>>>>>> CMA allocation, has_unmovable_pages() will return true for this
>>>>>> pageblock. Then, further CMA allocation request to this pageblock will
>>>>>> fail because it requires isolating the pageblock.
>>>>>
>>>>> Hmm, does this mean that the CMA allocation path depends on
>>>>> has_unmovable_pages to return false here even though the memory is not
>>>>> movable? This sounds really strange to me and kind of abuse of this
>>>>
>>>> Your understanding is correct. Perhaps, abuse or wrong function name.
>>>>
>>>>> function. Which path is that? Can we do the migrate type test theres?
>>>>
>>>> alloc_contig_range() -> start_isolate_page_range() ->
>>>> set_migratetype_isolate() -> has_unmovable_pages()
>>>
>>> I see. It seems that the CMA and memory hotplug have a very different
>>> view on what should happen during isolation.
>>>  
>>>> We can add one argument, 'XXX' to set_migratetype_isolate() and change
>>>> it to check migrate type rather than has_unmovable_pages() if 'XXX' is
>>>> specified.
>>>
>>> Can we use the migratetype argument and do the special thing for
>>> MIGRATE_CMA? Like the following diff?
>>
>> And with the full changelog.
>> ---
>> >From 8cbd811d741f5dd93d1b21bb3ef94482a4d0bd32 Mon Sep 17 00:00:00 2001
>> From: Michal Hocko 
>> Date: Thu, 19 Oct 2017 14:14:02 +0200
>> Subject: [PATCH] mm: distinguish CMA and MOVABLE isolation in
>>  has_unmovable_pages
>>
>> Joonsoo has noticed that "mm: drop migrate type checks from
>> has_unmovable_pages" would break CMA allocator because it relies on
>> has_unmovable_pages returning false even for CMA pageblocks which in
>> fact don't have to be movable:
>> alloc_contig_range
>>   start_isolate_page_range
>> set_migratetype_isolate
>>   has_unmovable_pages
>>
>> This is a result of the code sharing between CMA and memory hotplug
>> while each one has a different idea of what has_unmovable_pages should
>> return. This is unfortunate but fixing it properly would require a lot
>> of code duplication.
>>
>> Fix the issue by introducing the requested migrate type argument
>> and special case MIGRATE_CMA case where CMA page blocks are handled
>> properly. This will work for memory hotplug because it requires
>> MIGRATE_MOVABLE.
> 
> Unfortunately, alloc_contig_range() can be called with
> MIGRATE_MOVABLE so this patch cannot perfectly fix the problem.
> 
> I did a more thinking and found that it's strange to check if there is
> unmovable page in the pageblock during the set_migratetype_isolate().
> set_migratetype_isolate() should be just for setting the migratetype
> of the pageblock. Checking other things should be done by another
> place, for example, before calling the start_isolate_page_range() in
> __offline_pages().
> 
> Thanks.
> 

Hi Joonsoo,

How about add a flag to skip or not has_unmovable_pages() in 
set_migratetype_isolate()?
Something like the skip_hwpoisoned_pages.

Thanks,
Xishi Qiu

> 
> .
> 





Re: [RFC] a question about mlockall() and mprotect()

2017-10-09 Thread Xishi Qiu
On 2017/10/10 2:26, Michal Hocko wrote:

> On Wed 27-09-17 13:51:09, Xishi Qiu wrote:
>> On 2017/9/26 19:00, Michal Hocko wrote:
>>
>>> On Tue 26-09-17 11:45:16, Vlastimil Babka wrote:
>>>> On 09/26/2017 11:22 AM, Xishi Qiu wrote:
>>>>> On 2017/9/26 17:13, Xishi Qiu wrote:
>>>>>>> This is still very fuzzy. What are you actually trying to achieve?
>>>>>>
>>>>>> I don't expect page fault any more after mlock.
>>>>>>
>>>>>
>>>>> Our apps is some thing like RT, and page-fault maybe cause a lot of time,
>>>>> e.g. lock, mem reclaim ..., so I use mlock and don't want page fault
>>>>> any more.
>>>>
>>>> Why does your app then have restricted mprotect when calling mlockall()
>>>> and only later adjusts the mprotect?
>>>
>>> Ahh, OK I see what is going on. So you have PROT_NONE vma at the time
>>> mlockall and then later mprotect it something else and want to fault all
>>> that memory at the mprotect time?
>>>
>>> So basically to do
>>> ---
>>> diff --git a/mm/mprotect.c b/mm/mprotect.c
>>> index 6d3e2f082290..b665b5d1c544 100644
>>> --- a/mm/mprotect.c
>>> +++ b/mm/mprotect.c
>>> @@ -369,7 +369,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct 
>>> vm_area_struct **pprev,
>>>  * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
>>>  * fault on access.
>>>  */
>>> -   if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
>>> +   if ((oldflags & (VM_WRITE | VM_LOCKED)) == VM_LOCKED &&
>>> (newflags & VM_WRITE)) {
>>> populate_vma_page_range(vma, start, end, NULL);
>>> }
>>>
>>
>> Hi Michal,
>>
>> My kernel is v3.10, and I missed this code, thank you reminding me.
> 
> I guess I didn't get your answer. Does the above diff resolves your
> problem?

Hi Michal,

This upstream patch 36f881883c57941bb32d25cea6524f9612ab5a2c has already
resolve my problem, thank you for your attention.

Thanks,
Xishi Qiu



Re: [RFC] a question about mlockall() and mprotect()

2017-10-09 Thread Xishi Qiu
On 2017/10/10 2:26, Michal Hocko wrote:

> On Wed 27-09-17 13:51:09, Xishi Qiu wrote:
>> On 2017/9/26 19:00, Michal Hocko wrote:
>>
>>> On Tue 26-09-17 11:45:16, Vlastimil Babka wrote:
>>>> On 09/26/2017 11:22 AM, Xishi Qiu wrote:
>>>>> On 2017/9/26 17:13, Xishi Qiu wrote:
>>>>>>> This is still very fuzzy. What are you actually trying to achieve?
>>>>>>
>>>>>> I don't expect page fault any more after mlock.
>>>>>>
>>>>>
>>>>> Our apps is some thing like RT, and page-fault maybe cause a lot of time,
>>>>> e.g. lock, mem reclaim ..., so I use mlock and don't want page fault
>>>>> any more.
>>>>
>>>> Why does your app then have restricted mprotect when calling mlockall()
>>>> and only later adjusts the mprotect?
>>>
>>> Ahh, OK I see what is going on. So you have PROT_NONE vma at the time
>>> mlockall and then later mprotect it something else and want to fault all
>>> that memory at the mprotect time?
>>>
>>> So basically to do
>>> ---
>>> diff --git a/mm/mprotect.c b/mm/mprotect.c
>>> index 6d3e2f082290..b665b5d1c544 100644
>>> --- a/mm/mprotect.c
>>> +++ b/mm/mprotect.c
>>> @@ -369,7 +369,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct 
>>> vm_area_struct **pprev,
>>>  * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
>>>  * fault on access.
>>>  */
>>> -   if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
>>> +   if ((oldflags & (VM_WRITE | VM_LOCKED)) == VM_LOCKED &&
>>> (newflags & VM_WRITE)) {
>>> populate_vma_page_range(vma, start, end, NULL);
>>> }
>>>
>>
>> Hi Michal,
>>
>> My kernel is v3.10, and I missed this code, thank you reminding me.
> 
> I guess I didn't get your answer. Does the above diff resolves your
> problem?

Hi Michal,

This upstream patch 36f881883c57941bb32d25cea6524f9612ab5a2c has already
resolve my problem, thank you for your attention.

Thanks,
Xishi Qiu



Re: [RFC] a question about mlockall() and mprotect()

2017-09-26 Thread Xishi Qiu
On 2017/9/26 19:00, Michal Hocko wrote:

> On Tue 26-09-17 11:45:16, Vlastimil Babka wrote:
>> On 09/26/2017 11:22 AM, Xishi Qiu wrote:
>>> On 2017/9/26 17:13, Xishi Qiu wrote:
>>>>> This is still very fuzzy. What are you actually trying to achieve?
>>>>
>>>> I don't expect page fault any more after mlock.
>>>>
>>>
>>> Our apps is some thing like RT, and page-fault maybe cause a lot of time,
>>> e.g. lock, mem reclaim ..., so I use mlock and don't want page fault
>>> any more.
>>
>> Why does your app then have restricted mprotect when calling mlockall()
>> and only later adjusts the mprotect?
> 
> Ahh, OK I see what is going on. So you have PROT_NONE vma at the time
> mlockall and then later mprotect it something else and want to fault all
> that memory at the mprotect time?
> 
> So basically to do
> ---
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 6d3e2f082290..b665b5d1c544 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -369,7 +369,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct 
> vm_area_struct **pprev,
>* Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
>* fault on access.
>*/
> - if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
> + if ((oldflags & (VM_WRITE | VM_LOCKED)) == VM_LOCKED &&
>           (newflags & VM_WRITE)) {
>   populate_vma_page_range(vma, start, end, NULL);
>   }
> 

Hi Michal,

My kernel is v3.10, and I missed this code, thank you reminding me.

Thanks,
Xishi Qiu



Re: [RFC] a question about mlockall() and mprotect()

2017-09-26 Thread Xishi Qiu
On 2017/9/26 19:00, Michal Hocko wrote:

> On Tue 26-09-17 11:45:16, Vlastimil Babka wrote:
>> On 09/26/2017 11:22 AM, Xishi Qiu wrote:
>>> On 2017/9/26 17:13, Xishi Qiu wrote:
>>>>> This is still very fuzzy. What are you actually trying to achieve?
>>>>
>>>> I don't expect page fault any more after mlock.
>>>>
>>>
>>> Our apps is some thing like RT, and page-fault maybe cause a lot of time,
>>> e.g. lock, mem reclaim ..., so I use mlock and don't want page fault
>>> any more.
>>
>> Why does your app then have restricted mprotect when calling mlockall()
>> and only later adjusts the mprotect?
> 
> Ahh, OK I see what is going on. So you have PROT_NONE vma at the time
> mlockall and then later mprotect it something else and want to fault all
> that memory at the mprotect time?
> 
> So basically to do
> ---
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 6d3e2f082290..b665b5d1c544 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -369,7 +369,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct 
> vm_area_struct **pprev,
>* Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
>* fault on access.
>*/
> - if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
> + if ((oldflags & (VM_WRITE | VM_LOCKED)) == VM_LOCKED &&
>           (newflags & VM_WRITE)) {
>   populate_vma_page_range(vma, start, end, NULL);
>   }
> 

Hi Michal,

My kernel is v3.10, and I missed this code, thank you reminding me.

Thanks,
Xishi Qiu



Re: [RFC] a question about mlockall() and mprotect()

2017-09-26 Thread Xishi Qiu
On 2017/9/26 17:13, Xishi Qiu wrote:

> On 2017/9/26 17:02, Michal Hocko wrote:
> 
>> On Tue 26-09-17 16:39:56, Xishi Qiu wrote:
>>> On 2017/9/26 16:17, Michal Hocko wrote:
>>>
>>>> On Tue 26-09-17 15:56:55, Xishi Qiu wrote:
>>>>> When we call mlockall(), we will add VM_LOCKED to the vma,
>>>>> if the vma prot is ---p,
>>>>
>>>> not sure what you mean here. apply_mlockall_flags will set the flag on
>>>> all vmas except for special mappings (mlock_fixup). This phase will
>>>> cause that memory reclaim will not free already mapped pages in those
>>>> vmas (see page_check_references and the lazy mlock pages move to
>>>> unevictable LRUs).
>>>>
>>>>> then mm_populate -> get_user_pages will not alloc memory.
>>>>
>>>> mm_populate all the vmas with pages. Well there are certainly some
>>>> constrains - e.g. memory cgroup hard limit might be hit and so the
>>>> faulting might fail.
>>>>
>>>>> I find it said "ignore errors" in mm_populate()
>>>>> static inline void mm_populate(unsigned long addr, unsigned long len)
>>>>> {
>>>>>   /* Ignore errors */
>>>>>   (void) __mm_populate(addr, len, 1);
>>>>> }
>>>>
>>>> But we do not report the failure because any failure past
>>>> apply_mlockall_flags would be tricky to handle. We have already dropped
>>>> the mmap_sem lock so some other address space operations could have
>>>> interfered.
>>>>  
>>>>> And later we call mprotect() to change the prot, then it is
>>>>> still not alloc memory for the mlocked vma.
>>>>>
>>>>> My question is that, shall we alloc memory if the prot changed,
>>>>> and who(kernel, glibc, user) should alloc the memory?
>>>>
>>>> I do not understand your question but if you are asking how to get pages
>>>> to map your vmas then touching that area will fault the memory in.
>>>
>>> Hi Michal,
>>>
>>> syscall mlockall() will first apply the VM_LOCKED to the vma, then
>>> call mm_populate() to map the vmas.
>>>
>>> mm_populate
>>> populate_vma_page_range
>>> __get_user_pages
>>> check_vma_flags
>>> And the above path maybe return -EFAULT in some case, right?
>>>
>>> If we call mprotect() to change the prot of vma, just let
>>> check_vma_flags() return 0, then we will get the mlocked pages
>>> in following page-fault, right?
>>
>> Any future page fault to the existing vma will result in the mlocked
>> page. That is what VM_LOCKED guarantess.
>>
>>> My question is that, shall we map the vmas immediately when
>>> the prot changed? If we should map it immediately, who(kernel, glibc, user)
>>> do this step?
>>
>> This is still very fuzzy. What are you actually trying to achieve?
> 
> I don't expect page fault any more after mlock.
> 

Our apps is some thing like RT, and page-fault maybe cause a lot of time,
e.g. lock, mem reclaim ..., so I use mlock and don't want page fault
any more.

Thanks,
Xishi Qiu

> 
> .
> 





Re: [RFC] a question about mlockall() and mprotect()

2017-09-26 Thread Xishi Qiu
On 2017/9/26 17:13, Xishi Qiu wrote:

> On 2017/9/26 17:02, Michal Hocko wrote:
> 
>> On Tue 26-09-17 16:39:56, Xishi Qiu wrote:
>>> On 2017/9/26 16:17, Michal Hocko wrote:
>>>
>>>> On Tue 26-09-17 15:56:55, Xishi Qiu wrote:
>>>>> When we call mlockall(), we will add VM_LOCKED to the vma,
>>>>> if the vma prot is ---p,
>>>>
>>>> not sure what you mean here. apply_mlockall_flags will set the flag on
>>>> all vmas except for special mappings (mlock_fixup). This phase will
>>>> cause that memory reclaim will not free already mapped pages in those
>>>> vmas (see page_check_references and the lazy mlock pages move to
>>>> unevictable LRUs).
>>>>
>>>>> then mm_populate -> get_user_pages will not alloc memory.
>>>>
>>>> mm_populate all the vmas with pages. Well there are certainly some
>>>> constrains - e.g. memory cgroup hard limit might be hit and so the
>>>> faulting might fail.
>>>>
>>>>> I find it said "ignore errors" in mm_populate()
>>>>> static inline void mm_populate(unsigned long addr, unsigned long len)
>>>>> {
>>>>>   /* Ignore errors */
>>>>>   (void) __mm_populate(addr, len, 1);
>>>>> }
>>>>
>>>> But we do not report the failure because any failure past
>>>> apply_mlockall_flags would be tricky to handle. We have already dropped
>>>> the mmap_sem lock so some other address space operations could have
>>>> interfered.
>>>>  
>>>>> And later we call mprotect() to change the prot, then it is
>>>>> still not alloc memory for the mlocked vma.
>>>>>
>>>>> My question is that, shall we alloc memory if the prot changed,
>>>>> and who(kernel, glibc, user) should alloc the memory?
>>>>
>>>> I do not understand your question but if you are asking how to get pages
>>>> to map your vmas then touching that area will fault the memory in.
>>>
>>> Hi Michal,
>>>
>>> syscall mlockall() will first apply the VM_LOCKED to the vma, then
>>> call mm_populate() to map the vmas.
>>>
>>> mm_populate
>>> populate_vma_page_range
>>> __get_user_pages
>>> check_vma_flags
>>> And the above path maybe return -EFAULT in some case, right?
>>>
>>> If we call mprotect() to change the prot of vma, just let
>>> check_vma_flags() return 0, then we will get the mlocked pages
>>> in following page-fault, right?
>>
>> Any future page fault to the existing vma will result in the mlocked
>> page. That is what VM_LOCKED guarantess.
>>
>>> My question is that, shall we map the vmas immediately when
>>> the prot changed? If we should map it immediately, who(kernel, glibc, user)
>>> do this step?
>>
>> This is still very fuzzy. What are you actually trying to achieve?
> 
> I don't expect page fault any more after mlock.
> 

Our app is something like an RT application, and a page fault may cost a lot of time
(e.g. locking, memory reclaim ...), so I use mlock because I don't want any page
faults any more.

Thanks,
Xishi Qiu

> 
> .
> 





Re: [RFC] a question about mlockall() and mprotect()

2017-09-26 Thread Xishi Qiu
On 2017/9/26 17:02, Michal Hocko wrote:

> On Tue 26-09-17 16:39:56, Xishi Qiu wrote:
>> On 2017/9/26 16:17, Michal Hocko wrote:
>>
>>> On Tue 26-09-17 15:56:55, Xishi Qiu wrote:
>>>> When we call mlockall(), we will add VM_LOCKED to the vma,
>>>> if the vma prot is ---p,
>>>
>>> not sure what you mean here. apply_mlockall_flags will set the flag on
>>> all vmas except for special mappings (mlock_fixup). This phase will
>>> cause that memory reclaim will not free already mapped pages in those
>>> vmas (see page_check_references and the lazy mlock pages move to
>>> unevictable LRUs).
>>>
>>>> then mm_populate -> get_user_pages will not alloc memory.
>>>
>>> mm_populate all the vmas with pages. Well there are certainly some
>>> constrains - e.g. memory cgroup hard limit might be hit and so the
>>> faulting might fail.
>>>
>>>> I find it said "ignore errors" in mm_populate()
>>>> static inline void mm_populate(unsigned long addr, unsigned long len)
>>>> {
>>>>/* Ignore errors */
>>>>(void) __mm_populate(addr, len, 1);
>>>> }
>>>
>>> But we do not report the failure because any failure past
>>> apply_mlockall_flags would be tricky to handle. We have already dropped
>>> the mmap_sem lock so some other address space operations could have
>>> interfered.
>>>  
>>>> And later we call mprotect() to change the prot, then it is
>>>> still not alloc memory for the mlocked vma.
>>>>
>>>> My question is that, shall we alloc memory if the prot changed,
>>>> and who(kernel, glibc, user) should alloc the memory?
>>>
>>> I do not understand your question but if you are asking how to get pages
>>> to map your vmas then touching that area will fault the memory in.
>>
>> Hi Michal,
>>
>> syscall mlockall() will first apply the VM_LOCKED to the vma, then
>> call mm_populate() to map the vmas.
>>
>> mm_populate
>>  populate_vma_page_range
>>  __get_user_pages
>>  check_vma_flags
>> And the above path maybe return -EFAULT in some case, right?
>>
>> If we call mprotect() to change the prot of vma, just let
>> check_vma_flags() return 0, then we will get the mlocked pages
>> in following page-fault, right?
> 
> Any future page fault to the existing vma will result in the mlocked
> page. That is what VM_LOCKED guarantess.
> 
>> My question is that, shall we map the vmas immediately when
>> the prot changed? If we should map it immediately, who(kernel, glibc, user)
>> do this step?
> 
> This is still very fuzzy. What are you actually trying to achieve?

I don't expect page fault any more after mlock.



Re: [RFC] a question about mlockall() and mprotect()

2017-09-26 Thread Xishi Qiu
On 2017/9/26 17:02, Michal Hocko wrote:

> On Tue 26-09-17 16:39:56, Xishi Qiu wrote:
>> On 2017/9/26 16:17, Michal Hocko wrote:
>>
>>> On Tue 26-09-17 15:56:55, Xishi Qiu wrote:
>>>> When we call mlockall(), we will add VM_LOCKED to the vma,
>>>> if the vma prot is ---p,
>>>
>>> not sure what you mean here. apply_mlockall_flags will set the flag on
>>> all vmas except for special mappings (mlock_fixup). This phase will
>>> cause that memory reclaim will not free already mapped pages in those
>>> vmas (see page_check_references and the lazy mlock pages move to
>>> unevictable LRUs).
>>>
>>>> then mm_populate -> get_user_pages will not alloc memory.
>>>
>>> mm_populate all the vmas with pages. Well there are certainly some
>>> constrains - e.g. memory cgroup hard limit might be hit and so the
>>> faulting might fail.
>>>
>>>> I find it said "ignore errors" in mm_populate()
>>>> static inline void mm_populate(unsigned long addr, unsigned long len)
>>>> {
>>>>/* Ignore errors */
>>>>(void) __mm_populate(addr, len, 1);
>>>> }
>>>
>>> But we do not report the failure because any failure past
>>> apply_mlockall_flags would be tricky to handle. We have already dropped
>>> the mmap_sem lock so some other address space operations could have
>>> interfered.
>>>  
>>>> And later we call mprotect() to change the prot, then it is
>>>> still not alloc memory for the mlocked vma.
>>>>
>>>> My question is that, shall we alloc memory if the prot changed,
>>>> and who(kernel, glibc, user) should alloc the memory?
>>>
>>> I do not understand your question but if you are asking how to get pages
>>> to map your vmas then touching that area will fault the memory in.
>>
>> Hi Michal,
>>
>> syscall mlockall() will first apply the VM_LOCKED to the vma, then
>> call mm_populate() to map the vmas.
>>
>> mm_populate
>>  populate_vma_page_range
>>  __get_user_pages
>>  check_vma_flags
>> And the above path maybe return -EFAULT in some case, right?
>>
>> If we call mprotect() to change the prot of vma, just let
>> check_vma_flags() return 0, then we will get the mlocked pages
>> in following page-fault, right?
> 
> Any future page fault to the existing vma will result in the mlocked
> page. That is what VM_LOCKED guarantess.
> 
>> My question is that, shall we map the vmas immediately when
>> the prot changed? If we should map it immediately, who(kernel, glibc, user)
>> do this step?
> 
> This is still very fuzzy. What are you actually trying to achieve?

I don't expect page fault any more after mlock.



Re: [RFC] a question about mlockall() and mprotect()

2017-09-26 Thread Xishi Qiu
On 2017/9/26 16:17, Michal Hocko wrote:

> On Tue 26-09-17 15:56:55, Xishi Qiu wrote:
>> When we call mlockall(), we will add VM_LOCKED to the vma,
>> if the vma prot is ---p,
> 
> not sure what you mean here. apply_mlockall_flags will set the flag on
> all vmas except for special mappings (mlock_fixup). This phase will
> cause that memory reclaim will not free already mapped pages in those
> vmas (see page_check_references and the lazy mlock pages move to
> unevictable LRUs).
> 
>> then mm_populate -> get_user_pages will not alloc memory.
> 
> mm_populate all the vmas with pages. Well there are certainly some
> constrains - e.g. memory cgroup hard limit might be hit and so the
> faulting might fail.
> 
>> I find it said "ignore errors" in mm_populate()
>> static inline void mm_populate(unsigned long addr, unsigned long len)
>> {
>>  /* Ignore errors */
>>  (void) __mm_populate(addr, len, 1);
>> }
> 
> But we do not report the failure because any failure past
> apply_mlockall_flags would be tricky to handle. We have already dropped
> the mmap_sem lock so some other address space operations could have
> interfered.
>  
>> And later we call mprotect() to change the prot, then it is
>> still not alloc memory for the mlocked vma.
>>
>> My question is that, shall we alloc memory if the prot changed,
>> and who(kernel, glibc, user) should alloc the memory?
> 
> I do not understand your question but if you are asking how to get pages
> to map your vmas then touching that area will fault the memory in.

Hi Michal,

syscall mlockall() will first apply the VM_LOCKED to the vma, then
call mm_populate() to map the vmas.

mm_populate
populate_vma_page_range
__get_user_pages
check_vma_flags
>>> And the above path may return -EFAULT in some cases, right?

If we call mprotect() to change the prot of vma, just let
check_vma_flags() return 0, then we will get the mlocked pages
in following page-fault, right?

>>> My question is: shall we map the vmas immediately when
>>> the prot is changed? If we should map them immediately, who (kernel, glibc, user)
>>> should do this step?

Thanks,
Xishi Qiu



Re: [RFC] a question about mlockall() and mprotect()

2017-09-26 Thread Xishi Qiu
On 2017/9/26 16:17, Michal Hocko wrote:

> On Tue 26-09-17 15:56:55, Xishi Qiu wrote:
>> When we call mlockall(), we will add VM_LOCKED to the vma,
>> if the vma prot is ---p,
> 
> not sure what you mean here. apply_mlockall_flags will set the flag on
> all vmas except for special mappings (mlock_fixup). This phase will
> cause that memory reclaim will not free already mapped pages in those
> vmas (see page_check_references and the lazy mlock pages move to
> unevictable LRUs).
> 
>> then mm_populate -> get_user_pages will not alloc memory.
> 
> mm_populate all the vmas with pages. Well there are certainly some
> constrains - e.g. memory cgroup hard limit might be hit and so the
> faulting might fail.
> 
>> I find it said "ignore errors" in mm_populate()
>> static inline void mm_populate(unsigned long addr, unsigned long len)
>> {
>>  /* Ignore errors */
>>  (void) __mm_populate(addr, len, 1);
>> }
> 
> But we do not report the failure because any failure past
> apply_mlockall_flags would be tricky to handle. We have already dropped
> the mmap_sem lock so some other address space operations could have
> interfered.
>  
>> And later we call mprotect() to change the prot, then it is
>> still not alloc memory for the mlocked vma.
>>
>> My question is that, shall we alloc memory if the prot changed,
>> and who(kernel, glibc, user) should alloc the memory?
> 
> I do not understand your question but if you are asking how to get pages
> to map your vmas then touching that area will fault the memory in.

Hi Michal,

syscall mlockall() will first apply the VM_LOCKED to the vma, then
call mm_populate() to map the vmas.

mm_populate
populate_vma_page_range
__get_user_pages
check_vma_flags
And the above path may return -EFAULT in some cases, right?

If we call mprotect() to change the prot of vma, just let
check_vma_flags() return 0, then we will get the mlocked pages
in following page-fault, right?

My question is: shall we map the vmas immediately when
the prot is changed? If we should map them immediately, who (kernel, glibc, user)
should do this step?

Thanks,
Xishi Qiu



[RFC] a question about mlockall() and mprotect()

2017-09-26 Thread Xishi Qiu
When we call mlockall(), we will add VM_LOCKED to the vma,
if the vma prot is ---p, then mm_populate -> get_user_pages
will not alloc memory.

I find it said "ignore errors" in mm_populate()
static inline void mm_populate(unsigned long addr, unsigned long len)
{
/* Ignore errors */
(void) __mm_populate(addr, len, 1);
}

And later we call mprotect() to change the prot, then it is
still not alloc memory for the mlocked vma.

My question is that, shall we alloc memory if the prot changed,
and who(kernel, glibc, user) should alloc the memory?

Thanks,
Xishi Qiu



[RFC] a question about mlockall() and mprotect()

2017-09-26 Thread Xishi Qiu
When we call mlockall(), we will add VM_LOCKED to the vma,
if the vma prot is ---p, then mm_populate -> get_user_pages
will not alloc memory.

I find it said "ignore errors" in mm_populate()
static inline void mm_populate(unsigned long addr, unsigned long len)
{
/* Ignore errors */
(void) __mm_populate(addr, len, 1);
}

And later we call mprotect() to change the prot, then it is
still not alloc memory for the mlocked vma.

My question is that, shall we alloc memory if the prot changed,
and who(kernel, glibc, user) should alloc the memory?

Thanks,
Xishi Qiu



[RFC] a question about stack size form /proc/pid/task/child pid/limits

2017-09-05 Thread Xishi Qiu
Hi, I find that if I use a user-defined stack size to create a child thread,
the max stack size from /proc/pid/task/<child pid>/limits still
shows "Max stack size          8388608"; it doesn't update to
the user-defined size. Is it a problem?

Here is the test code:
...
pthread_attr_t attr;
ret = pthread_attr_init(&attr);
if (ret)
printf("error\n");
ret = pthread_attr_setstacksize(&attr, 83886080);
if (ret)
printf("error\n");
ret = pthread_create(&thread_1[i], &attr, (void *)thread_alloc,
NULL);
...

I use strace to track the app, and it shows that glibc calls mmap to
alloc the child thread's stack. So should glibc call setrlimit to
update the stack limit too?

And glibc will only insert a guard at the start of the stack vma,
so the stack vma maybe merged to another vma at the end, right?

...
mmap(NULL, 83890176, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, 
-1, 0) = 0x7fca1d6a6000
mprotect(0x7fca1d6a6000, 4096, PROT_NONE) = 0
clone(child_stack=0x7fca226a5fb0, 
flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID,
 parent_tidptr=0x7fca226a69d0, tls=0x7fca226a6700, child_tidptr=0x7fca226a69d0) 
= 21043
...

Thanks,
Xishi Qiu



[RFC] a question about stack size form /proc/pid/task/child pid/limits

2017-09-05 Thread Xishi Qiu
Hi, I find that if I use a user-defined stack size to create a child thread,
the max stack size from /proc/pid/task/<child pid>/limits still
shows "Max stack size          8388608"; it doesn't update to
the user-defined size. Is it a problem?

Here is the test code:
...
pthread_attr_t attr;
ret = pthread_attr_init(&attr);
if (ret)
printf("error\n");
ret = pthread_attr_setstacksize(&attr, 83886080);
if (ret)
printf("error\n");
ret = pthread_create(&thread_1[i], &attr, (void *)thread_alloc,
NULL);
...

I use strace to track the app, and it shows that glibc calls mmap to
alloc the child thread's stack. So should glibc call setrlimit to
update the stack limit too?

And glibc will only insert a guard at the start of the stack vma,
so the stack vma maybe merged to another vma at the end, right?

...
mmap(NULL, 83890176, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, 
-1, 0) = 0x7fca1d6a6000
mprotect(0x7fca1d6a6000, 4096, PROT_NONE) = 0
clone(child_stack=0x7fca226a5fb0, 
flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID,
 parent_tidptr=0x7fca226a69d0, tls=0x7fca226a6700, child_tidptr=0x7fca226a69d0) 
= 21043
...

Thanks,
Xishi Qiu



Re: [PATCH 2/2] mm, memory_hotplug: remove timeout from __offline_memory

2017-09-04 Thread Xishi Qiu
On 2017/9/4 17:01, Michal Hocko wrote:

> On Mon 04-09-17 16:58:30, Xishi Qiu wrote:
>> On 2017/9/4 16:21, Michal Hocko wrote:
>>
>>> From: Michal Hocko <mho...@suse.com>
>>>
>>> We have a hardcoded 120s timeout after which the memory offline fails
>>> basically since the hot remove has been introduced. This is essentially
>>> a policy implemented in the kernel. Moreover there is no way to adjust
>>> the timeout and so we are sometimes facing memory offline failures if
>>> the system is under a heavy memory pressure or very intensive CPU
>>> workload on large machines.
>>>
>>> It is not very clear what purpose the timeout actually serves. The
>>> offline operation is interruptible by a signal so if userspace wants
>>
>> Hi Michal,
>>
>> If the user know what he should do if migration for a long time,
>> it is OK, but I don't think all the users know this operation
>> (e.g. ctrl + c) and the affect.
> 
> How is this operation any different from other potentially long
> interruptible syscalls?
> 

Hi Michal,

I mean the user should stop it himself if migration keeps retrying endlessly.

Thanks,
Xishi Qiu



Re: [PATCH 2/2] mm, memory_hotplug: remove timeout from __offline_memory

2017-09-04 Thread Xishi Qiu
On 2017/9/4 17:01, Michal Hocko wrote:

> On Mon 04-09-17 16:58:30, Xishi Qiu wrote:
>> On 2017/9/4 16:21, Michal Hocko wrote:
>>
>>> From: Michal Hocko 
>>>
>>> We have a hardcoded 120s timeout after which the memory offline fails
>>> basically since the hot remove has been introduced. This is essentially
>>> a policy implemented in the kernel. Moreover there is no way to adjust
>>> the timeout and so we are sometimes facing memory offline failures if
>>> the system is under a heavy memory pressure or very intensive CPU
>>> workload on large machines.
>>>
>>> It is not very clear what purpose the timeout actually serves. The
>>> offline operation is interruptible by a signal so if userspace wants
>>
>> Hi Michal,
>>
>> If the user know what he should do if migration for a long time,
>> it is OK, but I don't think all the users know this operation
>> (e.g. ctrl + c) and the affect.
> 
> How is this operation any different from other potentially long
> interruptible syscalls?
> 

Hi Michal,

I mean the user should stop it himself if migration keeps retrying endlessly.

Thanks,
Xishi Qiu



Re: [PATCH 2/2] mm, memory_hotplug: remove timeout from __offline_memory

2017-09-04 Thread Xishi Qiu
On 2017/9/4 16:21, Michal Hocko wrote:

> From: Michal Hocko <mho...@suse.com>
> 
> We have a hardcoded 120s timeout after which the memory offline fails
> basically since the hot remove has been introduced. This is essentially
> a policy implemented in the kernel. Moreover there is no way to adjust
> the timeout and so we are sometimes facing memory offline failures if
> the system is under a heavy memory pressure or very intensive CPU
> workload on large machines.
> 
> It is not very clear what purpose the timeout actually serves. The
> offline operation is interruptible by a signal so if userspace wants

Hi Michal,

If the user knows what he should do when migration takes a long time,
it is OK, but I don't think all users know about this operation
(e.g. ctrl + c) and its effect.

Thanks,
Xishi Qiu

> some timeout based termination this can be done trivially by sending a
> signal.
> 
> If there is a strong usecase to do this from the kernel then we should
> do it properly and have a it tunable from the userspace with the timeout
> disabled by default along with the explanation who uses it and for what
> purporse.
> 
> Signed-off-by: Michal Hocko <mho...@suse.com>
> ---
>  mm/memory_hotplug.c | 10 +++---
>  1 file changed, 3 insertions(+), 7 deletions(-)
> 
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index c9dcbe6d2ac6..b8a85c11360e 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1593,9 +1593,9 @@ static void node_states_clear_node(int node, struct 
> memory_notify *arg)
>  }
>  
>  static int __ref __offline_pages(unsigned long start_pfn,
> -   unsigned long end_pfn, unsigned long timeout)
> +   unsigned long end_pfn)
>  {
> - unsigned long pfn, nr_pages, expire;
> + unsigned long pfn, nr_pages;
>   long offlined_pages;
>   int ret, node;
>   unsigned long flags;
> @@ -1633,12 +1633,8 @@ static int __ref __offline_pages(unsigned long 
> start_pfn,
>   goto failed_removal;
>  
>   pfn = start_pfn;
> - expire = jiffies + timeout;
>  repeat:
>   /* start memory hot removal */
> - ret = -EBUSY;
> - if (time_after(jiffies, expire))
> - goto failed_removal;
>   ret = -EINTR;
>   if (signal_pending(current))
>   goto failed_removal;
> @@ -1711,7 +1707,7 @@ static int __ref __offline_pages(unsigned long 
> start_pfn,
>  /* Must be protected by mem_hotplug_begin() or a device_lock */
>  int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
>  {
> - return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
> + return __offline_pages(start_pfn, start_pfn + nr_pages);
>  }
>  #endif /* CONFIG_MEMORY_HOTREMOVE */
>  





Re: [PATCH 2/2] mm, memory_hotplug: remove timeout from __offline_memory

2017-09-04 Thread Xishi Qiu
On 2017/9/4 16:21, Michal Hocko wrote:

> From: Michal Hocko 
> 
> We have a hardcoded 120s timeout after which the memory offline fails
> basically since the hot remove has been introduced. This is essentially
> a policy implemented in the kernel. Moreover there is no way to adjust
> the timeout and so we are sometimes facing memory offline failures if
> the system is under a heavy memory pressure or very intensive CPU
> workload on large machines.
> 
> It is not very clear what purpose the timeout actually serves. The
> offline operation is interruptible by a signal so if userspace wants

Hi Michal,

If the user knows what he should do when migration takes a long time,
it is OK, but I don't think all users know about this operation
(e.g. ctrl + c) and its effect.

Thanks,
Xishi Qiu

> some timeout based termination this can be done trivially by sending a
> signal.
> 
> If there is a strong usecase to do this from the kernel then we should
> do it properly and have a it tunable from the userspace with the timeout
> disabled by default along with the explanation who uses it and for what
> purporse.
> 
> Signed-off-by: Michal Hocko 
> ---
>  mm/memory_hotplug.c | 10 +++---
>  1 file changed, 3 insertions(+), 7 deletions(-)
> 
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index c9dcbe6d2ac6..b8a85c11360e 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1593,9 +1593,9 @@ static void node_states_clear_node(int node, struct 
> memory_notify *arg)
>  }
>  
>  static int __ref __offline_pages(unsigned long start_pfn,
> -   unsigned long end_pfn, unsigned long timeout)
> +   unsigned long end_pfn)
>  {
> - unsigned long pfn, nr_pages, expire;
> + unsigned long pfn, nr_pages;
>   long offlined_pages;
>   int ret, node;
>   unsigned long flags;
> @@ -1633,12 +1633,8 @@ static int __ref __offline_pages(unsigned long 
> start_pfn,
>   goto failed_removal;
>  
>   pfn = start_pfn;
> - expire = jiffies + timeout;
>  repeat:
>   /* start memory hot removal */
> - ret = -EBUSY;
> - if (time_after(jiffies, expire))
> - goto failed_removal;
>   ret = -EINTR;
>   if (signal_pending(current))
>   goto failed_removal;
> @@ -1711,7 +1707,7 @@ static int __ref __offline_pages(unsigned long 
> start_pfn,
>  /* Must be protected by mem_hotplug_begin() or a device_lock */
>  int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
>  {
> - return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
> + return __offline_pages(start_pfn, start_pfn + nr_pages);
>  }
>  #endif /* CONFIG_MEMORY_HOTREMOVE */
>  





Re: mm, something wrong in page_lock_anon_vma_read()?

2017-07-19 Thread Xishi Qiu
On 2017/7/19 16:40, Vlastimil Babka wrote:

> On 07/18/2017 12:59 PM, Xishi Qiu wrote:
>> Hi,
>>
>> Unfortunately, this patch(mm: thp: fix SMP race condition between
>> THP page fault and MADV_DONTNEED) didn't help, I got the panic again.
> 
> Too bad then. I don't know of any other patch from my own experience
> being directly related, try to look for similar THP-related race fixes.
> Did you already check whether disabling THP (set it to "never" under
> /sys/...) prevents the issue? I forgot.
> 

Hi Vlastimil,

Thanks for your reply.

This bug is hard to reproduce, and my production environment doesn't allow
disabling THP because of the performance regression. Also, I have no way to
reproduce this bug myself (I don't have the user apps or the stress workload
from the production environment).

>> And I find this error before panic, "[468229.996610] BUG: Bad rss-counter 
>> state mm:8806aebc2580 idx:1 val:1"
> 
> This likely means that a pte was overwritten to zero, and an anon page
> had no other reference than this pte, so it became orphaned. Its
> anon_vma object was freed as the process exited, and eventually
> overwritten by a new user, so compaction or reclaim looking at it sooner
> or later makes a bad memory access.
> 
> The pte overwriting may be a result of races with multiple threads
> trying to either read or write within the same page, involving THP zero
> page. It doesn't have to be MADV_DONTNEED related.
> 

I find two patches from upstream.
887843961c4b4681ee993c36d4997bf4b4aa8253
a9c8e4beeeb64c22b84c803747487857fe424b68

I can't find any relations to the panic from the first one, and the second
one seems triggered from xen, but we use kvm.

Thanks,
Xishi Qiu 

>> [468451.702807] BUG: unable to handle kernel NULL pointer dereference at 
>> 0008
>> [468451.702861] IP: [] down_read_trylock+0x9/0x30
>> [468451.702900] PGD 12445e067 PUD 11acaa067 PMD 0 
>> [468451.702931] Oops:  [#1] SMP 
>> [468451.702953] kbox catch die event.
>> [468451.703003] collected_len = 1047419, LOG_BUF_LEN_LOCAL = 1048576
>> [468451.703003] kbox: notify die begin
>> [468451.703003] kbox: no notify die func register. no need to notify
>> [468451.703003] do nothing after die!
>> [468451.703003] Modules linked in: ipt_REJECT macvlan ip_set_hash_ipport 
>> vport_vxlan(OVE) xt_statistic xt_physdev xt_nat xt_recent xt_mark xt_comment 
>> veth ct_limit(OVE) bum_extract(OVE) policy(OVE) bum(OVE) ip_set nfnetlink 
>> openvswitch(OVE) nf_defrag_ipv6 gre ext3 jbd ipt_MASQUERADE 
>> nf_nat_masquerade_ipv4 iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 
>> nf_nat_ipv4 xt_addrtype iptable_filter xt_conntrack nf_nat nf_conntrack 
>> bridge stp llc kboxdriver(O) kbox(O) dm_thin_pool dm_persistent_data 
>> crc32_pclmul dm_bio_prison dm_bufio ghash_clmulni_intel libcrc32c 
>> aesni_intel lrw gf128mul glue_helper ablk_helper cryptd ppdev sg parport_pc 
>> cirrus virtio_console parport syscopyarea sysfillrect sysimgblt ttm 
>> drm_kms_helper drm i2c_piix4 i2c_core pcspkr ip_tables ext4 jbd2 mbcache 
>> sr_mod cdrom ata_generic pata_acpi
>> [468451.703003]  virtio_net virtio_blk crct10dif_pclmul crct10dif_common 
>> ata_piix virtio_pci libata serio_raw virtio_ring crc32c_intel virtio 
>> dm_mirror dm_region_hash dm_log dm_mod
>> [468451.703003] CPU: 6 PID: 21965 Comm: docker-containe Tainted: G   
>> OE  V---   3.10.0-327.53.58.73.x86_64 #1
>> [468451.703003] Hardware name: OpenStack Foundation OpenStack Nova, BIOS 
>> rel-1.8.1-0-g4adadbd-20170107_142945-9_64_246_229 04/01/2014
>> [468451.703003] task: 880692402e00 ti: 88018209c000 task.ti: 
>> 88018209c000
>> [468451.703003] RIP: 0010:[]  [] 
>> down_read_trylock+0x9/0x30
>> [468451.703003] RSP: 0018:88018209f8f8  EFLAGS: 00010202
>> [468451.703003] RAX:  RBX: 880720cd7740 RCX: 
>> 880720cd7740
>> [468451.703003] RDX: 0001 RSI: 0301 RDI: 
>> 0008
>> [468451.703003] RBP: 88018209f8f8 R08: c0e0f310 R09: 
>> 880720cd7740
>> [468451.703003] R10: 88083efd8000 R11:  R12: 
>> 880720cd7741
>> [468451.703003] R13: ea000824d100 R14: 0008 R15: 
>> 
>> [468451.703003] FS:  7fc0e2a85700() GS:88083ed8() 
>> knlGS:
>> [468451.703003] CS:  0010 DS:  ES:  CR0: 80050033
>> [468451.703003] CR2: 0008 CR3: 000661906000 CR4: 
>> 001407e0
>> [468451.703003] DR0:  DR1:  DR2: 
>> 
>> [468451.703003] DR3:  DR6: 0

Re: mm, something wrong in page_lock_anon_vma_read()?

2017-07-19 Thread Xishi Qiu
On 2017/7/19 16:40, Vlastimil Babka wrote:

> On 07/18/2017 12:59 PM, Xishi Qiu wrote:
>> Hi,
>>
>> Unfortunately, this patch(mm: thp: fix SMP race condition between
>> THP page fault and MADV_DONTNEED) didn't help, I got the panic again.
> 
> Too bad then. I don't know of any other patch from my own experience
> being directly related, try to look for similar THP-related race fixes.
> Did you already check whether disabling THP (set it to "never" under
> /sys/...) prevents the issue? I forgot.
> 

Hi Vlastimil,

Thanks for your reply.

This bug is hard to reproduce, and my production environment doesn't allow
disabling THP because of the performance regression. Also, I have no way to
reproduce this bug myself (I don't have the user apps or the stress workload
from the production environment).

>> And I find this error before panic, "[468229.996610] BUG: Bad rss-counter 
>> state mm:8806aebc2580 idx:1 val:1"
> 
> This likely means that a pte was overwritten to zero, and an anon page
> had no other reference than this pte, so it became orphaned. Its
> anon_vma object was freed as the process exited, and eventually
> overwritten by a new user, so compaction or reclaim looking at it sooner
> or later makes a bad memory access.
> 
> The pte overwriting may be a result of races with multiple threads
> trying to either read or write within the same page, involving THP zero
> page. It doesn't have to be MADV_DONTNEED related.
> 

I find two patches from upstream.
887843961c4b4681ee993c36d4997bf4b4aa8253
a9c8e4beeeb64c22b84c803747487857fe424b68

I can't find any relations to the panic from the first one, and the second
one seems triggered from xen, but we use kvm.

Thanks,
Xishi Qiu 

>> [468451.702807] BUG: unable to handle kernel NULL pointer dereference at 
>> 0008
>> [468451.702861] IP: [] down_read_trylock+0x9/0x30
>> [468451.702900] PGD 12445e067 PUD 11acaa067 PMD 0 
>> [468451.702931] Oops:  [#1] SMP 
>> [468451.702953] kbox catch die event.
>> [468451.703003] collected_len = 1047419, LOG_BUF_LEN_LOCAL = 1048576
>> [468451.703003] kbox: notify die begin
>> [468451.703003] kbox: no notify die func register. no need to notify
>> [468451.703003] do nothing after die!
>> [468451.703003] Modules linked in: ipt_REJECT macvlan ip_set_hash_ipport 
>> vport_vxlan(OVE) xt_statistic xt_physdev xt_nat xt_recent xt_mark xt_comment 
>> veth ct_limit(OVE) bum_extract(OVE) policy(OVE) bum(OVE) ip_set nfnetlink 
>> openvswitch(OVE) nf_defrag_ipv6 gre ext3 jbd ipt_MASQUERADE 
>> nf_nat_masquerade_ipv4 iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 
>> nf_nat_ipv4 xt_addrtype iptable_filter xt_conntrack nf_nat nf_conntrack 
>> bridge stp llc kboxdriver(O) kbox(O) dm_thin_pool dm_persistent_data 
>> crc32_pclmul dm_bio_prison dm_bufio ghash_clmulni_intel libcrc32c 
>> aesni_intel lrw gf128mul glue_helper ablk_helper cryptd ppdev sg parport_pc 
>> cirrus virtio_console parport syscopyarea sysfillrect sysimgblt ttm 
>> drm_kms_helper drm i2c_piix4 i2c_core pcspkr ip_tables ext4 jbd2 mbcache 
>> sr_mod cdrom ata_generic pata_acpi
>> [468451.703003]  virtio_net virtio_blk crct10dif_pclmul crct10dif_common 
>> ata_piix virtio_pci libata serio_raw virtio_ring crc32c_intel virtio 
>> dm_mirror dm_region_hash dm_log dm_mod
>> [468451.703003] CPU: 6 PID: 21965 Comm: docker-containe Tainted: G   
>> OE  V---   3.10.0-327.53.58.73.x86_64 #1
>> [468451.703003] Hardware name: OpenStack Foundation OpenStack Nova, BIOS 
>> rel-1.8.1-0-g4adadbd-20170107_142945-9_64_246_229 04/01/2014
>> [468451.703003] task: 880692402e00 ti: 88018209c000 task.ti: 
>> 88018209c000
>> [468451.703003] RIP: 0010:[]  [] 
>> down_read_trylock+0x9/0x30
>> [468451.703003] RSP: 0018:88018209f8f8  EFLAGS: 00010202
>> [468451.703003] RAX:  RBX: 880720cd7740 RCX: 
>> 880720cd7740
>> [468451.703003] RDX: 0001 RSI: 0301 RDI: 
>> 0008
>> [468451.703003] RBP: 88018209f8f8 R08: c0e0f310 R09: 
>> 880720cd7740
>> [468451.703003] R10: 88083efd8000 R11:  R12: 
>> 880720cd7741
>> [468451.703003] R13: ea000824d100 R14: 0008 R15: 
>> 
>> [468451.703003] FS:  7fc0e2a85700() GS:88083ed8() 
>> knlGS:
>> [468451.703003] CS:  0010 DS:  ES:  CR0: 80050033
>> [468451.703003] CR2: 0008 CR3: 000661906000 CR4: 
>> 001407e0
>> [468451.703003] DR0:  DR1:  DR2: 
>> 
>> [468451.703003] DR3:  DR6: 0

Re: mm, something wrong in page_lock_anon_vma_read()?

2017-07-18 Thread Xishi Qiu
On 2017/6/8 21:59, Vlastimil Babka wrote:

> On 06/08/2017 03:44 PM, Xishi Qiu wrote:
>> On 2017/5/23 17:33, Vlastimil Babka wrote:
>>
>>> On 05/23/2017 11:21 AM, zhong jiang wrote:
>>>> On 2017/5/23 0:51, Vlastimil Babka wrote:
>>>>> On 05/20/2017 05:01 AM, zhong jiang wrote:
>>>>>> On 2017/5/20 10:40, Hugh Dickins wrote:
>>>>>>> On Sat, 20 May 2017, Xishi Qiu wrote:
>>>>>>>> Here is a bug report form redhat: 
>>>>>>>> https://bugzilla.redhat.com/show_bug.cgi?id=1305620
>>>>>>>> And I meet the bug too. However it is hard to reproduce, and 
>>>>>>>> 624483f3ea82598("mm: rmap: fix use-after-free in __put_anon_vma") is 
>>>>>>>> not help.
>>>>>>>>
>>>>>>>> From the vmcore, it seems that the page is still mapped(_mapcount=0 
>>>>>>>> and _count=2),
>>>>>>>> and the value of mapping is a valid address(mapping = 
>>>>>>>> 0x8801b3e2a101),
>>>>>>>> but anon_vma has been corrupted.
>>>>>>>>
>>>>>>>> Any ideas?
>>>>>>> Sorry, no.  I assume that _mapcount has been misaccounted, for example
>>>>>>> a pte mapped in on top of another pte; but cannot begin tell you where
>>>>>>> in Red Hat's kernel-3.10.0-229.4.2.el7 that might happen.
>>>>>>>
>>>>>>> Hugh
>>>>>>>
>>>>>>> .
>>>>>>>
>>>>>> Hi, Hugh
>>>>>>
>>>>>> I find the following message from the dmesg.
>>>>>>
>>>>>> [26068.316592] BUG: Bad rss-counter state mm:8800a7de2d80 idx:1 val:1
>>>>>>
>>>>>> I can prove that the __mapcount is misaccount.  when task is exited. the 
>>>>>> rmap
>>>>>> still exist.
>>>>> Check if the kernel in question contains this commit: ad33bb04b2a6 ("mm:
>>>>> thp: fix SMP race condition between THP page fault and MADV_DONTNEED")
>>>>   HI, Vlastimil
>>>>  
>>>>   I miss the patch.
>>>
>>> Try applying it then, there's good chance the error and crash will go
>>> away. Even if your workload doesn't actually run any madvise(MADV_DONTNEED).
>>>
>>
>> Hi Vlastimil,
>>
>> I find this error was reported by Kirill as following, right?
>> https://patchwork.kernel.org/patch/7550401/
> 
> That was reported by Minchan.
> 
>> The call trace is quite like the same as ours.
> 
> In that thread, the error seems just disappeared in the end.
> 
> So, did you apply the patch I suggested? Did it help?
> 

Hi,

Unfortunately, this patch(mm: thp: fix SMP race condition between
THP page fault and MADV_DONTNEED) didn't help, I got the panic again.

And I find this error before panic, "[468229.996610] BUG: Bad rss-counter state 
mm:8806aebc2580 idx:1 val:1"

[468451.702807] BUG: unable to handle kernel NULL pointer dereference at 
0008
[468451.702861] IP: [] down_read_trylock+0x9/0x30
[468451.702900] PGD 12445e067 PUD 11acaa067 PMD 0 
[468451.702931] Oops:  [#1] SMP 
[468451.702953] kbox catch die event.
[468451.703003] collected_len = 1047419, LOG_BUF_LEN_LOCAL = 1048576
[468451.703003] kbox: notify die begin
[468451.703003] kbox: no notify die func register. no need to notify
[468451.703003] do nothing after die!
[468451.703003] Modules linked in: ipt_REJECT macvlan ip_set_hash_ipport 
vport_vxlan(OVE) xt_statistic xt_physdev xt_nat xt_recent xt_mark xt_comment 
veth ct_limit(OVE) bum_extract(OVE) policy(OVE) bum(OVE) ip_set nfnetlink 
openvswitch(OVE) nf_defrag_ipv6 gre ext3 jbd ipt_MASQUERADE 
nf_nat_masquerade_ipv4 iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 
xt_addrtype iptable_filter xt_conntrack nf_nat nf_conntrack bridge stp llc 
kboxdriver(O) kbox(O) dm_thin_pool dm_persistent_data crc32_pclmul 
dm_bio_prison dm_bufio ghash_clmulni_intel libcrc32c aesni_intel lrw gf128mul 
glue_helper ablk_helper cryptd ppdev sg parport_pc cirrus virtio_console 
parport syscopyarea sysfillrect sysimgblt ttm drm_kms_helper drm i2c_piix4 
i2c_core pcspkr ip_tables ext4 jbd2 mbcache sr_mod cdrom ata_generic pata_acpi
[468451.703003]  virtio_net virtio_blk crct10dif_pclmul crct10dif_common 
ata_piix virtio_pci libata serio_raw virtio_ring crc32c_intel virtio dm_mirror 
dm_region_hash dm_log dm_mod
[468451.703003] CPU: 6 PID: 21965 Comm: docker-containe Tainted: G   OE 
 V---   3.10.0-327.53.58.73.x86_

Re: mm, something wrong in page_lock_anon_vma_read()?

2017-07-18 Thread Xishi Qiu
On 2017/6/8 21:59, Vlastimil Babka wrote:

> On 06/08/2017 03:44 PM, Xishi Qiu wrote:
>> On 2017/5/23 17:33, Vlastimil Babka wrote:
>>
>>> On 05/23/2017 11:21 AM, zhong jiang wrote:
>>>> On 2017/5/23 0:51, Vlastimil Babka wrote:
>>>>> On 05/20/2017 05:01 AM, zhong jiang wrote:
>>>>>> On 2017/5/20 10:40, Hugh Dickins wrote:
>>>>>>> On Sat, 20 May 2017, Xishi Qiu wrote:
>>>>>>>> Here is a bug report form redhat: 
>>>>>>>> https://bugzilla.redhat.com/show_bug.cgi?id=1305620
>>>>>>>> And I meet the bug too. However it is hard to reproduce, and 
>>>>>>>> 624483f3ea82598("mm: rmap: fix use-after-free in __put_anon_vma") is 
>>>>>>>> not help.
>>>>>>>>
>>>>>>>> From the vmcore, it seems that the page is still mapped(_mapcount=0 
>>>>>>>> and _count=2),
>>>>>>>> and the value of mapping is a valid address(mapping = 
>>>>>>>> 0x8801b3e2a101),
>>>>>>>> but anon_vma has been corrupted.
>>>>>>>>
>>>>>>>> Any ideas?
>>>>>>> Sorry, no.  I assume that _mapcount has been misaccounted, for example
>>>>>>> a pte mapped in on top of another pte; but cannot begin tell you where
>>>>>>> in Red Hat's kernel-3.10.0-229.4.2.el7 that might happen.
>>>>>>>
>>>>>>> Hugh
>>>>>>>
>>>>>>> .
>>>>>>>
>>>>>> Hi, Hugh
>>>>>>
>>>>>> I find the following message from the dmesg.
>>>>>>
>>>>>> [26068.316592] BUG: Bad rss-counter state mm:8800a7de2d80 idx:1 val:1
>>>>>>
>>>>>> I can prove that the __mapcount is misaccount.  when task is exited. the 
>>>>>> rmap
>>>>>> still exist.
>>>>> Check if the kernel in question contains this commit: ad33bb04b2a6 ("mm:
>>>>> thp: fix SMP race condition between THP page fault and MADV_DONTNEED")
>>>>   HI, Vlastimil
>>>>  
>>>>   I miss the patch.
>>>
>>> Try applying it then, there's good chance the error and crash will go
>>> away. Even if your workload doesn't actually run any madvise(MADV_DONTNEED).
>>>
>>
>> Hi Vlastimil,
>>
>> I find this error was reported by Kirill as following, right?
>> https://patchwork.kernel.org/patch/7550401/
> 
> That was reported by Minchan.
> 
>> The call trace is quite like the same as ours.
> 
> In that thread, the error seems just disappeared in the end.
> 
> So, did you apply the patch I suggested? Did it help?
> 

Hi,

Unfortunately, this patch(mm: thp: fix SMP race condition between
THP page fault and MADV_DONTNEED) didn't help, I got the panic again.

And I find this error before panic, "[468229.996610] BUG: Bad rss-counter state 
mm:8806aebc2580 idx:1 val:1"

[468451.702807] BUG: unable to handle kernel NULL pointer dereference at 
0008
[468451.702861] IP: [] down_read_trylock+0x9/0x30
[468451.702900] PGD 12445e067 PUD 11acaa067 PMD 0 
[468451.702931] Oops:  [#1] SMP 
[468451.702953] kbox catch die event.
[468451.703003] collected_len = 1047419, LOG_BUF_LEN_LOCAL = 1048576
[468451.703003] kbox: notify die begin
[468451.703003] kbox: no notify die func register. no need to notify
[468451.703003] do nothing after die!
[468451.703003] Modules linked in: ipt_REJECT macvlan ip_set_hash_ipport 
vport_vxlan(OVE) xt_statistic xt_physdev xt_nat xt_recent xt_mark xt_comment 
veth ct_limit(OVE) bum_extract(OVE) policy(OVE) bum(OVE) ip_set nfnetlink 
openvswitch(OVE) nf_defrag_ipv6 gre ext3 jbd ipt_MASQUERADE 
nf_nat_masquerade_ipv4 iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 
xt_addrtype iptable_filter xt_conntrack nf_nat nf_conntrack bridge stp llc 
kboxdriver(O) kbox(O) dm_thin_pool dm_persistent_data crc32_pclmul 
dm_bio_prison dm_bufio ghash_clmulni_intel libcrc32c aesni_intel lrw gf128mul 
glue_helper ablk_helper cryptd ppdev sg parport_pc cirrus virtio_console 
parport syscopyarea sysfillrect sysimgblt ttm drm_kms_helper drm i2c_piix4 
i2c_core pcspkr ip_tables ext4 jbd2 mbcache sr_mod cdrom ata_generic pata_acpi
[468451.703003]  virtio_net virtio_blk crct10dif_pclmul crct10dif_common 
ata_piix virtio_pci libata serio_raw virtio_ring crc32c_intel virtio dm_mirror 
dm_region_hash dm_log dm_mod
[468451.703003] CPU: 6 PID: 21965 Comm: docker-containe Tainted: G   OE 
 V---   3.10.0-327.53.58.73.x86_

Re: centos 7.2, I got some oops from my production line

2017-07-04 Thread Xishi Qiu
On 2017/6/29 16:22, Xishi Qiu wrote:

> centos 7.2, I got some oops from my production line,
> Anybody has seen these errors before?
> 

Here is another one

[  703.025737] BUG: unable to handle kernel NULL pointer dereference at 
0d68
[  703.026008] IP: [] mlx4_en_QUERY_PORT+0xa2/0x190 [mlx4_en]
[  703.026008] PGD 377f2a067 PUD 379df4067 PMD 0 
[  703.026008] Oops: 0002 [#1] SMP 
[  703.033019] Modules linked in: sch_htb haek(OVE) squashfs loop binfmt_misc 
phram mtdblock mtd_blkdevs mtd zlib_deflate nf_log_ipv4 nf_log_common xt_LOG 
ipmi_watchdog ipmi_devintf ipmi_si ipmi_msghandler vfat fat bonding tipc 
kboxdriver(O) kbox(O) ipt_REJECT iptable_filter signo_catch(O) mlx4_ib(OVE) 
ib_sa(OVE) ib_mad(OVE) ib_core(OVE) mlx4_en(OVE) ib_addr(OVE) ib_netlink(OVE) 
vxlan ip6_udp_tunnel udp_tunnel ptp pps_core mlx4_core(OVE) compat(OVE) isofs 
crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper 
ablk_helper cryptd ppdev dm_mod parport_pc sg parport pcspkr i2c_piix4 i2c_core 
ip_tables ext3 mbcache jbd sr_mod cdrom ata_generic pata_acpi virtio_blk(OVE) 
virtio_console(OVE) kvm_ivshmem(OVE) crct10dif_pclmul crct10dif_common ata_piix 
crc32c_intel serio_raw libata pv_channel(OVE)
[  703.055064] mlx4_core :00:07.0: mlx4_dec_port_macs removed mac, port: 1, 
now: 0
[  703.033019]  virtio_pci(OVE) virtio_ring(OVE) virtio(OVE) floppy 
monitor_netdev(OE)
[  703.033019] CPU: 3 PID: 3038 Comm: kworker/3:2 Tainted: GW  OE  
V---   3.10.0-327.49.58.52.x86_64 #1
[  703.033019] Hardware name: OpenStack Foundation OpenStack Nova, BIOS 
rel-1.8.1-0-g4adadbd-2016_105425-HGH108200 04/01/2014
[  703.033019] Workqueue: events linkwatch_event
[  703.033019] task: 88041a9bf300 ti: 880412134000 task.ti: 
880412134000
[  703.066565] RIP: 0010:[]  [] 
mlx4_en_QUERY_PORT+0xa2/0x190 [mlx4_en]
[  703.066565] RSP: 0018:880412137bd0  EFLAGS: 00010a03
[  703.066565] RAX: 8800ba5bc000 RBX: 880410cd1000 RCX: 0038
[  703.066565] RDX: 0001 RSI: 0246 RDI: 88041472046c
[  703.074752] RBP: 880412137c10 R08: 81668be0 R09: 81dc63c0
[  703.074752] R10: 0400 R11: 0017 R12: 8803773eea20
[  703.074752] R13:  R14:  R15: 88041b5eb000
[  703.074752] FS:  () GS:880434ac() 
knlGS:
[  703.074752] CS:  0010 DS:  ES:  CR0: 80050033
[  703.074752] CR2: 0d68 CR3: 00036ff69000 CR4: 001407e0
[  703.086770] DR0:  DR1:  DR2: 
[  703.086770] DR3:  DR6: 0ff0 DR7: 0400
[  703.086770] Stack:
[  703.086770]  88040043 ea60 8804 
ba5bc000
[  703.086770]  880410ce 880412137cac 88041b5eb8c0 
880410ce08c0
[  703.086770]  880412137c78 a02a29a4 dead00200200 
4ff29e7e
[  703.086770] Call Trace:
[  703.086770]  [] mlx4_en_get_settings+0x34/0x540 [mlx4_en]
[  703.086770]  [] __ethtool_get_settings+0x86/0x140
[  703.104149]  [] bond_update_speed_duplex+0x3d/0x90 
[bonding]
[  703.104149]  [] bond_netdev_event+0x137/0x360 [bonding]
[  703.104149]  [] notifier_call_chain+0x4c/0x70
[  703.104149]  [] raw_notifier_call_chain+0x16/0x20
[  703.104149]  [] call_netdevice_notifiers+0x2d/0x60
[  703.104149]  [] netdev_state_change+0x23/0x40
[  703.104149]  [] linkwatch_do_dev+0x40/0x60
[  703.104149]  [] __linkwatch_run_queue+0xef/0x200
[  703.104149]  [] linkwatch_event+0x25/0x30
[  703.104149]  [] process_one_work+0x17b/0x470
[  703.104149]  [] worker_thread+0x11b/0x400
[  703.104149]  [] ? rescuer_thread+0x400/0x400
[  703.104149]  [] kthread+0xcf/0xe0
[  703.104149]  [] ? kthread_create_on_node+0x140/0x140
[  703.104149]  [] ret_from_fork+0x58/0x90
[  703.104149]  [] ? kthread_create_on_node+0x140/0x140
[  703.134274] Code: 48 8b 3b 4c 89 e6 e8 7e 7e f4 ff 48 83 c4 20 44 89 f0 5b 
41 5c 41 5d 41 5e 5d c3 66 0f 1f 44 00 00 49 8b 04 24 0f be 10 c1 ea 1f <41> 89 
95 68 0d 00 00 0f b6 50 05 83 e2 6f 80 fa 40 0f 87 b7 00 
[  703.134274] RIP  [] mlx4_en_QUERY_PORT+0xa2/0x190 [mlx4_en]
[  703.134274]  RSP 
[  703.134274] CR2: 0d68
[  703.134274] ---[ end trace 76a7da47a517c30b ]---
[  703.134274] Kernel panic - not syncing: Fatal exception


> 
> 1)
> 2017-06-28T02:18:16.461384+08:00[880983.488036] do nothing after die!
> 2017-06-28T02:18:16.462068+08:00[880983.488723] Modules linked in: fuse 
> iptable_filter sha512_generic icp_qa_al_vf(OVE) vfat fat isofs ext4 jbd2 xfs 
> libcrc32c kboxdriver(O) ipmi_devintf ipmi_si ipmi_msghandler kbox(O) 
> signo_catch(O) mlx4_core(OVE) compat(OVE) ppdev crc32_pclmul 
> ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd 
> pcspkr parport_pc i2c_piix4 parport i2c_core ip_tables ext3 mbcache jbd 
> ata_generic pat

Re: centos 7.2, I got some oops from my production line

2017-07-04 Thread Xishi Qiu
On 2017/6/29 16:22, Xishi Qiu wrote:

> centos 7.2, I got some oops from my production line,
> Anybody has seen these errors before?
> 

Here is another one

[  703.025737] BUG: unable to handle kernel NULL pointer dereference at 
0d68
[  703.026008] IP: [] mlx4_en_QUERY_PORT+0xa2/0x190 [mlx4_en]
[  703.026008] PGD 377f2a067 PUD 379df4067 PMD 0 
[  703.026008] Oops: 0002 [#1] SMP 
[  703.033019] Modules linked in: sch_htb haek(OVE) squashfs loop binfmt_misc 
phram mtdblock mtd_blkdevs mtd zlib_deflate nf_log_ipv4 nf_log_common xt_LOG 
ipmi_watchdog ipmi_devintf ipmi_si ipmi_msghandler vfat fat bonding tipc 
kboxdriver(O) kbox(O) ipt_REJECT iptable_filter signo_catch(O) mlx4_ib(OVE) 
ib_sa(OVE) ib_mad(OVE) ib_core(OVE) mlx4_en(OVE) ib_addr(OVE) ib_netlink(OVE) 
vxlan ip6_udp_tunnel udp_tunnel ptp pps_core mlx4_core(OVE) compat(OVE) isofs 
crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper 
ablk_helper cryptd ppdev dm_mod parport_pc sg parport pcspkr i2c_piix4 i2c_core 
ip_tables ext3 mbcache jbd sr_mod cdrom ata_generic pata_acpi virtio_blk(OVE) 
virtio_console(OVE) kvm_ivshmem(OVE) crct10dif_pclmul crct10dif_common ata_piix 
crc32c_intel serio_raw libata pv_channel(OVE)
[  703.055064] mlx4_core :00:07.0: mlx4_dec_port_macs removed mac, port: 1, 
now: 0
[  703.033019]  virtio_pci(OVE) virtio_ring(OVE) virtio(OVE) floppy 
monitor_netdev(OE)
[  703.033019] CPU: 3 PID: 3038 Comm: kworker/3:2 Tainted: GW  OE  
V---   3.10.0-327.49.58.52.x86_64 #1
[  703.033019] Hardware name: OpenStack Foundation OpenStack Nova, BIOS 
rel-1.8.1-0-g4adadbd-2016_105425-HGH108200 04/01/2014
[  703.033019] Workqueue: events linkwatch_event
[  703.033019] task: 88041a9bf300 ti: 880412134000 task.ti: 
880412134000
[  703.066565] RIP: 0010:[]  [] 
mlx4_en_QUERY_PORT+0xa2/0x190 [mlx4_en]
[  703.066565] RSP: 0018:880412137bd0  EFLAGS: 00010a03
[  703.066565] RAX: 8800ba5bc000 RBX: 880410cd1000 RCX: 0038
[  703.066565] RDX: 0001 RSI: 0246 RDI: 88041472046c
[  703.074752] RBP: 880412137c10 R08: 81668be0 R09: 81dc63c0
[  703.074752] R10: 0400 R11: 0017 R12: 8803773eea20
[  703.074752] R13:  R14:  R15: 88041b5eb000
[  703.074752] FS:  () GS:880434ac() 
knlGS:
[  703.074752] CS:  0010 DS:  ES:  CR0: 80050033
[  703.074752] CR2: 0d68 CR3: 00036ff69000 CR4: 001407e0
[  703.086770] DR0:  DR1:  DR2: 
[  703.086770] DR3:  DR6: 0ff0 DR7: 0400
[  703.086770] Stack:
[  703.086770]  88040043 ea60 8804 
ba5bc000
[  703.086770]  880410ce 880412137cac 88041b5eb8c0 
880410ce08c0
[  703.086770]  880412137c78 a02a29a4 dead00200200 
4ff29e7e
[  703.086770] Call Trace:
[  703.086770]  [] mlx4_en_get_settings+0x34/0x540 [mlx4_en]
[  703.086770]  [] __ethtool_get_settings+0x86/0x140
[  703.104149]  [] bond_update_speed_duplex+0x3d/0x90 
[bonding]
[  703.104149]  [] bond_netdev_event+0x137/0x360 [bonding]
[  703.104149]  [] notifier_call_chain+0x4c/0x70
[  703.104149]  [] raw_notifier_call_chain+0x16/0x20
[  703.104149]  [] call_netdevice_notifiers+0x2d/0x60
[  703.104149]  [] netdev_state_change+0x23/0x40
[  703.104149]  [] linkwatch_do_dev+0x40/0x60
[  703.104149]  [] __linkwatch_run_queue+0xef/0x200
[  703.104149]  [] linkwatch_event+0x25/0x30
[  703.104149]  [] process_one_work+0x17b/0x470
[  703.104149]  [] worker_thread+0x11b/0x400
[  703.104149]  [] ? rescuer_thread+0x400/0x400
[  703.104149]  [] kthread+0xcf/0xe0
[  703.104149]  [] ? kthread_create_on_node+0x140/0x140
[  703.104149]  [] ret_from_fork+0x58/0x90
[  703.104149]  [] ? kthread_create_on_node+0x140/0x140
[  703.134274] Code: 48 8b 3b 4c 89 e6 e8 7e 7e f4 ff 48 83 c4 20 44 89 f0 5b 
41 5c 41 5d 41 5e 5d c3 66 0f 1f 44 00 00 49 8b 04 24 0f be 10 c1 ea 1f <41> 89 
95 68 0d 00 00 0f b6 50 05 83 e2 6f 80 fa 40 0f 87 b7 00 
[  703.134274] RIP  [] mlx4_en_QUERY_PORT+0xa2/0x190 [mlx4_en]
[  703.134274]  RSP 
[  703.134274] CR2: 0d68
[  703.134274] ---[ end trace 76a7da47a517c30b ]---
[  703.134274] Kernel panic - not syncing: Fatal exception


> 
> 1)
> 2017-06-28T02:18:16.461384+08:00[880983.488036] do nothing after die!
> 2017-06-28T02:18:16.462068+08:00[880983.488723] Modules linked in: fuse 
> iptable_filter sha512_generic icp_qa_al_vf(OVE) vfat fat isofs ext4 jbd2 xfs 
> libcrc32c kboxdriver(O) ipmi_devintf ipmi_si ipmi_msghandler kbox(O) 
> signo_catch(O) mlx4_core(OVE) compat(OVE) ppdev crc32_pclmul 
> ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd 
> pcspkr parport_pc i2c_piix4 parport i2c_core ip_tables ext3 mbcache jbd 
> ata_generic pat

centos 7.2, I got some oops from my production line

2017-06-29 Thread Xishi Qiu
centos 7.2, I got some oops from my production line,
Anybody has seen these errors before?


1)
2017-06-28T02:18:16.461384+08:00[880983.488036] do nothing after die!
2017-06-28T02:18:16.462068+08:00[880983.488723] Modules linked in: fuse 
iptable_filter sha512_generic icp_qa_al_vf(OVE) vfat fat isofs ext4 jbd2 xfs 
libcrc32c kboxdriver(O) ipmi_devintf ipmi_si ipmi_msghandler kbox(O) 
signo_catch(O) mlx4_core(OVE) compat(OVE) ppdev crc32_pclmul 
ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd 
pcspkr parport_pc i2c_piix4 parport i2c_core ip_tables ext3 mbcache jbd 
ata_generic pata_acpi virtio_console(OVE) virtio_balloon(OVE) virtio_blk(OVE) 
virtio_net(OVE) kvm_ivshmem(OVE) crct10dif_pclmul crct10dif_common crc32c_intel 
serio_raw ata_piix pv_channel(OVE) virtio_pci(OVE) virtio_ring(OVE) libata 
virtio(OVE) floppy bonding
2017-06-28T02:18:16.473941+08:00[880983.500597] CPU: 1 PID: 0 Comm: swapper/1 
Tainted: G   OE  V---   3.10.0-327.44.58.28.x86_64 #1
2017-06-28T02:18:16.475784+08:00[880983.502440] Hardware name: QEMU Standard PC 
(i440FX + PIIX, 1996), BIOS 
rel-1.9.1-0-gb3ef39f-20170515_131417-build9a64a246a230 04/01/2014
2017-06-28T02:18:16.477993+08:00[880983.504643] task: 880138b62280 ti: 
880138b74000 task.ti: 880138b74000
2017-06-28T02:18:16.486759+08:00[880983.513413] RIP: 0010:[]  
[] get_next_timer_interrupt+0x1c0/0x270
2017-06-28T02:18:16.490230+08:00[880983.516874] RSP: 0018:880138b77dd8  
EFLAGS: 00010093
2017-06-28T02:18:16.492066+08:00[880983.518720] RAX: d8fe6d5832a10700 RBX: 
0003213fc510c9c0 RCX: 30303031
2017-06-28T02:18:16.494646+08:00[880983.521299] RDX: 30303031 RSI: 
880138ba5298 RDI: 01347e27
2017-06-28T02:18:16.497232+08:00[880983.523877] RBP: 880138b77e30 R08: 
fffefbb208f1 R09: 0001
2017-06-28T02:18:16.499848+08:00[880983.526500] R10: 0027 R11: 
0027 R12: 0001347e26d0
2017-06-28T02:18:16.502272+08:00[880983.528917] R13: 880138ba5028 R14: 
880138ba4000 R15: 880138b77de8
2017-06-28T02:18:16.504845+08:00[880983.531498] FS:  () 
GS:88013ed0() knlGS:
2017-06-28T02:18:16.507583+08:00[880983.534237] CS:  0010 DS:  ES:  
CR0: 80050033
2017-06-28T02:18:16.509242+08:00[880983.535898] CR2: 0ad7a168 CR3: 
00013587f000 CR4: 001407e0
2017-06-28T02:18:16.511736+08:00[880983.538391] DR0:  DR1: 
 DR2: 
2017-06-28T02:18:16.514088+08:00[880983.540742] DR3:  DR6: 
0ff0 DR7: 0400
2017-06-28T02:18:16.516363+08:00[880983.543019] Stack:
2017-06-28T02:18:16.517237+08:00[880983.543895]  880138b77e00 
880138ba5028 880138ba5428 880138ba5828
2017-06-28T02:18:16.519793+08:00[880983.546451]  880138ba5c28 
cdf899bbcb229fc8 88013ed0fbc0 0003213fc510c9c0
2017-06-28T02:18:16.522327+08:00[880983.548982]  0001 
88013ed0cf00 0001347e26d0 880138b77e88
2017-06-28T02:18:16.525118+08:00[880983.551772] Call Trace:
2017-06-28T02:18:16.526205+08:00[880983.552858]  [] 
tick_nohz_stop_sched_tick+0x1e8/0x2e0
2017-06-28T02:18:16.528063+08:00[880983.554716]  [] ? 
kvm_sched_clock_read+0x1f/0x30
2017-06-28T02:18:16.529842+08:00[880983.556496]  [] 
__tick_nohz_idle_enter+0x9e/0x150
2017-06-28T02:18:16.531547+08:00[880983.558199]  [] 
tick_nohz_idle_enter+0x3d/0x70
2017-06-28T02:18:16.533235+08:00[880983.559890]  [] 
cpu_startup_entry+0x9e/0x290
2017-06-28T02:18:16.534843+08:00[880983.561498]  [] 
start_secondary+0x1ba/0x230
2017-06-28T02:18:16.536488+08:00[880983.563141] Code: 45 a8 41 89 fb 41 83 e3 
3f 45 89 da 0f 1f 80 00 00 00 00 49 63 f2 48 89 ca 48 c1 e6 04 4c 01 ee 48 8b 
06 48 39 f0 74 2c 0f 1f 00  40 18 01 48 89 ca 75 18 48 8b 50 10 41 b9 01 00 
00 00 49 89 
2017-06-28T02:18:16.543574+08:00[880983.570229] RIP  [] 
get_next_timer_interrupt+0x1c0/0x270
2017-06-28T02:18:16.545377+08:00[880983.572032]  RSP 


2)
[ 6401.956939] BUG: unable to handle kernel NULL pointer dereference at 
0001
[ 6401.958982] IP: [] seq_escape+0x19/0x120
[ 6401.960325] PGD 5c4d3067 PUD 5c4d2067 PMD 0 
[ 6401.961458] Oops:  [#1] SMP 
[ 6401.962347] Modules linked in: klp_0001_tty_io_c(OE) zram(C) veth loop 
regmap_i2c rfkill binfmt_misc ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat 
nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 xt_addrtype iptable_filter 
xt_conntrack nf_nat nf_conntrack bridge stp llc overlay() rtos_kbox_panic(OE) 
signo_catch(O) coretemp vmwgfx ttm crc32_pclmul drm_kms_helper 
ghash_clmulni_intel aesni_intel drm lrw gf128mul glue_helper ppdev ablk_helper 
cryptd pcspkr serio_raw vmw_balloon sg parport_pc parport i2c_piix4 floppy 
vmw_vmci i2c_core shpchp pata_acpi dm_mod sha512_generic ip_tables sr_mod cdrom 
ata_generic sd_mod crc_t10dif crct10dif_generic crct10dif_pclmul 
crct10dif_common crc32c_intel ata_piix vmxnet3 libata vmw_pvscsi ext4 mbcache 

centos 7.2, I got some oops from my production line

2017-06-29 Thread Xishi Qiu
centos 7.2, I got some oops from my production line,
Anybody has seen these errors before?


1)
2017-06-28T02:18:16.461384+08:00[880983.488036] do nothing after die!
2017-06-28T02:18:16.462068+08:00[880983.488723] Modules linked in: fuse 
iptable_filter sha512_generic icp_qa_al_vf(OVE) vfat fat isofs ext4 jbd2 xfs 
libcrc32c kboxdriver(O) ipmi_devintf ipmi_si ipmi_msghandler kbox(O) 
signo_catch(O) mlx4_core(OVE) compat(OVE) ppdev crc32_pclmul 
ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd 
pcspkr parport_pc i2c_piix4 parport i2c_core ip_tables ext3 mbcache jbd 
ata_generic pata_acpi virtio_console(OVE) virtio_balloon(OVE) virtio_blk(OVE) 
virtio_net(OVE) kvm_ivshmem(OVE) crct10dif_pclmul crct10dif_common crc32c_intel 
serio_raw ata_piix pv_channel(OVE) virtio_pci(OVE) virtio_ring(OVE) libata 
virtio(OVE) floppy bonding
2017-06-28T02:18:16.473941+08:00[880983.500597] CPU: 1 PID: 0 Comm: swapper/1 
Tainted: G   OE  V---   3.10.0-327.44.58.28.x86_64 #1
2017-06-28T02:18:16.475784+08:00[880983.502440] Hardware name: QEMU Standard PC 
(i440FX + PIIX, 1996), BIOS 
rel-1.9.1-0-gb3ef39f-20170515_131417-build9a64a246a230 04/01/2014
2017-06-28T02:18:16.477993+08:00[880983.504643] task: 880138b62280 ti: 
880138b74000 task.ti: 880138b74000
2017-06-28T02:18:16.486759+08:00[880983.513413] RIP: 0010:[]  
[] get_next_timer_interrupt+0x1c0/0x270
2017-06-28T02:18:16.490230+08:00[880983.516874] RSP: 0018:880138b77dd8  
EFLAGS: 00010093
2017-06-28T02:18:16.492066+08:00[880983.518720] RAX: d8fe6d5832a10700 RBX: 
0003213fc510c9c0 RCX: 30303031
2017-06-28T02:18:16.494646+08:00[880983.521299] RDX: 30303031 RSI: 
880138ba5298 RDI: 01347e27
2017-06-28T02:18:16.497232+08:00[880983.523877] RBP: 880138b77e30 R08: 
fffefbb208f1 R09: 0001
2017-06-28T02:18:16.499848+08:00[880983.526500] R10: 0027 R11: 
0027 R12: 0001347e26d0
2017-06-28T02:18:16.502272+08:00[880983.528917] R13: 880138ba5028 R14: 
880138ba4000 R15: 880138b77de8
2017-06-28T02:18:16.504845+08:00[880983.531498] FS:  () 
GS:88013ed0() knlGS:
2017-06-28T02:18:16.507583+08:00[880983.534237] CS:  0010 DS:  ES:  
CR0: 80050033
2017-06-28T02:18:16.509242+08:00[880983.535898] CR2: 0ad7a168 CR3: 
00013587f000 CR4: 001407e0
2017-06-28T02:18:16.511736+08:00[880983.538391] DR0:  DR1: 
 DR2: 
2017-06-28T02:18:16.514088+08:00[880983.540742] DR3:  DR6: 
0ff0 DR7: 0400
2017-06-28T02:18:16.516363+08:00[880983.543019] Stack:
2017-06-28T02:18:16.517237+08:00[880983.543895]  880138b77e00 
880138ba5028 880138ba5428 880138ba5828
2017-06-28T02:18:16.519793+08:00[880983.546451]  880138ba5c28 
cdf899bbcb229fc8 88013ed0fbc0 0003213fc510c9c0
2017-06-28T02:18:16.522327+08:00[880983.548982]  0001 
88013ed0cf00 0001347e26d0 880138b77e88
2017-06-28T02:18:16.525118+08:00[880983.551772] Call Trace:
2017-06-28T02:18:16.526205+08:00[880983.552858]  [] 
tick_nohz_stop_sched_tick+0x1e8/0x2e0
2017-06-28T02:18:16.528063+08:00[880983.554716]  [] ? 
kvm_sched_clock_read+0x1f/0x30
2017-06-28T02:18:16.529842+08:00[880983.556496]  [] 
__tick_nohz_idle_enter+0x9e/0x150
2017-06-28T02:18:16.531547+08:00[880983.558199]  [] 
tick_nohz_idle_enter+0x3d/0x70
2017-06-28T02:18:16.533235+08:00[880983.559890]  [] 
cpu_startup_entry+0x9e/0x290
2017-06-28T02:18:16.534843+08:00[880983.561498]  [] 
start_secondary+0x1ba/0x230
2017-06-28T02:18:16.536488+08:00[880983.563141] Code: 45 a8 41 89 fb 41 83 e3 
3f 45 89 da 0f 1f 80 00 00 00 00 49 63 f2 48 89 ca 48 c1 e6 04 4c 01 ee 48 8b 
06 48 39 f0 74 2c 0f 1f 00  40 18 01 48 89 ca 75 18 48 8b 50 10 41 b9 01 00 
00 00 49 89 
2017-06-28T02:18:16.543574+08:00[880983.570229] RIP  [] 
get_next_timer_interrupt+0x1c0/0x270
2017-06-28T02:18:16.545377+08:00[880983.572032]  RSP 


2)
[ 6401.956939] BUG: unable to handle kernel NULL pointer dereference at 
0001
[ 6401.958982] IP: [] seq_escape+0x19/0x120
[ 6401.960325] PGD 5c4d3067 PUD 5c4d2067 PMD 0 
[ 6401.961458] Oops:  [#1] SMP 
[ 6401.962347] Modules linked in: klp_0001_tty_io_c(OE) zram(C) veth loop 
regmap_i2c rfkill binfmt_misc ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat 
nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 xt_addrtype iptable_filter 
xt_conntrack nf_nat nf_conntrack bridge stp llc overlay() rtos_kbox_panic(OE) 
signo_catch(O) coretemp vmwgfx ttm crc32_pclmul drm_kms_helper 
ghash_clmulni_intel aesni_intel drm lrw gf128mul glue_helper ppdev ablk_helper 
cryptd pcspkr serio_raw vmw_balloon sg parport_pc parport i2c_piix4 floppy 
vmw_vmci i2c_core shpchp pata_acpi dm_mod sha512_generic ip_tables sr_mod cdrom 
ata_generic sd_mod crc_t10dif crct10dif_generic crct10dif_pclmul 
crct10dif_common crc32c_intel ata_piix vmxnet3 libata vmw_pvscsi ext4 mbcache 

Re: [RFC] memory corruption caused by efi driver?

2017-06-25 Thread Xishi Qiu
On 2017/6/24 19:12, Greg KH wrote:

> On Sat, Jun 24, 2017 at 05:52:23PM +0800, Yisheng Xie wrote:
>> hi all,
>>
>> I met an Oops problem with linux-3.10. The RIP is sysfs_open_file+0x46/0x2b0 
>> (I will and the full
>> crash log in the end of this mail).
> 
> 3.10 is _very_ old and obsolete, can you duplicate this on a modern
> kernel, like 4.11?
> 
> thanks,
> 
> greg k-h
> 
> .
> 

Hi, if I disable CONFIG_EFI_VARS, it seems OK now.

And I can't reproduce the problem on mainline (v4.12).

Here is my test, run some stress test, then
cat /sys/firmware/efi/efivars/*
or
cat /sys/firmware/efi/vars/*/*

1) 3.10, get warning
CONFIG_EFI_VARS=y
CONFIG_EFIVAR_FS=y

2) 3.10, get warning
CONFIG_EFI_VARS=y
CONFIG_EFIVAR_FS=n

3) 3.10, ok
CONFIG_EFI_VARS=n
CONFIG_EFIVAR_FS=y

4) mainline, ok
CONFIG_EFI_VARS=y
CONFIG_EFIVAR_FS=y

log:
[78872.389117] WARNING: at fs/sysfs/file.c:343 sysfs_open_file+0x222/0x2b0()
[78872.389118] missing sysfs attribute operations for kobject: (null)
[78872.389177] Modules linked in: gen_timer(OVE) tun zram(C) ext4 jbd2 mbcache 
loop regmap_i2c binfmt_misc scsi_transport_iscsi cfg80211 ip6t_rpfilter 
ip6t_REJECT ipt_REJECT xt_conntrack rfk
ill ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables 
ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle 
ip6table_security ip6table_raw ip6table_filter
 ip6_tables iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat 
nf_conntrack iptable_mangle iptable_security iptable_raw iptable_filter 
ip_tables sg iTCO_wdt ipmi_devintf iTCO_ve
ndor_support vfat fat intel_powerclamp coretemp kvm_intel kvm nfsd 
crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel ipmi_ssif 
aesni_intel lrw gf128mul auth_rpcgss glue_helper a
blk_helper i7core_edac nfs_acl cryptd lpc_ich pcspkr
[78872.389197]  ipmi_si i2c_i801 edac_core shpchp mfd_core lockd 
ipmi_msghandler acpi_cpufreq grace sunrpc uinput xfs libcrc32c sd_mod sr_mod 
crc_t10dif cdrom crct10dif_common ixgbe igb ahci
 mdio libahci ptp i2c_algo_bit pps_core libata i2c_core megaraid_sas dca 
dm_mirror dm_region_hash dm_log dm_mod [last unloaded: gen_timer]
[78872.389202] CPU: 52 PID: 28434 Comm: cat Tainted: GWC OE  
V---   3.10.0-327.55.58.81.x86_64 #2
[78872.389204] Hardware name: HUAWEI TECHNOLOGIES CO.,LTD. Tecal RH5885 
V2/CH91RGPUC, BIOS RGPUC-BIOS-V058 06/23/2013
[78872.389207]  88200a61fc10 df10e27d 88200a61fbc8 
8163ed14
[78872.389208]  88200a61fc00 8107b300 fff3 
88103f6473a0
[78872.389209]  8880236cb700 88103f6473a0 8860281d8838 
88200a61fc68
[78872.389210] Call Trace:
[78872.389224]  [] dump_stack+0x19/0x1b
[78872.389233]  [] warn_slowpath_common+0x70/0xb0
[78872.389234]  [] warn_slowpath_fmt+0x5c/0x80
[78872.389236]  [] sysfs_open_file+0x222/0x2b0
[78872.389242]  [] do_dentry_open+0x1a7/0x2e0
[78872.389244]  [] ? sysfs_schedule_callback+0x1c0/0x1c0
[78872.389245]  [] vfs_open+0x39/0x70
[78872.389251]  [] do_last+0x1ed/0x12a0
[78872.389259]  [] ? kmem_cache_alloc_trace+0x1ce/0x1f0
[78872.389261]  [] path_openat+0xc2/0x490
[78872.389267]  [] ? call_rcu_sched+0x1d/0x20
[78872.389275]  [] ? shmem_destroy_inode+0x2d/0x40
[78872.389281]  [] ? evict+0x106/0x170
[78872.389283]  [] do_filp_open+0x4b/0xb0
[78872.389286]  [] ? __alloc_fd+0xa7/0x130
[78872.389290]  [] do_sys_open+0xf3/0x1f0
[78872.389291]  [] SyS_open+0x1e/0x20
[78872.389297]  [] system_call_fastpath+0x16/0x1b
[78872.389298] ---[ end trace cbe34632be0fdedf ]---
[78872.390067] [ cut here ]



Re: [RFC] memory corruption caused by efi driver?

2017-06-25 Thread Xishi Qiu
On 2017/6/24 19:12, Greg KH wrote:

> On Sat, Jun 24, 2017 at 05:52:23PM +0800, Yisheng Xie wrote:
>> hi all,
>>
>> I met an Oops problem with linux-3.10. The RIP is sysfs_open_file+0x46/0x2b0 
>> (I will and the full
>> crash log in the end of this mail).
> 
> 3.10 is _very_ old and obsolete, can you duplicate this on a modern
> kernel, like 4.11?
> 
> thanks,
> 
> greg k-h
> 
> .
> 

Hi, if I disable CONFIG_EFI_VARS, it seems OK now.

And I can't reproduce the problem on mainline (v4.12).

Here is my test, run some stress test, then
cat /sys/firmware/efi/efivars/*
or
cat /sys/firmware/efi/vars/*/*

1) 3.10, get warning
CONFIG_EFI_VARS=y
CONFIG_EFIVAR_FS=y

2) 3.10, get warning
CONFIG_EFI_VARS=y
CONFIG_EFIVAR_FS=n

3) 3.10, ok
CONFIG_EFI_VARS=n
CONFIG_EFIVAR_FS=y

4) mainline, ok
CONFIG_EFI_VARS=y
CONFIG_EFIVAR_FS=y

log:
[78872.389117] WARNING: at fs/sysfs/file.c:343 sysfs_open_file+0x222/0x2b0()
[78872.389118] missing sysfs attribute operations for kobject: (null)
[78872.389177] Modules linked in: gen_timer(OVE) tun zram(C) ext4 jbd2 mbcache 
loop regmap_i2c binfmt_misc scsi_transport_iscsi cfg80211 ip6t_rpfilter 
ip6t_REJECT ipt_REJECT xt_conntrack rfk
ill ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables 
ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle 
ip6table_security ip6table_raw ip6table_filter
 ip6_tables iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat 
nf_conntrack iptable_mangle iptable_security iptable_raw iptable_filter 
ip_tables sg iTCO_wdt ipmi_devintf iTCO_ve
ndor_support vfat fat intel_powerclamp coretemp kvm_intel kvm nfsd 
crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel ipmi_ssif 
aesni_intel lrw gf128mul auth_rpcgss glue_helper a
blk_helper i7core_edac nfs_acl cryptd lpc_ich pcspkr
[78872.389197]  ipmi_si i2c_i801 edac_core shpchp mfd_core lockd 
ipmi_msghandler acpi_cpufreq grace sunrpc uinput xfs libcrc32c sd_mod sr_mod 
crc_t10dif cdrom crct10dif_common ixgbe igb ahci
 mdio libahci ptp i2c_algo_bit pps_core libata i2c_core megaraid_sas dca 
dm_mirror dm_region_hash dm_log dm_mod [last unloaded: gen_timer]
[78872.389202] CPU: 52 PID: 28434 Comm: cat Tainted: GWC OE  
V---   3.10.0-327.55.58.81.x86_64 #2
[78872.389204] Hardware name: HUAWEI TECHNOLOGIES CO.,LTD. Tecal RH5885 
V2/CH91RGPUC, BIOS RGPUC-BIOS-V058 06/23/2013
[78872.389207]  88200a61fc10 df10e27d 88200a61fbc8 
8163ed14
[78872.389208]  88200a61fc00 8107b300 fff3 
88103f6473a0
[78872.389209]  8880236cb700 88103f6473a0 8860281d8838 
88200a61fc68
[78872.389210] Call Trace:
[78872.389224]  [] dump_stack+0x19/0x1b
[78872.389233]  [] warn_slowpath_common+0x70/0xb0
[78872.389234]  [] warn_slowpath_fmt+0x5c/0x80
[78872.389236]  [] sysfs_open_file+0x222/0x2b0
[78872.389242]  [] do_dentry_open+0x1a7/0x2e0
[78872.389244]  [] ? sysfs_schedule_callback+0x1c0/0x1c0
[78872.389245]  [] vfs_open+0x39/0x70
[78872.389251]  [] do_last+0x1ed/0x12a0
[78872.389259]  [] ? kmem_cache_alloc_trace+0x1ce/0x1f0
[78872.389261]  [] path_openat+0xc2/0x490
[78872.389267]  [] ? call_rcu_sched+0x1d/0x20
[78872.389275]  [] ? shmem_destroy_inode+0x2d/0x40
[78872.389281]  [] ? evict+0x106/0x170
[78872.389283]  [] do_filp_open+0x4b/0xb0
[78872.389286]  [] ? __alloc_fd+0xa7/0x130
[78872.389290]  [] do_sys_open+0xf3/0x1f0
[78872.389291]  [] SyS_open+0x1e/0x20
[78872.389297]  [] system_call_fastpath+0x16/0x1b
[78872.389298] ---[ end trace cbe34632be0fdedf ]---
[78872.390067] [ cut here ]



Re: mm, something wring in page_lock_anon_vma_read()?

2017-06-08 Thread Xishi Qiu
On 2017/5/23 17:33, Vlastimil Babka wrote:

> On 05/23/2017 11:21 AM, zhong jiang wrote:
>> On 2017/5/23 0:51, Vlastimil Babka wrote:
>>> On 05/20/2017 05:01 AM, zhong jiang wrote:
>>>> On 2017/5/20 10:40, Hugh Dickins wrote:
>>>>> On Sat, 20 May 2017, Xishi Qiu wrote:
>>>>>> Here is a bug report from redhat: 
>>>>>> https://bugzilla.redhat.com/show_bug.cgi?id=1305620
>>>>>> And I meet the bug too. However it is hard to reproduce, and 
>>>>>> 624483f3ea82598("mm: rmap: fix use-after-free in __put_anon_vma") is not 
>>>>>> help.
>>>>>>
>>>>>> From the vmcore, it seems that the page is still mapped(_mapcount=0 and 
>>>>>> _count=2),
>>>>>> and the value of mapping is a valid address(mapping = 
>>>>>> 0x8801b3e2a101),
>>>>>> but anon_vma has been corrupted.
>>>>>>
>>>>>> Any ideas?
>>>>> Sorry, no.  I assume that _mapcount has been misaccounted, for example
>>>>> a pte mapped in on top of another pte; but cannot begin to tell you where
>>>>> in Red Hat's kernel-3.10.0-229.4.2.el7 that might happen.
>>>>>
>>>>> Hugh
>>>>>
>>>>> .
>>>>>
>>>> Hi, Hugh
>>>>
>>>> I find the following message from the dmesg.
>>>>
>>>> [26068.316592] BUG: Bad rss-counter state mm:8800a7de2d80 idx:1 val:1
>>>>
>>>> I can prove that the __mapcount is misaccount.  when task is exited. the 
>>>> rmap
>>>> still exist.
>>> Check if the kernel in question contains this commit: ad33bb04b2a6 ("mm:
>>> thp: fix SMP race condition between THP page fault and MADV_DONTNEED")
>>   HI, Vlastimil
>>  
>>   I miss the patch.
> 
> Try applying it then, there's good chance the error and crash will go
> away. Even if your workload doesn't actually run any madvise(MADV_DONTNEED).
> 

Hi Vlastimil,

I find this error was reported by Kirill as following, right?
https://patchwork.kernel.org/patch/7550401/

The call trace is quite like the same as ours.

Thanks,
Xishi Qiu

>> when I read the patch. I find the following issue. but I am sure it is right.
>>
>>   if (unlikely(pmd_trans_unstable(pmd)))
>> return 0;
>> /*
>>  * A regular pmd is established and it can't morph into a huge pmd
>>  * from under us anymore at this point because we hold the mmap_sem
>>  * read mode and khugepaged takes it in write mode. So now it's
>>  * safe to run pte_offset_map().
>>  */
>> pte = pte_offset_map(pmd, address);
>>
>>   after pmd_trans_unstable call,  without any protect method.  by the 
>> comments,
>>   it think the pte_offset_map is safe.before pte_offset_map call, it 
>> still may be
>>   unstable. it is possible?
> 
> IIRC it's "unstable" wrt possible none->huge->none transition. But once
> we've seen it's a regular pmd via pmd_trans_unstable(), we're safe as a
> transition from regular pmd can't happen.
> 
>>   Thanks
>> zhongjiang
>>>> Thanks
>>>> zhongjiang
>>>>
>>>> --
>>>> To unsubscribe, send a message with 'unsubscribe linux-mm' in
>>>> the body to majord...@kvack.org.  For more info on Linux MM,
>>>> see: http://www.linux-mm.org/ .
>>>> Don't email: em...@kvack.org 
>>>>
>>>
>>> .
>>>
>>
>>
>> --
>> To unsubscribe, send a message with 'unsubscribe linux-mm' in
>> the body to majord...@kvack.org.  For more info on Linux MM,
>> see: http://www.linux-mm.org/ .
>> Don't email: mailto:"d...@kvack.org;> em...@kvack.org 
>>
> 
> 
> .
> 





Re: mm, something wring in page_lock_anon_vma_read()?

2017-06-08 Thread Xishi Qiu
On 2017/5/23 17:33, Vlastimil Babka wrote:

> On 05/23/2017 11:21 AM, zhong jiang wrote:
>> On 2017/5/23 0:51, Vlastimil Babka wrote:
>>> On 05/20/2017 05:01 AM, zhong jiang wrote:
>>>> On 2017/5/20 10:40, Hugh Dickins wrote:
>>>>> On Sat, 20 May 2017, Xishi Qiu wrote:
>>>>>> Here is a bug report from redhat: 
>>>>>> https://bugzilla.redhat.com/show_bug.cgi?id=1305620
>>>>>> And I meet the bug too. However it is hard to reproduce, and 
>>>>>> 624483f3ea82598("mm: rmap: fix use-after-free in __put_anon_vma") is not 
>>>>>> help.
>>>>>>
>>>>>> From the vmcore, it seems that the page is still mapped(_mapcount=0 and 
>>>>>> _count=2),
>>>>>> and the value of mapping is a valid address(mapping = 
>>>>>> 0x8801b3e2a101),
>>>>>> but anon_vma has been corrupted.
>>>>>>
>>>>>> Any ideas?
>>>>> Sorry, no.  I assume that _mapcount has been misaccounted, for example
>>>>> a pte mapped in on top of another pte; but cannot begin to tell you where
>>>>> in Red Hat's kernel-3.10.0-229.4.2.el7 that might happen.
>>>>>
>>>>> Hugh
>>>>>
>>>>> .
>>>>>
>>>> Hi, Hugh
>>>>
>>>> I find the following message from the dmesg.
>>>>
>>>> [26068.316592] BUG: Bad rss-counter state mm:8800a7de2d80 idx:1 val:1
>>>>
>>>> I can prove that the __mapcount is misaccount.  when task is exited. the 
>>>> rmap
>>>> still exist.
>>> Check if the kernel in question contains this commit: ad33bb04b2a6 ("mm:
>>> thp: fix SMP race condition between THP page fault and MADV_DONTNEED")
>>   HI, Vlastimil
>>  
>>   I miss the patch.
> 
> Try applying it then, there's good chance the error and crash will go
> away. Even if your workload doesn't actually run any madvise(MADV_DONTNEED).
> 

Hi Vlastimil,

I find this error was reported by Kirill as following, right?
https://patchwork.kernel.org/patch/7550401/

The call trace is quite like the same as ours.

Thanks,
Xishi Qiu

>> when I read the patch. I find the following issue. but I am sure it is right.
>>
>>   if (unlikely(pmd_trans_unstable(pmd)))
>> return 0;
>> /*
>>  * A regular pmd is established and it can't morph into a huge pmd
>>  * from under us anymore at this point because we hold the mmap_sem
>>  * read mode and khugepaged takes it in write mode. So now it's
>>  * safe to run pte_offset_map().
>>  */
>> pte = pte_offset_map(pmd, address);
>>
>>   after pmd_trans_unstable call,  without any protect method.  by the 
>> comments,
>>   it think the pte_offset_map is safe.before pte_offset_map call, it 
>> still may be
>>   unstable. it is possible?
> 
> IIRC it's "unstable" wrt possible none->huge->none transition. But once
> we've seen it's a regular pmd via pmd_trans_unstable(), we're safe as a
> transition from regular pmd can't happen.
> 
>>   Thanks
>> zhongjiang
>>>> Thanks
>>>> zhongjiang
>>>>
>>>> --
>>>> To unsubscribe, send a message with 'unsubscribe linux-mm' in
>>>> the body to majord...@kvack.org.  For more info on Linux MM,
>>>> see: http://www.linux-mm.org/ .
>>>> Don't email: mailto:"d...@kvack.org;> em...@kvack.org 
>>>>
>>>
>>> .
>>>
>>
>>
>> --
>> To unsubscribe, send a message with 'unsubscribe linux-mm' in
>> the body to majord...@kvack.org.  For more info on Linux MM,
>> see: http://www.linux-mm.org/ .
>> Don't email: mailto:"d...@kvack.org;> em...@kvack.org 
>>
> 
> 
> .
> 





Re: [RFC] ubsan: signed integer overflow in setitimer()

2017-06-06 Thread Xishi Qiu
On 2017/6/4 23:06, Thomas Gleixner wrote:

> On Thu, 1 Jun 2017, Xishi Qiu wrote:
> 
> Cc'ed John Stultz
> 
>> Hi, this is the test case, and then I got ubsan error
>> (signed integer overflow) report, so the root cause is from
>> user or kernel? Shall we change something in timeval_valid()?
>>
>>
>> struct itimerval new_value;
>> int ret;
>>
>> new_value.it_interval.tv_sec = 140673496649799L;
>> new_value.it_interval.tv_usec = 6;
>> new_value.it_value.tv_sec = 140673496649807L;
>> new_value.it_value.tv_usec = 5;
>>
>> ret = setitimer(ITIMER_VIRTUAL, &new_value, NULL);
>>
>>
>> [  533.326588] 
>> 
>> [  533.335346] UBSAN: Undefined behaviour in ./include/linux/time.h:239:27
>> [  533.342155] signed integer overflow:
>> [  533.345837] 140673496649807 * 10 cannot be represented in type 
>> 'long int'
>> [  533.422181]  set_cpu_itimer+0x49c/0x540
>> [  533.442127]  do_setitimer+0xe1/0x540
> 
> We need a similar clamping of the conversion as we have for
> timespec/val_to_ktime(). I'll have a look in the next days unless John
> beats me to it.
> 

Hi Thomas, anything new?

Thanks,
Xishi Qiu

> Thanks,
> 
>   tglx
> 
> .
> 





Re: [RFC] ubsan: signed integer overflow in setitimer()

2017-06-06 Thread Xishi Qiu
On 2017/6/4 23:06, Thomas Gleixner wrote:

> On Thu, 1 Jun 2017, Xishi Qiu wrote:
> 
> Cc'ed John Stultz
> 
>> Hi, this is the test case, and then I got ubsan error
>> (signed integer overflow) report, so the root cause is from
>> user or kernel? Shall we change something in timeval_valid()?
>>
>>
>> struct itimerval new_value;
>> int ret;
>>
>> new_value.it_interval.tv_sec = 140673496649799L;
>> new_value.it_interval.tv_usec = 6;
>> new_value.it_value.tv_sec = 140673496649807L;
>> new_value.it_value.tv_usec = 5;
>>
>> ret = setitimer(ITIMER_VIRTUAL, &new_value, NULL);
>>
>>
>> [  533.326588] 
>> 
>> [  533.335346] UBSAN: Undefined behaviour in ./include/linux/time.h:239:27
>> [  533.342155] signed integer overflow:
>> [  533.345837] 140673496649807 * 10 cannot be represented in type 
>> 'long int'
>> [  533.422181]  set_cpu_itimer+0x49c/0x540
>> [  533.442127]  do_setitimer+0xe1/0x540
> 
> We need a similar clamping of the conversion as we have for
> timespec/val_to_ktime(). I'll have a look in the next days unless John
> beats me to it.
> 

Hi Thomas, anything new?

Thanks,
Xishi Qiu

> Thanks,
> 
>   tglx
> 
> .
> 





ubsan: some error report during boot

2017-06-01 Thread Xishi Qiu
I got some error report during boot from ubsan,
kernel version is v4.12


[0.001000] 

[0.001000] UBSAN: Undefined behaviour in 
arch/x86/kernel/apic/apic_flat_64.c:49:11
[0.001000] shift exponent 64 is too large for 64-bit type 'long unsigned 
int'
[0.001000] CPU: 64 PID: 0 Comm: swapper/64 Not tainted 
4.12.0-rc3-327.44.58.18.x86_64+ #30
[0.001000] Hardware name: Huawei Technologies Co., Ltd. RH8100 V3/BC61PBIA, 
BIOS BLHSV028 11/11/2014
[0.001000] Call Trace:
[0.001000]  dump_stack+0xbc/0x124
[0.001000]  ? _atomic_dec_and_lock+0x14c/0x14c
[0.001000]  ubsan_epilogue+0xd/0x4e
[0.001000]  __ubsan_handle_shift_out_of_bounds+0x1fb/0x254
[0.001000]  ? __ubsan_handle_load_invalid_value+0x15b/0x15b
[0.001000]  ? set_pte_vaddr+0x5f/0x90
[0.001000]  ? __native_set_fixmap+0x29/0x70
[0.001000]  ? native_set_fixmap+0x60/0x70
[0.001000]  ? cpu_init+0x537/0xac0
[0.001000]  flat_init_apic_ldr+0xda/0x120
[0.001000]  ? flat_init_apic_ldr+0xda/0x120
[0.001000]  setup_local_APIC+0xef/0x810
[0.001000]  apic_ap_setup+0xe/0x20
[0.001000]  start_secondary+0x112/0x370
[0.001000]  ? set_cpu_sibling_map+0x1970/0x1970
[0.001000]  secondary_startup_64+0x9f/0x9f
[0.001000] 




[   47.146515] 

[   47.146519] UBSAN: Undefined behaviour in 
drivers/scsi/megaraid/megaraid_sas_fp.c:127:32
[   47.146522] index 255 is out of range for type 'MR_LD_SPAN_MAP [1]'
[   47.146529] CPU: 193 PID: 2435 Comm: systemd-udevd Not tainted 
4.12.0-rc3-327.44.58.18.x86_64+ #30
[   47.146531] Hardware name: Huawei Technologies Co., Ltd. RH8100 V3/BC61PBIA, 
BIOS BLHSV028 11/11/2014
[   47.146534] Call Trace:
[   47.146554]  dump_stack+0xbc/0x124
[   47.146559]  ? _atomic_dec_and_lock+0x14c/0x14c
[   47.146569]  ubsan_epilogue+0xd/0x4e
[   47.146574]  __ubsan_handle_out_of_bounds+0x106/0x14d
[   47.146578]  ? __ubsan_handle_shift_out_of_bounds+0x254/0x254
[   47.146586]  ? usleep_range+0x110/0x110
[   47.146596]  ? memcpy+0x45/0x50
[   47.146614]  ? MR_PopulateDrvRaidMap+0x5d5/0x820 [megaraid_sas]
[   47.146630]  mr_update_load_balance_params+0x165/0x1c0 [megaraid_sas]
[   47.146645]  MR_ValidateMapInfo+0x218/0xc60 [megaraid_sas]
[   47.146661]  ? megasas_get_ctrl_info+0x3f1/0xdb0 [megaraid_sas]
[   47.146668]  ? __list_del_entry_valid+0x77/0x130
[   47.146683]  ? megasas_return_cmd+0xa5/0x3c0 [megaraid_sas]
[   47.146697]  ? megasas_get_cmd+0x2e0/0x2e0 [megaraid_sas]
[   47.146712]  ? wait_and_poll+0x7e/0x160 [megaraid_sas]
[   47.146727]  megasas_get_map_info+0x26d/0x380 [megaraid_sas]
[   47.146742]  megasas_init_adapter_fusion+0xaa4/0xe50 [megaraid_sas]
[   47.146757]  ? megasas_sync_map_info+0x430/0x430 [megaraid_sas]
[   47.146761]  ? kasan_kmalloc+0xad/0xe0
[   47.146765]  ? kmem_cache_alloc_trace+0x11a/0x320
[   47.146779]  ? megasas_init_fw+0x475/0x1b50 [megaraid_sas]
[   47.146794]  megasas_init_fw+0x500/0x1b50 [megaraid_sas]
[   47.146808]  ? megasas_resume+0x9c0/0x9c0 [megaraid_sas]
[   47.146819]  ? dma_generic_alloc_coherent+0x1a5/0x230
[   47.146825]  ? __init_waitqueue_head+0x9d/0x100
[   47.146828]  ? put_prev_task_stop+0x310/0x310
[   47.146843]  megasas_probe_one.part.19+0xfe2/0x2890 [megaraid_sas]
[   47.146857]  ? megasas_init_fw+0x1b50/0x1b50 [megaraid_sas]
[   47.146862]  ? save_stack_trace+0x1b/0x20
[   47.146866]  ? save_stack+0x46/0xd0
[   47.146869]  ? kasan_slab_free+0x70/0xc0
[   47.146872]  ? kfree+0xd8/0x350
[   47.146878]  ? acpi_pci_irq_enable+0x283/0x440
[   47.146885]  ? pcibios_enable_device+0x4a/0x60
[   47.146890]  ? do_pci_enable_device+0x164/0x240
[   47.146893]  ? pci_enable_device_flags+0x229/0x2d0
[   47.146896]  ? pci_enable_device_mem+0x13/0x20
[   47.146909]  ? megasas_probe_one+0x8e/0x150 [megaraid_sas]
[   47.146912]  ? local_pci_probe+0x95/0x120
[   47.146915]  ? pci_device_probe+0x2dc/0x3f0
[   47.146921]  ? driver_probe_device+0x36a/0x910
[   47.146924]  ? __driver_attach+0x139/0x180
[   47.146927]  ? bus_for_each_dev+0xfb/0x180
[   47.146929]  ? bus_add_driver+0x29b/0x4a0
...



ubsan: some error report during boot

2017-06-01 Thread Xishi Qiu
I got some error report during boot from ubsan,
kernel version is v4.12


[0.001000] 

[0.001000] UBSAN: Undefined behaviour in 
arch/x86/kernel/apic/apic_flat_64.c:49:11
[0.001000] shift exponent 64 is too large for 64-bit type 'long unsigned 
int'
[0.001000] CPU: 64 PID: 0 Comm: swapper/64 Not tainted 
4.12.0-rc3-327.44.58.18.x86_64+ #30
[0.001000] Hardware name: Huawei Technologies Co., Ltd. RH8100 V3/BC61PBIA, 
BIOS BLHSV028 11/11/2014
[0.001000] Call Trace:
[0.001000]  dump_stack+0xbc/0x124
[0.001000]  ? _atomic_dec_and_lock+0x14c/0x14c
[0.001000]  ubsan_epilogue+0xd/0x4e
[0.001000]  __ubsan_handle_shift_out_of_bounds+0x1fb/0x254
[0.001000]  ? __ubsan_handle_load_invalid_value+0x15b/0x15b
[0.001000]  ? set_pte_vaddr+0x5f/0x90
[0.001000]  ? __native_set_fixmap+0x29/0x70
[0.001000]  ? native_set_fixmap+0x60/0x70
[0.001000]  ? cpu_init+0x537/0xac0
[0.001000]  flat_init_apic_ldr+0xda/0x120
[0.001000]  ? flat_init_apic_ldr+0xda/0x120
[0.001000]  setup_local_APIC+0xef/0x810
[0.001000]  apic_ap_setup+0xe/0x20
[0.001000]  start_secondary+0x112/0x370
[0.001000]  ? set_cpu_sibling_map+0x1970/0x1970
[0.001000]  secondary_startup_64+0x9f/0x9f
[0.001000] 




[   47.146515] 

[   47.146519] UBSAN: Undefined behaviour in 
drivers/scsi/megaraid/megaraid_sas_fp.c:127:32
[   47.146522] index 255 is out of range for type 'MR_LD_SPAN_MAP [1]'
[   47.146529] CPU: 193 PID: 2435 Comm: systemd-udevd Not tainted 
4.12.0-rc3-327.44.58.18.x86_64+ #30
[   47.146531] Hardware name: Huawei Technologies Co., Ltd. RH8100 V3/BC61PBIA, 
BIOS BLHSV028 11/11/2014
[   47.146534] Call Trace:
[   47.146554]  dump_stack+0xbc/0x124
[   47.146559]  ? _atomic_dec_and_lock+0x14c/0x14c
[   47.146569]  ubsan_epilogue+0xd/0x4e
[   47.146574]  __ubsan_handle_out_of_bounds+0x106/0x14d
[   47.146578]  ? __ubsan_handle_shift_out_of_bounds+0x254/0x254
[   47.146586]  ? usleep_range+0x110/0x110
[   47.146596]  ? memcpy+0x45/0x50
[   47.146614]  ? MR_PopulateDrvRaidMap+0x5d5/0x820 [megaraid_sas]
[   47.146630]  mr_update_load_balance_params+0x165/0x1c0 [megaraid_sas]
[   47.146645]  MR_ValidateMapInfo+0x218/0xc60 [megaraid_sas]
[   47.146661]  ? megasas_get_ctrl_info+0x3f1/0xdb0 [megaraid_sas]
[   47.146668]  ? __list_del_entry_valid+0x77/0x130
[   47.146683]  ? megasas_return_cmd+0xa5/0x3c0 [megaraid_sas]
[   47.146697]  ? megasas_get_cmd+0x2e0/0x2e0 [megaraid_sas]
[   47.146712]  ? wait_and_poll+0x7e/0x160 [megaraid_sas]
[   47.146727]  megasas_get_map_info+0x26d/0x380 [megaraid_sas]
[   47.146742]  megasas_init_adapter_fusion+0xaa4/0xe50 [megaraid_sas]
[   47.146757]  ? megasas_sync_map_info+0x430/0x430 [megaraid_sas]
[   47.146761]  ? kasan_kmalloc+0xad/0xe0
[   47.146765]  ? kmem_cache_alloc_trace+0x11a/0x320
[   47.146779]  ? megasas_init_fw+0x475/0x1b50 [megaraid_sas]
[   47.146794]  megasas_init_fw+0x500/0x1b50 [megaraid_sas]
[   47.146808]  ? megasas_resume+0x9c0/0x9c0 [megaraid_sas]
[   47.146819]  ? dma_generic_alloc_coherent+0x1a5/0x230
[   47.146825]  ? __init_waitqueue_head+0x9d/0x100
[   47.146828]  ? put_prev_task_stop+0x310/0x310
[   47.146843]  megasas_probe_one.part.19+0xfe2/0x2890 [megaraid_sas]
[   47.146857]  ? megasas_init_fw+0x1b50/0x1b50 [megaraid_sas]
[   47.146862]  ? save_stack_trace+0x1b/0x20
[   47.146866]  ? save_stack+0x46/0xd0
[   47.146869]  ? kasan_slab_free+0x70/0xc0
[   47.146872]  ? kfree+0xd8/0x350
[   47.146878]  ? acpi_pci_irq_enable+0x283/0x440
[   47.146885]  ? pcibios_enable_device+0x4a/0x60
[   47.146890]  ? do_pci_enable_device+0x164/0x240
[   47.146893]  ? pci_enable_device_flags+0x229/0x2d0
[   47.146896]  ? pci_enable_device_mem+0x13/0x20
[   47.146909]  ? megasas_probe_one+0x8e/0x150 [megaraid_sas]
[   47.146912]  ? local_pci_probe+0x95/0x120
[   47.146915]  ? pci_device_probe+0x2dc/0x3f0
[   47.146921]  ? driver_probe_device+0x36a/0x910
[   47.146924]  ? __driver_attach+0x139/0x180
[   47.146927]  ? bus_for_each_dev+0xfb/0x180
[   47.146929]  ? bus_add_driver+0x29b/0x4a0
...



[RFC] ubsan: signed integer overflow in setitimer()

2017-06-01 Thread Xishi Qiu
Hi, this is the test case, and then I got ubsan error
(signed integer overflow) report, so the root cause is from
user or kernel? Shall we change something in timeval_valid()?


struct itimerval new_value;
int ret;

new_value.it_interval.tv_sec = 140673496649799L;
new_value.it_interval.tv_usec = 6;
new_value.it_value.tv_sec = 140673496649807L;
new_value.it_value.tv_usec = 5;

ret = setitimer(ITIMER_VIRTUAL, &new_value, NULL);


[  533.326588] 

[  533.335346] UBSAN: Undefined behaviour in ./include/linux/time.h:239:27
[  533.342155] signed integer overflow:
[  533.345837] 140673496649807 * 1000000000 cannot be represented in type 'long 
int'
[  533.353540] CPU: 102 PID: 17797 Comm: test.exe Tainted: GB   
4.12.0-rc3-327.44.58.18.x86_64+ #30
[  533.363646] Hardware name: Huawei Technologies Co., Ltd. RH8100 V3/BC61PBIA, 
BIOS BLHSV028 11/11/2014
[  533.373130] Call Trace:
[  533.375670]  dump_stack+0xbc/0x124
[  533.379179]  ? _atomic_dec_and_lock+0x14c/0x14c
[  533.383850]  ubsan_epilogue+0xd/0x4e
[  533.387531]  handle_overflow+0x186/0x1d5
[  533.391571]  ? __ubsan_handle_negate_overflow+0x15b/0x15b
[  533.397133]  ? unlock_page+0x20/0x60
[  533.400815]  ? filemap_map_pages+0x3e8/0x820
[  533.405211]  ? read_cache_page_gfp+0x80/0x80
[  533.409613]  ? tty_ldisc_deref+0x28/0x40
[  533.413653]  ? tty_write+0x344/0x560
[  533.417335]  __ubsan_handle_mul_overflow+0xe/0x19
[  533.422181]  set_cpu_itimer+0x49c/0x540
[  533.426132]  ? get_cpu_itimer+0x290/0x290
[  533.430271]  ? __getnstimeofday64+0x14c/0x210
[  533.434762]  ? __pmd_alloc+0x1b0/0x1b0
[  533.438623]  ? ktime_get+0xd0/0xd0
[  533.442127]  do_setitimer+0xe1/0x540
[  533.445817]  ? __audit_syscall_entry+0x1cd/0x250
[  533.450571]  SyS_setitimer+0x1d6/0x210
[  533.454431]  ? SyS_alarm+0x150/0x150
[  533.463453]  ? trace_event_raw_event_sys_enter+0x590/0x590
[  533.474508]  ? handle_mm_fault+0x15a/0x530
[  533.484144]  ? __do_page_fault+0x3c9/0x740
[  533.493712]  ? SyS_alarm+0x150/0x150
[  533.502553]  do_syscall_64+0xf5/0x2a0
[  533.511294]  ? do_syscall_64+0xf5/0x2a0
[  533.520031]  entry_SYSCALL64_slow_path+0x25/0x25
[  533.529410] RIP: 0033:0x7f32bbd71c67
[  533.537565] RSP: 002b:7ffe9b18df08 EFLAGS: 0202 ORIG_RAX: 
0026
[  533.549766] RAX: ffda RBX:  RCX: 7f32bbd71c67
[  533.561530] RDX:  RSI: 7ffe9b18df30 RDI: 0001
[  533.573237] RBP: 7ffe9b18df60 R08: 7f32bbd09d38 R09: 0012
[  533.584794] R10: 7ffe9b18dc90 R11: 0202 R12: 00400490
[  533.596218] R13: 7ffe9b18e040 R14:  R15: 
[  533.607450] 

[  533.620059] 

[  533.632565] UBSAN: Undefined behaviour in ./include/linux/time.h:239:27
[  533.643357] signed integer overflow:
[  533.651093] 140673496649799 * 10 cannot be represented in type 'long 
int'
[  533.662927] CPU: 102 PID: 17797 Comm: test.exe Tainted: GB   
4.12.0-rc3-327.44.58.18.x86_64+ #30
[  533.681691] Hardware name: Huawei Technologies Co., Ltd. RH8100 V3/BC61PBIA, 
BIOS BLHSV028 11/11/2014
[  533.700314] Call Trace:
[  533.707469]  dump_stack+0xbc/0x124
[  533.715588]  ? _atomic_dec_and_lock+0x14c/0x14c
[  533.724911]  ubsan_epilogue+0xd/0x4e
[  533.733281]  handle_overflow+0x186/0x1d5
[  533.741952]  ? __ubsan_handle_negate_overflow+0x15b/0x15b
[  533.752190]  ? unlock_page+0x20/0x60
[  533.760592]  ? filemap_map_pages+0x3e8/0x820
[  533.769709]  ? read_cache_page_gfp+0x80/0x80
[  533.778844]  ? tty_ldisc_deref+0x28/0x40
[  533.787641]  ? tty_write+0x344/0x560
[  533.796126]  __ubsan_handle_mul_overflow+0xe/0x19
[  533.805696]  set_cpu_itimer+0x438/0x540
[  533.814412]  ? get_cpu_itimer+0x290/0x290
[  533.823259]  ? __getnstimeofday64+0x14c/0x210
[  533.832504]  ? __pmd_alloc+0x1b0/0x1b0
[  533.841082]  ? ktime_get+0xd0/0xd0
[  533.849294]  do_setitimer+0xe1/0x540
[  533.857675]  ? __audit_syscall_entry+0x1cd/0x250
[  533.867146]  SyS_setitimer+0x1d6/0x210
[  533.875731]  ? SyS_alarm+0x150/0x150
[  533.884099]  ? trace_event_raw_event_sys_enter+0x590/0x590
[  533.894388]  ? handle_mm_fault+0x15a/0x530
[  533.903209]  ? __do_page_fault+0x3c9/0x740
[  533.911953]  ? SyS_alarm+0x150/0x150
[  533.920154]  do_syscall_64+0xf5/0x2a0
[  533.928504]  ? do_syscall_64+0xf5/0x2a0
[  533.936997]  entry_SYSCALL64_slow_path+0x25/0x25
[  533.946160] RIP: 0033:0x7f32bbd71c67
[  533.954353] RSP: 002b:7ffe9b18df08 EFLAGS: 0202 ORIG_RAX: 
0026
[  533.966657] RAX: ffda RBX:  RCX: 7f32bbd71c67
[  533.978452] RDX:  RSI: 7ffe9b18df30 RDI: 0001
[  533.990192] RBP: 7ffe9b18df60 R08: 7f32bbd09d38 R09: 0012
[  534.001811] R10: 

[RFC] ubsan: signed integer overflow in setitimer()

2017-06-01 Thread Xishi Qiu
Hi, this is the test case, and then I got ubsan error
(signed integer overflow) report, so the root cause is from
user or kernel? Shall we change something in timeval_valid()?


struct itimerval new_value;
int ret;

new_value.it_interval.tv_sec = 140673496649799L;
new_value.it_interval.tv_usec = 6;
new_value.it_value.tv_sec = 140673496649807L;
new_value.it_value.tv_usec = 5;

ret = setitimer(ITIMER_VIRTUAL, &new_value, NULL);


[  533.326588] 

[  533.335346] UBSAN: Undefined behaviour in ./include/linux/time.h:239:27
[  533.342155] signed integer overflow:
[  533.345837] 140673496649807 * 1000000000 cannot be represented in type 'long 
int'
[  533.353540] CPU: 102 PID: 17797 Comm: test.exe Tainted: GB   
4.12.0-rc3-327.44.58.18.x86_64+ #30
[  533.363646] Hardware name: Huawei Technologies Co., Ltd. RH8100 V3/BC61PBIA, 
BIOS BLHSV028 11/11/2014
[  533.373130] Call Trace:
[  533.375670]  dump_stack+0xbc/0x124
[  533.379179]  ? _atomic_dec_and_lock+0x14c/0x14c
[  533.383850]  ubsan_epilogue+0xd/0x4e
[  533.387531]  handle_overflow+0x186/0x1d5
[  533.391571]  ? __ubsan_handle_negate_overflow+0x15b/0x15b
[  533.397133]  ? unlock_page+0x20/0x60
[  533.400815]  ? filemap_map_pages+0x3e8/0x820
[  533.405211]  ? read_cache_page_gfp+0x80/0x80
[  533.409613]  ? tty_ldisc_deref+0x28/0x40
[  533.413653]  ? tty_write+0x344/0x560
[  533.417335]  __ubsan_handle_mul_overflow+0xe/0x19
[  533.422181]  set_cpu_itimer+0x49c/0x540
[  533.426132]  ? get_cpu_itimer+0x290/0x290
[  533.430271]  ? __getnstimeofday64+0x14c/0x210
[  533.434762]  ? __pmd_alloc+0x1b0/0x1b0
[  533.438623]  ? ktime_get+0xd0/0xd0
[  533.442127]  do_setitimer+0xe1/0x540
[  533.445817]  ? __audit_syscall_entry+0x1cd/0x250
[  533.450571]  SyS_setitimer+0x1d6/0x210
[  533.454431]  ? SyS_alarm+0x150/0x150
[  533.463453]  ? trace_event_raw_event_sys_enter+0x590/0x590
[  533.474508]  ? handle_mm_fault+0x15a/0x530
[  533.484144]  ? __do_page_fault+0x3c9/0x740
[  533.493712]  ? SyS_alarm+0x150/0x150
[  533.502553]  do_syscall_64+0xf5/0x2a0
[  533.511294]  ? do_syscall_64+0xf5/0x2a0
[  533.520031]  entry_SYSCALL64_slow_path+0x25/0x25
[  533.529410] RIP: 0033:0x7f32bbd71c67
[  533.537565] RSP: 002b:7ffe9b18df08 EFLAGS: 0202 ORIG_RAX: 
0026
[  533.549766] RAX: ffda RBX:  RCX: 7f32bbd71c67
[  533.561530] RDX:  RSI: 7ffe9b18df30 RDI: 0001
[  533.573237] RBP: 7ffe9b18df60 R08: 7f32bbd09d38 R09: 0012
[  533.584794] R10: 7ffe9b18dc90 R11: 0202 R12: 00400490
[  533.596218] R13: 7ffe9b18e040 R14:  R15: 
[  533.607450] 

[  533.620059] 

[  533.632565] UBSAN: Undefined behaviour in ./include/linux/time.h:239:27
[  533.643357] signed integer overflow:
[  533.651093] 140673496649799 * 10 cannot be represented in type 'long 
int'
[  533.662927] CPU: 102 PID: 17797 Comm: test.exe Tainted: GB   
4.12.0-rc3-327.44.58.18.x86_64+ #30
[  533.681691] Hardware name: Huawei Technologies Co., Ltd. RH8100 V3/BC61PBIA, 
BIOS BLHSV028 11/11/2014
[  533.700314] Call Trace:
[  533.707469]  dump_stack+0xbc/0x124
[  533.715588]  ? _atomic_dec_and_lock+0x14c/0x14c
[  533.724911]  ubsan_epilogue+0xd/0x4e
[  533.733281]  handle_overflow+0x186/0x1d5
[  533.741952]  ? __ubsan_handle_negate_overflow+0x15b/0x15b
[  533.752190]  ? unlock_page+0x20/0x60
[  533.760592]  ? filemap_map_pages+0x3e8/0x820
[  533.769709]  ? read_cache_page_gfp+0x80/0x80
[  533.778844]  ? tty_ldisc_deref+0x28/0x40
[  533.787641]  ? tty_write+0x344/0x560
[  533.796126]  __ubsan_handle_mul_overflow+0xe/0x19
[  533.805696]  set_cpu_itimer+0x438/0x540
[  533.814412]  ? get_cpu_itimer+0x290/0x290
[  533.823259]  ? __getnstimeofday64+0x14c/0x210
[  533.832504]  ? __pmd_alloc+0x1b0/0x1b0
[  533.841082]  ? ktime_get+0xd0/0xd0
[  533.849294]  do_setitimer+0xe1/0x540
[  533.857675]  ? __audit_syscall_entry+0x1cd/0x250
[  533.867146]  SyS_setitimer+0x1d6/0x210
[  533.875731]  ? SyS_alarm+0x150/0x150
[  533.884099]  ? trace_event_raw_event_sys_enter+0x590/0x590
[  533.894388]  ? handle_mm_fault+0x15a/0x530
[  533.903209]  ? __do_page_fault+0x3c9/0x740
[  533.911953]  ? SyS_alarm+0x150/0x150
[  533.920154]  do_syscall_64+0xf5/0x2a0
[  533.928504]  ? do_syscall_64+0xf5/0x2a0
[  533.936997]  entry_SYSCALL64_slow_path+0x25/0x25
[  533.946160] RIP: 0033:0x7f32bbd71c67
[  533.954353] RSP: 002b:7ffe9b18df08 EFLAGS: 0202 ORIG_RAX: 
0026
[  533.966657] RAX: ffda RBX:  RCX: 7f32bbd71c67
[  533.978452] RDX:  RSI: 7ffe9b18df30 RDI: 0001
[  533.990192] RBP: 7ffe9b18df60 R08: 7f32bbd09d38 R09: 0012
[  534.001811] R10: 

Re: [Question] Mlocked count will not be decreased

2017-05-24 Thread Xishi Qiu
On 2017/5/24 21:16, Vlastimil Babka wrote:

> On 05/24/2017 02:10 PM, Xishi Qiu wrote:
>> On 2017/5/24 19:52, Vlastimil Babka wrote:
>>
>>> On 05/24/2017 01:38 PM, Xishi Qiu wrote:
>>>>>
>>>>> Race condition with what? Who else would isolate our pages?
>>>>>
>>>>
>>>> Hi Vlastimil,
>>>>
>>>> I find the root cause, if the page was not cached on the current cpu,
>>>> lru_add_drain() will not push it to LRU. So we should handle fail
>>>> case in mlock_vma_page().
>>>
>>> Yeah that would explain it.
>>>
>>>> follow_page_pte()
>>>>...
>>>>if (page->mapping && trylock_page(page)) {
>>>>lru_add_drain();  /* push cached pages to LRU */
>>>>/*
>>>> * Because we lock page here, and migration is
>>>> * blocked by the pte's page reference, and we
>>>> * know the page is still mapped, we don't even
>>>> * need to check for file-cache page truncation.
>>>> */
>>>>mlock_vma_page(page);
>>>>unlock_page(page);
>>>>}
>>>>...
>>>>
>>>> I think we should add yisheng's patch, also we should add the following 
>>>> change.
>>>> I think it is better than use lru_add_drain_all().
>>>
>>> I agree about yisheng's fix (but v2 didn't address my comments). I don't
>>> think we should add the hunk below, as that deviates from the rest of
>>> the design.
>>
>> Hi Vlastimil,
>>
>> The rest of the design is that mlock should always success here, right?
> 
> The rest of the design allows a temporary disconnect between mlocked
> flag and being placed on unevictable lru.
> 
>> If we don't handle the fail case, the page will be in anon/file lru list
>> later when call __pagevec_lru_add(), but NR_MLOCK increased,
>> this is wrong, right?
> 
> It's not wrong, the page cannot get evicted even if on wrong lru, so
> effectively it's already mlocked. We would be underaccounting NR_MLOCK.
> 

Hi Vlastimil,

I don't quite understand why the page cannot get evicted even if on the wrong lru.
__isolate_lru_page() will only skip PageUnevictable(page), but this flag has not
been set, we only set PageMlocked.

Thanks,
Xishi Qiu

>> Thanks,
>> Xishi Qiu
>>
>>>
>>> Thanks,
>>> Vlastimil
>>>
>>>> diff --git a/mm/mlock.c b/mm/mlock.c
>>>> index 3d3ee6c..ca2aeb9 100644
>>>> --- a/mm/mlock.c
>>>> +++ b/mm/mlock.c
>>>> @@ -88,6 +88,11 @@ void mlock_vma_page(struct page *page)
>>>>count_vm_event(UNEVICTABLE_PGMLOCKED);
>>>>if (!isolate_lru_page(page))
>>>>putback_lru_page(page);
>>>> +  else {
>>>> +  ClearPageMlocked(page);
>>>> +  mod_zone_page_state(page_zone(page), NR_MLOCK,
>>>> +  -hpage_nr_pages(page));
>>>> +  }
>>>>}
>>>>  }
>>>>
>>>> Thanks,
>>>> Xishi Qiu
>>>>
>>>
>>>
>>> .
>>>
>>
>>
>>
>> --
>> To unsubscribe, send a message with 'unsubscribe linux-mm' in
>> the body to majord...@kvack.org.  For more info on Linux MM,
>> see: http://www.linux-mm.org/ .
>> Don't email: mailto:"d...@kvack.org;> em...@kvack.org 
>>
> 
> 
> .
> 





Re: [Question] Mlocked count will not be decreased

2017-05-24 Thread Xishi Qiu
On 2017/5/24 21:16, Vlastimil Babka wrote:

> On 05/24/2017 02:10 PM, Xishi Qiu wrote:
>> On 2017/5/24 19:52, Vlastimil Babka wrote:
>>
>>> On 05/24/2017 01:38 PM, Xishi Qiu wrote:
>>>>>
>>>>> Race condition with what? Who else would isolate our pages?
>>>>>
>>>>
>>>> Hi Vlastimil,
>>>>
>>>> I find the root cause, if the page was not cached on the current cpu,
>>>> lru_add_drain() will not push it to LRU. So we should handle fail
>>>> case in mlock_vma_page().
>>>
>>> Yeah that would explain it.
>>>
>>>> follow_page_pte()
>>>>...
>>>>if (page->mapping && trylock_page(page)) {
>>>>lru_add_drain();  /* push cached pages to LRU */
>>>>/*
>>>> * Because we lock page here, and migration is
>>>> * blocked by the pte's page reference, and we
>>>> * know the page is still mapped, we don't even
>>>> * need to check for file-cache page truncation.
>>>> */
>>>>mlock_vma_page(page);
>>>>unlock_page(page);
>>>>}
>>>>...
>>>>
>>>> I think we should add yisheng's patch, also we should add the following 
>>>> change.
>>>> I think it is better than use lru_add_drain_all().
>>>
>>> I agree about yisheng's fix (but v2 didn't address my comments). I don't
>>> think we should add the hunk below, as that deviates from the rest of
>>> the design.
>>
>> Hi Vlastimil,
>>
>> The rest of the design is that mlock should always success here, right?
> 
> The rest of the design allows a temporary disconnect between mlocked
> flag and being placed on unevictable lru.
> 
>> If we don't handle the fail case, the page will be in anon/file lru list
>> later when call __pagevec_lru_add(), but NR_MLOCK increased,
>> this is wrong, right?
> 
> It's not wrong, the page cannot get evicted even if on wrong lru, so
> effectively it's already mlocked. We would be underaccounting NR_MLOCK.
> 

Hi Vlastimil,

I don't quite understand why the page cannot get evicted even if on the wrong lru.
__isolate_lru_page() will only skip PageUnevictable(page), but this flag has not
been set, we only set PageMlocked.

Thanks,
Xishi Qiu

>> Thanks,
>> Xishi Qiu
>>
>>>
>>> Thanks,
>>> Vlastimil
>>>
>>>> diff --git a/mm/mlock.c b/mm/mlock.c
>>>> index 3d3ee6c..ca2aeb9 100644
>>>> --- a/mm/mlock.c
>>>> +++ b/mm/mlock.c
>>>> @@ -88,6 +88,11 @@ void mlock_vma_page(struct page *page)
>>>>count_vm_event(UNEVICTABLE_PGMLOCKED);
>>>>if (!isolate_lru_page(page))
>>>>putback_lru_page(page);
>>>> +  else {
>>>> +  ClearPageMlocked(page);
>>>> +  mod_zone_page_state(page_zone(page), NR_MLOCK,
>>>> +  -hpage_nr_pages(page));
>>>> +  }
>>>>}
>>>>  }
>>>>
>>>> Thanks,
>>>> Xishi Qiu
>>>>
>>>
>>>
>>> .
>>>
>>
>>
>>
>> --
>> To unsubscribe, send a message with 'unsubscribe linux-mm' in
>> the body to majord...@kvack.org.  For more info on Linux MM,
>> see: http://www.linux-mm.org/ .
>> Don't email: mailto:"d...@kvack.org;> em...@kvack.org 
>>
> 
> 
> .
> 





Re: [Question] Mlocked count will not be decreased

2017-05-24 Thread Xishi Qiu
On 2017/5/24 19:52, Vlastimil Babka wrote:

> On 05/24/2017 01:38 PM, Xishi Qiu wrote:
>>>
>>> Race condition with what? Who else would isolate our pages?
>>>
>>
>> Hi Vlastimil,
>>
>> I find the root cause, if the page was not cached on the current cpu,
>> lru_add_drain() will not push it to LRU. So we should handle fail
>> case in mlock_vma_page().
> 
> Yeah that would explain it.
> 
>> follow_page_pte()
>>  ...
>>  if (page->mapping && trylock_page(page)) {
>>  lru_add_drain();  /* push cached pages to LRU */
>>  /*
>>   * Because we lock page here, and migration is
>>   * blocked by the pte's page reference, and we
>>   * know the page is still mapped, we don't even
>>   * need to check for file-cache page truncation.
>>   */
>>  mlock_vma_page(page);
>>  unlock_page(page);
>>  }
>>  ...
>>
>> I think we should add yisheng's patch, also we should add the following 
>> change.
>> I think it is better than use lru_add_drain_all().
> 
> I agree about yisheng's fix (but v2 didn't address my comments). I don't
> think we should add the hunk below, as that deviates from the rest of
> the design.

Hi Vlastimil,

The rest of the design is that mlock should always succeed here, right?

If we don't handle the failure case, the page will be on the anon/file lru list
later when __pagevec_lru_add() is called, but NR_MLOCK is increased;
this is wrong, right?

Thanks,
Xishi Qiu

> 
> Thanks,
> Vlastimil
> 
>> diff --git a/mm/mlock.c b/mm/mlock.c
>> index 3d3ee6c..ca2aeb9 100644
>> --- a/mm/mlock.c
>> +++ b/mm/mlock.c
>> @@ -88,6 +88,11 @@ void mlock_vma_page(struct page *page)
>>  count_vm_event(UNEVICTABLE_PGMLOCKED);
>>  if (!isolate_lru_page(page))
>>  putback_lru_page(page);
>> +else {
>> +ClearPageMlocked(page);
>> +mod_zone_page_state(page_zone(page), NR_MLOCK,
>> +-hpage_nr_pages(page));
>> +}
>>  }
>>  }
>>
>> Thanks,
>> Xishi Qiu
>>
> 
> 
> .
> 





Re: [Question] Mlocked count will not be decreased

2017-05-24 Thread Xishi Qiu
On 2017/5/24 19:52, Vlastimil Babka wrote:

> On 05/24/2017 01:38 PM, Xishi Qiu wrote:
>>>
>>> Race condition with what? Who else would isolate our pages?
>>>
>>
>> Hi Vlastimil,
>>
>> I find the root cause, if the page was not cached on the current cpu,
>> lru_add_drain() will not push it to LRU. So we should handle fail
>> case in mlock_vma_page().
> 
> Yeah that would explain it.
> 
>> follow_page_pte()
>>  ...
>>  if (page->mapping && trylock_page(page)) {
>>  lru_add_drain();  /* push cached pages to LRU */
>>  /*
>>   * Because we lock page here, and migration is
>>   * blocked by the pte's page reference, and we
>>   * know the page is still mapped, we don't even
>>   * need to check for file-cache page truncation.
>>   */
>>  mlock_vma_page(page);
>>  unlock_page(page);
>>  }
>>  ...
>>
>> I think we should add yisheng's patch, also we should add the following 
>> change.
>> I think it is better than use lru_add_drain_all().
> 
> I agree about yisheng's fix (but v2 didn't address my comments). I don't
> think we should add the hunk below, as that deviates from the rest of
> the design.

Hi Vlastimil,

The rest of the design is that mlock should always success here, right?

If we don't handle the fail case, the page will be in anon/file lru list
later when call __pagevec_lru_add(), but NR_MLOCK increased,
this is wrong, right?

Thanks,
Xishi Qiu

> 
> Thanks,
> Vlastimil
> 
>> diff --git a/mm/mlock.c b/mm/mlock.c
>> index 3d3ee6c..ca2aeb9 100644
>> --- a/mm/mlock.c
>> +++ b/mm/mlock.c
>> @@ -88,6 +88,11 @@ void mlock_vma_page(struct page *page)
>>  count_vm_event(UNEVICTABLE_PGMLOCKED);
>>  if (!isolate_lru_page(page))
>>  putback_lru_page(page);
>> +else {
>> +ClearPageMlocked(page);
>> +mod_zone_page_state(page_zone(page), NR_MLOCK,
>> +-hpage_nr_pages(page));
>> +}
>>  }
>>  }
>>
>> Thanks,
>> Xishi Qiu
>>
> 
> 
> .
> 





Re: [Question] Mlocked count will not be decreased

2017-05-24 Thread Xishi Qiu
On 2017/5/24 18:32, Vlastimil Babka wrote:

> On 05/24/2017 10:32 AM, Yisheng Xie wrote:
>> Hi Kefeng,
>> Could you please try this patch.
>>
>> Thanks
>> Yisheng Xie
>> -
>> From a70ae975756e8e97a28d49117ab25684da631689 Mon Sep 17 00:00:00 2001
>> From: Yisheng Xie <xieyishe...@huawei.com>
>> Date: Wed, 24 May 2017 16:01:24 +0800
>> Subject: [PATCH] mlock: fix mlock count can not decrease in race condition
>>
>> Kefeng reported that when run the follow test the mlock count in meminfo
>> cannot be decreased:
>>  [1] testcase
>>  linux:~ # cat test_mlockal
>>  grep Mlocked /proc/meminfo
>>   for j in `seq 0 10`
>>   do
>>  for i in `seq 4 15`
>>  do
>>  ./p_mlockall >> log &
>>  done
>>  sleep 0.2
>>  done
>>  sleep 5 # wait some time to let mlock decrease
>>  grep Mlocked /proc/meminfo
>>
>>  linux:~ # cat p_mlockall.c
>>  #include 
>>  #include 
>>  #include 
>>
>>  #define SPACE_LEN   4096
>>
>>  int main(int argc, char ** argv)
>>  {
>>  int ret;
>>  void *adr = malloc(SPACE_LEN);
>>  if (!adr)
>>  return -1;
>>
>>  ret = mlockall(MCL_CURRENT | MCL_FUTURE);
>>  printf("mlcokall ret = %d\n", ret);
>>
>>  ret = munlockall();
>>  printf("munlcokall ret = %d\n", ret);
>>
>>  free(adr);
>>  return 0;
>>  }
>>
>> When __munlock_pagevec, we ClearPageMlock but isolation_failed in race
>> condition, and we do not count these page into delta_munlocked, which cause 
>> mlock
> 
> Race condition with what? Who else would isolate our pages?
> 

Hi Vlastimil,

I found the root cause: if the page was not cached on the current cpu,
lru_add_drain() will not push it to the LRU. So we should handle the failure
case in mlock_vma_page().

follow_page_pte()
...
if (page->mapping && trylock_page(page)) {
lru_add_drain();  /* push cached pages to LRU */
/*
 * Because we lock page here, and migration is
 * blocked by the pte's page reference, and we
 * know the page is still mapped, we don't even
 * need to check for file-cache page truncation.
 */
mlock_vma_page(page);
unlock_page(page);
}
...

I think we should add yisheng's patch, and also the following change.
I think it is better than using lru_add_drain_all().

diff --git a/mm/mlock.c b/mm/mlock.c
index 3d3ee6c..ca2aeb9 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -88,6 +88,11 @@ void mlock_vma_page(struct page *page)
    count_vm_event(UNEVICTABLE_PGMLOCKED);
if (!isolate_lru_page(page))
putback_lru_page(page);
+   else {
+   ClearPageMlocked(page);
+   mod_zone_page_state(page_zone(page), NR_MLOCK,
+   -hpage_nr_pages(page));
+   }
}
 }

Thanks,
Xishi Qiu



Re: [Question] Mlocked count will not be decreased

2017-05-24 Thread Xishi Qiu
On 2017/5/24 18:32, Vlastimil Babka wrote:

> On 05/24/2017 10:32 AM, Yisheng Xie wrote:
>> Hi Kefeng,
>> Could you please try this patch.
>>
>> Thanks
>> Yisheng Xie
>> -
>> From a70ae975756e8e97a28d49117ab25684da631689 Mon Sep 17 00:00:00 2001
>> From: Yisheng Xie 
>> Date: Wed, 24 May 2017 16:01:24 +0800
>> Subject: [PATCH] mlock: fix mlock count can not decrease in race condition
>>
>> Kefeng reported that when run the follow test the mlock count in meminfo
>> cannot be decreased:
>>  [1] testcase
>>  linux:~ # cat test_mlockal
>>  grep Mlocked /proc/meminfo
>>   for j in `seq 0 10`
>>   do
>>  for i in `seq 4 15`
>>  do
>>  ./p_mlockall >> log &
>>  done
>>  sleep 0.2
>>  done
>>  sleep 5 # wait some time to let mlock decrease
>>  grep Mlocked /proc/meminfo
>>
>>  linux:~ # cat p_mlockall.c
>>  #include 
>>  #include 
>>  #include 
>>
>>  #define SPACE_LEN   4096
>>
>>  int main(int argc, char ** argv)
>>  {
>>  int ret;
>>  void *adr = malloc(SPACE_LEN);
>>  if (!adr)
>>  return -1;
>>
>>  ret = mlockall(MCL_CURRENT | MCL_FUTURE);
>>  printf("mlcokall ret = %d\n", ret);
>>
>>  ret = munlockall();
>>  printf("munlcokall ret = %d\n", ret);
>>
>>  free(adr);
>>  return 0;
>>  }
>>
>> When __munlock_pagevec, we ClearPageMlock but isolation_failed in race
>> condition, and we do not count these page into delta_munlocked, which cause 
>> mlock
> 
> Race condition with what? Who else would isolate our pages?
> 

Hi Vlastimil,

I found the root cause: if the page was not cached on the current cpu,
lru_add_drain() will not push it to the LRU. So we should handle the failure
case in mlock_vma_page().

follow_page_pte()
...
if (page->mapping && trylock_page(page)) {
lru_add_drain();  /* push cached pages to LRU */
/*
 * Because we lock page here, and migration is
 * blocked by the pte's page reference, and we
 * know the page is still mapped, we don't even
 * need to check for file-cache page truncation.
 */
mlock_vma_page(page);
unlock_page(page);
}
...

I think we should add yisheng's patch, and also the following change.
I think it is better than using lru_add_drain_all().

diff --git a/mm/mlock.c b/mm/mlock.c
index 3d3ee6c..ca2aeb9 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -88,6 +88,11 @@ void mlock_vma_page(struct page *page)
count_vm_event(UNEVICTABLE_PGMLOCKED);
if (!isolate_lru_page(page))
putback_lru_page(page);
+   else {
+   ClearPageMlocked(page);
+   mod_zone_page_state(page_zone(page), NR_MLOCK,
+   -hpage_nr_pages(page));
+   }
}
 }

Thanks,
Xishi Qiu



Re: [Question] Mlocked count will not be decreased

2017-05-24 Thread Xishi Qiu
On 2017/5/24 18:32, Vlastimil Babka wrote:

> On 05/24/2017 10:32 AM, Yisheng Xie wrote:
>> Hi Kefeng,
>> Could you please try this patch.
>>
>> Thanks
>> Yisheng Xie
>> -
>> From a70ae975756e8e97a28d49117ab25684da631689 Mon Sep 17 00:00:00 2001
>> From: Yisheng Xie <xieyishe...@huawei.com>
>> Date: Wed, 24 May 2017 16:01:24 +0800
>> Subject: [PATCH] mlock: fix mlock count can not decrease in race condition
>>
>> Kefeng reported that when run the follow test the mlock count in meminfo
>> cannot be decreased:
>>  [1] testcase
>>  linux:~ # cat test_mlockal
>>  grep Mlocked /proc/meminfo
>>   for j in `seq 0 10`
>>   do
>>  for i in `seq 4 15`
>>  do
>>  ./p_mlockall >> log &
>>  done
>>  sleep 0.2
>>  done
>>  sleep 5 # wait some time to let mlock decrease
>>  grep Mlocked /proc/meminfo
>>
>>  linux:~ # cat p_mlockall.c
>>  #include 
>>  #include 
>>  #include 
>>
>>  #define SPACE_LEN   4096
>>
>>  int main(int argc, char ** argv)
>>  {
>>  int ret;
>>  void *adr = malloc(SPACE_LEN);
>>  if (!adr)
>>  return -1;
>>
>>  ret = mlockall(MCL_CURRENT | MCL_FUTURE);
>>  printf("mlcokall ret = %d\n", ret);
>>
>>  ret = munlockall();
>>  printf("munlcokall ret = %d\n", ret);
>>
>>  free(adr);
>>  return 0;
>>  }
>>
>> When __munlock_pagevec, we ClearPageMlock but isolation_failed in race
>> condition, and we do not count these page into delta_munlocked, which cause 
>> mlock
> 
> Race condition with what? Who else would isolate our pages?
> 
>> counter incorrect, for we had cleared the PageMlocked flag and cannot count down
>> the number in the future.
>>
>> Fix it by counting the number of pages whose PageMlocked flag is cleared.
>>
>> Reported-by: Kefeng Wang <wangkefeng.w...@huawei.com>
>> Signed-off-by: Yisheng Xie <xieyishe...@huawei.com>
> 
> Weird, I can reproduce the issue on my desktop's 4.11 distro kernel, but
> not in qemu and small kernel build, for some reason. So I couldn't test
> the patch yet. But it's true that before 7225522bb429 ("mm: munlock:
> batch non-THP page isolation and munlock+putback using pagevec") we
> decreased NR_MLOCK for each pages that passed TestClearPageMlocked(),
> and that unintentionally changed with my patch. There should be a Fixes:
> tag for that.
> 

Hi Vlastimil,

Why is the page marked Mlocked, but not on the lru list?

if (TestClearPageMlocked(page)) {
/*
 * We already have pin from follow_page_mask()
 * so we can spare the get_page() here.
 */
if (__munlock_isolate_lru_page(page, false))
continue;
else
__munlock_isolation_failed(page);  // How this 
happened?
}

Thanks,
Xishi Qiu

>> ---
>>  mm/mlock.c | 7 ---
>>  1 file changed, 4 insertions(+), 3 deletions(-)
>>
>> diff --git a/mm/mlock.c b/mm/mlock.c
>> index c483c5c..71ba5cf 100644
>> --- a/mm/mlock.c
>> +++ b/mm/mlock.c
>> @@ -284,7 +284,7 @@ static void __munlock_pagevec(struct pagevec *pvec, 
>> struct zone *zone)
>>  {
>>  int i;
>>  int nr = pagevec_count(pvec);
>> -int delta_munlocked;
>> +int munlocked = 0;
>>  struct pagevec pvec_putback;
>>  int pgrescued = 0;
>>
>> @@ -296,6 +296,7 @@ static void __munlock_pagevec(struct pagevec *pvec, 
>> struct zone *zone)
>>  struct page *page = pvec->pages[i];
>>
>>  if (TestClearPageMlocked(page)) {
>> +munlocked --;
>>  /*
>>   * We already have pin from follow_page_mask()
>>   * so we can spare the get_page() here.
>> @@ -315,8 +316,8 @@ static void __munlock_pagevec(struct pagevec *pvec, 
>> struct zone *zone)
>>  pagevec_add(_putback, pvec->pages[i]);
>>  pvec->pages[i] = NULL;
>>  }
>> -delta_munlocked = -nr + pagevec_count(_putback);
>> -__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
>> +if (munlocked)
> 
> You don't have to if () this, it should be very rare that munlocked will
> be 0, and the code works fine even if it is.
> 
>> +__mod_zone_page_state(zone, NR_MLOCK, munlocked);
>>  spin_unlock_irq(zone_lru_lock(zone));
>>
>>  /* Now we can release pins of pages that we are not munlocking */
>>
> 
> 
> .
> 





Re: [Question] Mlocked count will not be decreased

2017-05-24 Thread Xishi Qiu
On 2017/5/24 18:32, Vlastimil Babka wrote:

> On 05/24/2017 10:32 AM, Yisheng Xie wrote:
>> Hi Kefeng,
>> Could you please try this patch.
>>
>> Thanks
>> Yisheng Xie
>> -
>> From a70ae975756e8e97a28d49117ab25684da631689 Mon Sep 17 00:00:00 2001
>> From: Yisheng Xie 
>> Date: Wed, 24 May 2017 16:01:24 +0800
>> Subject: [PATCH] mlock: fix mlock count can not decrease in race condition
>>
>> Kefeng reported that when run the follow test the mlock count in meminfo
>> cannot be decreased:
>>  [1] testcase
>>  linux:~ # cat test_mlockal
>>  grep Mlocked /proc/meminfo
>>   for j in `seq 0 10`
>>   do
>>  for i in `seq 4 15`
>>  do
>>  ./p_mlockall >> log &
>>  done
>>  sleep 0.2
>>  done
>>  sleep 5 # wait some time to let mlock decrease
>>  grep Mlocked /proc/meminfo
>>
>>  linux:~ # cat p_mlockall.c
>>  #include 
>>  #include 
>>  #include 
>>
>>  #define SPACE_LEN   4096
>>
>>  int main(int argc, char ** argv)
>>  {
>>  int ret;
>>  void *adr = malloc(SPACE_LEN);
>>  if (!adr)
>>  return -1;
>>
>>  ret = mlockall(MCL_CURRENT | MCL_FUTURE);
>>  printf("mlcokall ret = %d\n", ret);
>>
>>  ret = munlockall();
>>  printf("munlcokall ret = %d\n", ret);
>>
>>  free(adr);
>>  return 0;
>>  }
>>
>> When __munlock_pagevec, we ClearPageMlock but isolation_failed in race
>> condition, and we do not count these page into delta_munlocked, which cause 
>> mlock
> 
> Race condition with what? Who else would isolate our pages?
> 
>> counter incorrect, for we had cleared the PageMlocked flag and cannot count down
>> the number in the future.
>>
>> Fix it by counting the number of pages whose PageMlocked flag is cleared.
>>
>> Reported-by: Kefeng Wang 
>> Signed-off-by: Yisheng Xie 
> 
> Weird, I can reproduce the issue on my desktop's 4.11 distro kernel, but
> not in qemu and small kernel build, for some reason. So I couldn't test
> the patch yet. But it's true that before 7225522bb429 ("mm: munlock:
> batch non-THP page isolation and munlock+putback using pagevec") we
> decreased NR_MLOCK for each pages that passed TestClearPageMlocked(),
> and that unintentionally changed with my patch. There should be a Fixes:
> tag for that.
> 

Hi Vlastimil,

Why is the page marked Mlocked, but not on the lru list?

if (TestClearPageMlocked(page)) {
        /*
 * We already have pin from follow_page_mask()
 * so we can spare the get_page() here.
 */
if (__munlock_isolate_lru_page(page, false))
continue;
else
__munlock_isolation_failed(page);  // How this 
happened?
}

Thanks,
Xishi Qiu

>> ---
>>  mm/mlock.c | 7 ---
>>  1 file changed, 4 insertions(+), 3 deletions(-)
>>
>> diff --git a/mm/mlock.c b/mm/mlock.c
>> index c483c5c..71ba5cf 100644
>> --- a/mm/mlock.c
>> +++ b/mm/mlock.c
>> @@ -284,7 +284,7 @@ static void __munlock_pagevec(struct pagevec *pvec, 
>> struct zone *zone)
>>  {
>>  int i;
>>  int nr = pagevec_count(pvec);
>> -int delta_munlocked;
>> +int munlocked = 0;
>>  struct pagevec pvec_putback;
>>  int pgrescued = 0;
>>
>> @@ -296,6 +296,7 @@ static void __munlock_pagevec(struct pagevec *pvec, 
>> struct zone *zone)
>>  struct page *page = pvec->pages[i];
>>
>>  if (TestClearPageMlocked(page)) {
>> +munlocked --;
>>  /*
>>   * We already have pin from follow_page_mask()
>>   * so we can spare the get_page() here.
>> @@ -315,8 +316,8 @@ static void __munlock_pagevec(struct pagevec *pvec, 
>> struct zone *zone)
>>  pagevec_add(_putback, pvec->pages[i]);
>>  pvec->pages[i] = NULL;
>>  }
>> -delta_munlocked = -nr + pagevec_count(_putback);
>> -__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
>> +if (munlocked)
> 
> You don't have to if () this, it should be very rare that munlocked will
> be 0, and the code works fine even if it is.
> 
>> +__mod_zone_page_state(zone, NR_MLOCK, munlocked);
>>  spin_unlock_irq(zone_lru_lock(zone));
>>
>>  /* Now we can release pins of pages that we are not munlocking */
>>
> 
> 
> .
> 





Re: mm, we use rcu access task_struct in mm_match_cgroup(), but not use rcu free in free_task_struct()

2017-05-24 Thread Xishi Qiu
On 2017/5/24 15:49, Vlastimil Babka wrote:

> On 05/24/2017 06:40 AM, Xishi Qiu wrote:
>> On 2017/5/24 9:40, Xishi Qiu wrote:
>>
>>> Hi, I find we use rcu access task_struct in mm_match_cgroup(), but not use
>>> rcu free in free_task_struct(), is it right?
>>>
>>> Here is the backtrace.
> 
> Can you post the whole oops, including kernel version etc? Is it the
> same 3.10 RH kernel as in the other report?
> 

Hi Vlastimil,

Yes, it's RHEL 7.2

[  663.687410] Modules linked in: dm_service_time dm_multipath iscsi_tcp 
libiscsi_tcp libiscsi scsi_transport_iscsi ocfs2_lockfs(OE) ocfs2(OE) 
ocfs2_adl(OE) jbd2 ocfs2_stack_o2cb(OE) ocfs2_dlm(OE) ocfs2_nodemanager(OE) 
ocfs2_stackglue(OE) kboxdriver(O) kbox(O) 8021q garp stp mrp llc 
dev_connlimit(O) vhba(OE) signo_catch(O) hotpatch(OE) bum(O) ip_set nfnetlink 
prio(O) nat(O) vport_vxlan(O) openvswitch(O) nf_defrag_ipv6 gre ipmi_devintf 
pmcint(O) ipmi_si ipmi_msghandler coretemp iTCO_wdt kvm_intel(O) 
iTCO_vendor_support intel_rapl kvm(O) crc32_pclmul crc32c_intel 
ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper mei_me ablk_helper 
cryptd i2c_i801 be2net mei lpc_ich i2c_core vxlan sb_edac pcspkr mfd_core 
edac_core sg ip6_udp_tunnel udp_tunnel shpchp acpi_power_meter 
remote_trigger(O) nf_conntrack_ipv4
[  663.759402]  nf_defrag_ipv4 vhost_net(O) tun(O) vhost(O) macvtap macvlan 
vfio_pci irqbypass vfio_iommu_type1 vfio xt_sctp nf_conntrack_proto_sctp 
nf_nat_proto_sctp nf_nat nf_conntrack sctp libcrc32c ip_tables ext3 mbcache jbd 
dm_mod sd_mod lpfc crc_t10dif ahci crct10dif_generic libahci crct10dif_pclmul 
mpt2sas scsi_transport_fc libata raid_class scsi_tgt crct10dif_common 
scsi_transport_sas nbd(OE) [last unloaded: kbox]
[  663.796117] CPU: 2 PID: 2133 Comm: CPU 15/KVM Tainted: G   OE   
---   3.10.0-327.49.58.52_13.x86_64 #1
[  663.807080] Hardware name: Huawei CH121 V3/IT11SGCA1, BIOS 1.51 06/11/2015
[  663.814107] task: 881fe3353300 ti: 881fe2768000 task.ti: 
881fe2768000
[  663.821854] RIP: 0010:[]  [] 
mem_cgroup_from_task+0x16/0x20
[  663.830895] RSP: :881fe276b810  EFLAGS: 00010286
[  663.836333] RAX: 6b6b6b6b6b6b6b6b RBX: ea007f90 RCX: 0002
[  663.843621] RDX: 0007fa607d67 RSI: 0007fa607d67 RDI: 880fe36d72c0
[  663.850878] RBP: 881fe276b880 R08: 0007fa607600 R09: a801fd67b300
[  663.858164] R10: 57fdec98cc59ecc0 R11: 880fe2e8dbd0 R12: c9001cb74000
[  663.865420] R13: 881fdb8cfda0 R14: 881fe2581570 R15: 0007fa607d67
[  663.872696] FS:  7fdaaba6d700() GS:88103fc8() 
knlGS:
[  663.881092] CS:  0010 DS:  ES:  CR0: 80050033
[  663.886996] CR2: 7fdaba65cf88 CR3: 000fe4919000 CR4: 001427e0
[  663.894252] DR0:  DR1:  DR2: 
[  663.901533] DR3:  DR6: 0ff0 DR7: 0400
[  663.908788] Stack:
[  663.910966]  811a6b8a 881fd828d980 0007fa607d67 
881fe276b950
[  663.918753]   0007fa607600 7fa60760 
00017f59ece0
[  663.926541]  675ac78b 881fe276bc70 ea007f9888a0 
881fe276ba40
[  663.934324]  ea007f90 0001 881fe276b9b8 
81180994
[  663.942113]  881fe3353300 881fe3353300 881fe3353300 

[  663.949903]  881fe276ba38 881fe276ba30 881fe276ba20 
881fe276ba28
[  663.957691]  881fe276ba18  88207efd6000 

[  663.965468]  881fe3353300  00ff8818 

[  663.973261]     
881fe5400410
[  663.980994]  ea007f59ece0 ea007f691ca0 881fe276b940 
881fe276b940
[  663.988773]   881fe276b9b8 881fe276bca0 
881fe276ba10
[  663.996563]  0001 881fe276ba40 881fe5400410 
675ac78b
[  664.004359]  881fe276ba40  881fe276bc70 
0020
[  664.012139]  88207efd6000 881fe276ba80 8118166a 
881fe276ba20
[  664.019926]  881fe276ba30 881fe276ba38 0017 
0016
[  664.027718]  88207efd6540 0003ffe0 00400410 
881fe5400410
[  664.035497]  0020   

[  664.043284]    ea007f625aa0 
ea007f9888e0
[  664.051073]  675ac78b  0020 
881fe5400410
[  664.058870]  0020  881fe276bb80 
81182135
[  664.066629]  00011628 881fe276baa8 0003 
881fe276ba00
[  664.074419]  0020  881fe276bc70 
02dd
[  664.082149]     

[  664.089933]  045d 00

Re: mm, we use rcu access task_struct in mm_match_cgroup(), but not use rcu free in free_task_struct()

2017-05-24 Thread Xishi Qiu
On 2017/5/24 15:49, Vlastimil Babka wrote:

> On 05/24/2017 06:40 AM, Xishi Qiu wrote:
>> On 2017/5/24 9:40, Xishi Qiu wrote:
>>
>>> Hi, I find we use rcu access task_struct in mm_match_cgroup(), but not use
>>> rcu free in free_task_struct(), is it right?
>>>
>>> Here is the backtrace.
> 
> Can you post the whole oops, including kernel version etc? Is it the
> same 3.10 RH kernel as in the other report?
> 

Hi Vlastimil,

Yes, it's RHEL 7.2

[  663.687410] Modules linked in: dm_service_time dm_multipath iscsi_tcp 
libiscsi_tcp libiscsi scsi_transport_iscsi ocfs2_lockfs(OE) ocfs2(OE) 
ocfs2_adl(OE) jbd2 ocfs2_stack_o2cb(OE) ocfs2_dlm(OE) ocfs2_nodemanager(OE) 
ocfs2_stackglue(OE) kboxdriver(O) kbox(O) 8021q garp stp mrp llc 
dev_connlimit(O) vhba(OE) signo_catch(O) hotpatch(OE) bum(O) ip_set nfnetlink 
prio(O) nat(O) vport_vxlan(O) openvswitch(O) nf_defrag_ipv6 gre ipmi_devintf 
pmcint(O) ipmi_si ipmi_msghandler coretemp iTCO_wdt kvm_intel(O) 
iTCO_vendor_support intel_rapl kvm(O) crc32_pclmul crc32c_intel 
ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper mei_me ablk_helper 
cryptd i2c_i801 be2net mei lpc_ich i2c_core vxlan sb_edac pcspkr mfd_core 
edac_core sg ip6_udp_tunnel udp_tunnel shpchp acpi_power_meter 
remote_trigger(O) nf_conntrack_ipv4
[  663.759402]  nf_defrag_ipv4 vhost_net(O) tun(O) vhost(O) macvtap macvlan 
vfio_pci irqbypass vfio_iommu_type1 vfio xt_sctp nf_conntrack_proto_sctp 
nf_nat_proto_sctp nf_nat nf_conntrack sctp libcrc32c ip_tables ext3 mbcache jbd 
dm_mod sd_mod lpfc crc_t10dif ahci crct10dif_generic libahci crct10dif_pclmul 
mpt2sas scsi_transport_fc libata raid_class scsi_tgt crct10dif_common 
scsi_transport_sas nbd(OE) [last unloaded: kbox]
[  663.796117] CPU: 2 PID: 2133 Comm: CPU 15/KVM Tainted: G   OE   
---   3.10.0-327.49.58.52_13.x86_64 #1
[  663.807080] Hardware name: Huawei CH121 V3/IT11SGCA1, BIOS 1.51 06/11/2015
[  663.814107] task: 881fe3353300 ti: 881fe2768000 task.ti: 
881fe2768000
[  663.821854] RIP: 0010:[]  [] 
mem_cgroup_from_task+0x16/0x20
[  663.830895] RSP: :881fe276b810  EFLAGS: 00010286
[  663.836333] RAX: 6b6b6b6b6b6b6b6b RBX: ea007f90 RCX: 0002
[  663.843621] RDX: 0007fa607d67 RSI: 0007fa607d67 RDI: 880fe36d72c0
[  663.850878] RBP: 881fe276b880 R08: 0007fa607600 R09: a801fd67b300
[  663.858164] R10: 57fdec98cc59ecc0 R11: 880fe2e8dbd0 R12: c9001cb74000
[  663.865420] R13: 881fdb8cfda0 R14: 881fe2581570 R15: 0007fa607d67
[  663.872696] FS:  7fdaaba6d700() GS:88103fc8() 
knlGS:
[  663.881092] CS:  0010 DS:  ES:  CR0: 80050033
[  663.886996] CR2: 7fdaba65cf88 CR3: 000fe4919000 CR4: 001427e0
[  663.894252] DR0:  DR1:  DR2: 
[  663.901533] DR3:  DR6: 0ff0 DR7: 0400
[  663.908788] Stack:
[  663.910966]  811a6b8a 881fd828d980 0007fa607d67 
881fe276b950
[  663.918753]   0007fa607600 7fa60760 
00017f59ece0
[  663.926541]  675ac78b 881fe276bc70 ea007f9888a0 
881fe276ba40
[  663.934324]  ea007f90 0001 881fe276b9b8 
81180994
[  663.942113]  881fe3353300 881fe3353300 881fe3353300 

[  663.949903]  881fe276ba38 881fe276ba30 881fe276ba20 
881fe276ba28
[  663.957691]  881fe276ba18  88207efd6000 

[  663.965468]  881fe3353300  00ff8818 

[  663.973261]     
881fe5400410
[  663.980994]  ea007f59ece0 ea007f691ca0 881fe276b940 
881fe276b940
[  663.988773]   881fe276b9b8 881fe276bca0 
881fe276ba10
[  663.996563]  0001 881fe276ba40 881fe5400410 
675ac78b
[  664.004359]  881fe276ba40  881fe276bc70 
0020
[  664.012139]  88207efd6000 881fe276ba80 8118166a 
881fe276ba20
[  664.019926]  881fe276ba30 881fe276ba38 0017 
0016
[  664.027718]  88207efd6540 0003ffe0 00400410 
881fe5400410
[  664.035497]  0020   

[  664.043284]    ea007f625aa0 
ea007f9888e0
[  664.051073]  675ac78b  0020 
881fe5400410
[  664.058870]  0020  881fe276bb80 
81182135
[  664.066629]  00011628 881fe276baa8 0003 
881fe276ba00
[  664.074419]  0020  881fe276bc70 
02dd
[  664.082149]     

[  664.089933]  045d 00

Re: mm, we use rcu access task_struct in mm_match_cgroup(), but not use rcu free in free_task_struct()

2017-05-23 Thread Xishi Qiu
On 2017/5/24 9:40, Xishi Qiu wrote:

> Hi, I find we use rcu access task_struct in mm_match_cgroup(), but not use
> rcu free in free_task_struct(), is it right?
> 
> Here is the backtrace.
> 
> PID: 2133   TASK: 881fe3353300  CPU: 2   COMMAND: "CPU 15/KVM"
>  #0 [881fe276b528] machine_kexec at 8105280b
>  #1 [881fe276b588] crash_kexec at 810f5072
>  #2 [881fe276b658] panic at 8163e23b
>  #3 [881fe276b6d8] oops_end at 8164d61b
>  #4 [881fe276b700] die at 8101872b
>  #5 [881fe276b730] do_general_protection at 8164cefe
>  #6 [881fe276b760] general_protection at 8164c7a8
> [exception RIP: mem_cgroup_from_task+22]
> RIP: 811db536  RSP: 881fe276b810  RFLAGS: 00010286
> RAX: 6b6b6b6b6b6b6b6b  RBX: ea007f90  RCX: 0002
> RDX: 0007fa607d67  RSI: 0007fa607d67  RDI: 880fe36d72c0
> RBP: 881fe276b880   R8: 0007fa607600   R9: a801fd67b300
> R10: 57fdec98cc59ecc0  R11: 880fe2e8dbd0  R12: c9001cb74000
> R13: 881fdb8cfda0  R14: 881fe2581570  R15: 0007fa607d67
> ORIG_RAX:   CS: 0010  SS: 
>  #7 [881fe276b810] page_referenced at 811a6b8a
>  #8 [881fe276b888] shrink_page_list at 81180994
>  #9 [881fe276b9c0] shrink_inactive_list at 8118166a
> #10 [881fe276ba88] shrink_lruvec at 81182135
> #11 [881fe276bb88] shrink_zone at 81182596
> #12 [881fe276bbe0] do_try_to_free_pages at 81182a90
> #13 [881fe276bc58] try_to_free_mem_cgroup_pages at 81182fea
> #14 [881fe276bcf0] mem_cgroup_reclaim at 811dd8de
> #15 [881fe276bd30] __mem_cgroup_try_charge at 811ddd9c
> #16 [881fe276bdf0] __mem_cgroup_try_charge_swapin at 811df62b
> #17 [881fe276be28] mem_cgroup_try_charge_swapin at 811e0537
> #18 [881fe276be38] handle_mm_fault at 8119abdd
> #19 [881fe276bec8] __do_page_fault at 816502d6
> #20 [881fe276bf28] do_page_fault at 81650603
> #21 [881fe276bf50] page_fault at 8164c808
> RIP: 7fdaba456500  RSP: 7fdaaba6c978  RFLAGS: 00010246
> RAX:   RBX:   RCX: fbd0
> RDX:   RSI: ae80  RDI: 002c
> RBP: 7fdaaba6c9f0   R8: 00840c70   R9: 00be
> R10: 7fff  R11: 0246  R12: 03622010
> R13: ae80  R14: 008274e0  R15: 03622010
> ORIG_RAX:   CS: 0033  SS: 002b
> 
> 
> .
> 





Re: mm, we use rcu access task_struct in mm_match_cgroup(), but not use rcu free in free_task_struct()

2017-05-23 Thread Xishi Qiu
On 2017/5/24 9:40, Xishi Qiu wrote:

> Hi, I find we use rcu access task_struct in mm_match_cgroup(), but not use
> rcu free in free_task_struct(), is it right?
> 
> Here is the backtrace.
> 
> PID: 2133   TASK: 881fe3353300  CPU: 2   COMMAND: "CPU 15/KVM"
>  #0 [881fe276b528] machine_kexec at 8105280b
>  #1 [881fe276b588] crash_kexec at 810f5072
>  #2 [881fe276b658] panic at 8163e23b
>  #3 [881fe276b6d8] oops_end at 8164d61b
>  #4 [881fe276b700] die at 8101872b
>  #5 [881fe276b730] do_general_protection at 8164cefe
>  #6 [881fe276b760] general_protection at 8164c7a8
> [exception RIP: mem_cgroup_from_task+22]
> RIP: 811db536  RSP: 881fe276b810  RFLAGS: 00010286
> RAX: 6b6b6b6b6b6b6b6b  RBX: ea007f90  RCX: 0002
> RDX: 0007fa607d67  RSI: 0007fa607d67  RDI: 880fe36d72c0
> RBP: 881fe276b880   R8: 0007fa607600   R9: a801fd67b300
> R10: 57fdec98cc59ecc0  R11: 880fe2e8dbd0  R12: c9001cb74000
> R13: 881fdb8cfda0  R14: 881fe2581570  R15: 0007fa607d67
> ORIG_RAX:   CS: 0010  SS: 
>  #7 [881fe276b810] page_referenced at 811a6b8a
>  #8 [881fe276b888] shrink_page_list at 81180994
>  #9 [881fe276b9c0] shrink_inactive_list at 8118166a
> #10 [881fe276ba88] shrink_lruvec at 81182135
> #11 [881fe276bb88] shrink_zone at 81182596
> #12 [881fe276bbe0] do_try_to_free_pages at 81182a90
> #13 [881fe276bc58] try_to_free_mem_cgroup_pages at 81182fea
> #14 [881fe276bcf0] mem_cgroup_reclaim at 811dd8de
> #15 [881fe276bd30] __mem_cgroup_try_charge at 811ddd9c
> #16 [881fe276bdf0] __mem_cgroup_try_charge_swapin at 811df62b
> #17 [881fe276be28] mem_cgroup_try_charge_swapin at 811e0537
> #18 [881fe276be38] handle_mm_fault at 8119abdd
> #19 [881fe276bec8] __do_page_fault at 816502d6
> #20 [881fe276bf28] do_page_fault at 81650603
> #21 [881fe276bf50] page_fault at 8164c808
> RIP: 7fdaba456500  RSP: 7fdaaba6c978  RFLAGS: 00010246
> RAX:   RBX:   RCX: fbd0
> RDX:   RSI: ae80  RDI: 002c
> RBP: 7fdaaba6c9f0   R8: 00840c70   R9: 00be
> R10: 7fff  R11: 0246  R12: 03622010
> R13: ae80  R14: 008274e0  R15: 03622010
> ORIG_RAX:   CS: 0033  SS: 002b
> 
> 
> .
> 





mm, we use rcu access task_struct in mm_match_cgroup(), but not use rcu free in free_task_struct()

2017-05-23 Thread Xishi Qiu
Hi, I find that we use rcu to access task_struct in mm_match_cgroup(), but do not
use rcu free in free_task_struct(); is that right?

Here is the backtrace.

PID: 2133   TASK: 881fe3353300  CPU: 2   COMMAND: "CPU 15/KVM"
 #0 [881fe276b528] machine_kexec at 8105280b
 #1 [881fe276b588] crash_kexec at 810f5072
 #2 [881fe276b658] panic at 8163e23b
 #3 [881fe276b6d8] oops_end at 8164d61b
 #4 [881fe276b700] die at 8101872b
 #5 [881fe276b730] do_general_protection at 8164cefe
 #6 [881fe276b760] general_protection at 8164c7a8
[exception RIP: mem_cgroup_from_task+22]
RIP: 811db536  RSP: 881fe276b810  RFLAGS: 00010286
RAX: 6b6b6b6b6b6b6b6b  RBX: ea007f90  RCX: 0002
RDX: 0007fa607d67  RSI: 0007fa607d67  RDI: 880fe36d72c0
RBP: 881fe276b880   R8: 0007fa607600   R9: a801fd67b300
R10: 57fdec98cc59ecc0  R11: 880fe2e8dbd0  R12: c9001cb74000
R13: 881fdb8cfda0  R14: 881fe2581570  R15: 0007fa607d67
ORIG_RAX:   CS: 0010  SS: 
 #7 [881fe276b810] page_referenced at 811a6b8a
 #8 [881fe276b888] shrink_page_list at 81180994
 #9 [881fe276b9c0] shrink_inactive_list at 8118166a
#10 [881fe276ba88] shrink_lruvec at 81182135
#11 [881fe276bb88] shrink_zone at 81182596
#12 [881fe276bbe0] do_try_to_free_pages at 81182a90
#13 [881fe276bc58] try_to_free_mem_cgroup_pages at 81182fea
#14 [881fe276bcf0] mem_cgroup_reclaim at 811dd8de
#15 [881fe276bd30] __mem_cgroup_try_charge at 811ddd9c
#16 [881fe276bdf0] __mem_cgroup_try_charge_swapin at 811df62b
#17 [881fe276be28] mem_cgroup_try_charge_swapin at 811e0537
#18 [881fe276be38] handle_mm_fault at 8119abdd
#19 [881fe276bec8] __do_page_fault at 816502d6
#20 [881fe276bf28] do_page_fault at 81650603
#21 [881fe276bf50] page_fault at 8164c808
RIP: 7fdaba456500  RSP: 7fdaaba6c978  RFLAGS: 00010246
RAX:   RBX:   RCX: fbd0
RDX:   RSI: ae80  RDI: 002c
RBP: 7fdaaba6c9f0   R8: 00840c70   R9: 00be
R10: 7fff  R11: 0246  R12: 03622010
R13: ae80  R14: 008274e0  R15: 03622010
ORIG_RAX:   CS: 0033  SS: 002b



mm, we use RCU to access task_struct in mm_match_cgroup(), but do not use RCU free in free_task_struct()

2017-05-23 Thread Xishi Qiu
Hi, I find we use RCU to access task_struct in mm_match_cgroup(), but do not use
RCU free in free_task_struct(); is that right?

Here is the backtrace.

PID: 2133   TASK: 881fe3353300  CPU: 2   COMMAND: "CPU 15/KVM"
 #0 [881fe276b528] machine_kexec at 8105280b
 #1 [881fe276b588] crash_kexec at 810f5072
 #2 [881fe276b658] panic at 8163e23b
 #3 [881fe276b6d8] oops_end at 8164d61b
 #4 [881fe276b700] die at 8101872b
 #5 [881fe276b730] do_general_protection at 8164cefe
 #6 [881fe276b760] general_protection at 8164c7a8
[exception RIP: mem_cgroup_from_task+22]
RIP: 811db536  RSP: 881fe276b810  RFLAGS: 00010286
RAX: 6b6b6b6b6b6b6b6b  RBX: ea007f90  RCX: 0002
RDX: 0007fa607d67  RSI: 0007fa607d67  RDI: 880fe36d72c0
RBP: 881fe276b880   R8: 0007fa607600   R9: a801fd67b300
R10: 57fdec98cc59ecc0  R11: 880fe2e8dbd0  R12: c9001cb74000
R13: 881fdb8cfda0  R14: 881fe2581570  R15: 0007fa607d67
ORIG_RAX:   CS: 0010  SS: 
 #7 [881fe276b810] page_referenced at 811a6b8a
 #8 [881fe276b888] shrink_page_list at 81180994
 #9 [881fe276b9c0] shrink_inactive_list at 8118166a
#10 [881fe276ba88] shrink_lruvec at 81182135
#11 [881fe276bb88] shrink_zone at 81182596
#12 [881fe276bbe0] do_try_to_free_pages at 81182a90
#13 [881fe276bc58] try_to_free_mem_cgroup_pages at 81182fea
#14 [881fe276bcf0] mem_cgroup_reclaim at 811dd8de
#15 [881fe276bd30] __mem_cgroup_try_charge at 811ddd9c
#16 [881fe276bdf0] __mem_cgroup_try_charge_swapin at 811df62b
#17 [881fe276be28] mem_cgroup_try_charge_swapin at 811e0537
#18 [881fe276be38] handle_mm_fault at 8119abdd
#19 [881fe276bec8] __do_page_fault at 816502d6
#20 [881fe276bf28] do_page_fault at 81650603
#21 [881fe276bf50] page_fault at 8164c808
RIP: 7fdaba456500  RSP: 7fdaaba6c978  RFLAGS: 00010246
RAX:   RBX:   RCX: fbd0
RDX:   RSI: ae80  RDI: 002c
RBP: 7fdaaba6c9f0   R8: 00840c70   R9: 00be
R10: 7fff  R11: 0246  R12: 03622010
R13: ae80  R14: 008274e0  R15: 03622010
ORIG_RAX:   CS: 0033  SS: 002b



Re: mm, something wrong in page_lock_anon_vma_read()?

2017-05-22 Thread Xishi Qiu
On 2017/5/23 3:26, Hugh Dickins wrote:

> On Mon, 22 May 2017, Xishi Qiu wrote:
>> On 2017/5/20 10:40, Hugh Dickins wrote:
>>> On Sat, 20 May 2017, Xishi Qiu wrote:
>>>>
>>>> Here is a bug report from redhat: 
>>>> https://bugzilla.redhat.com/show_bug.cgi?id=1305620
>>>> And I meet the bug too. However it is hard to reproduce, and 
>>>> 624483f3ea82598("mm: rmap: fix use-after-free in __put_anon_vma") is not 
>>>> help.
>>>>
>>>> From the vmcore, it seems that the page is still mapped(_mapcount=0 and 
>>>> _count=2),
>>>> and the value of mapping is a valid address(mapping = 0x8801b3e2a101),
>>>> but anon_vma has been corrupted.
>>>>
>>>> Any ideas?
>>>
>>> Sorry, no.  I assume that _mapcount has been misaccounted, for example
>>> a pte mapped in on top of another pte; but cannot begin tell you where
>>
>> Hi Hugh,
>>
>> What does "a pte mapped in on top of another pte" mean? Could you give more 
>> info?
> 
> I mean, there are various places in mm/memory.c which decide what they
> intend to do based on orig_pte, then take pte lock, then check that
> pte_same(pte, orig_pte) before taking it any further.  If a pte_same()
> check were missing (I do not know of any such case), then two racing
> tasks might install the same pte, one on top of the other - page
> mapcount being incremented twice, but decremented only once when
> that pte is finally unmapped later.
> 

Hi Hugh,

Do you mean that the ptes from the two racing tasks point to the same page?
Or do the two racing tasks point to two pages, but one covers the other later?
And the first page may be left alone in the LRU list, and it will never be freed
when the process exits.

We got this info before crash.
[26068.316592] BUG: Bad rss-counter state mm:8800a7de2d80 idx:1 val:1

Thanks,
Xishi Qiu

> Please see similar discussion in the earlier thread at
> marc.info/?l=linux-mm=148222656211837=2
> 
> Hugh
> 
>>
>> Thanks,
>> Xishi Qiu
>>
>>> in Red Hat's kernel-3.10.0-229.4.2.el7 that might happen.
>>>
>>> Hugh
> 
> .
> 





Re: mm, something wrong in page_lock_anon_vma_read()?

2017-05-22 Thread Xishi Qiu
On 2017/5/23 3:26, Hugh Dickins wrote:

> On Mon, 22 May 2017, Xishi Qiu wrote:
>> On 2017/5/20 10:40, Hugh Dickins wrote:
>>> On Sat, 20 May 2017, Xishi Qiu wrote:
>>>>
>>>> Here is a bug report from redhat: 
>>>> https://bugzilla.redhat.com/show_bug.cgi?id=1305620
>>>> And I meet the bug too. However it is hard to reproduce, and 
>>>> 624483f3ea82598("mm: rmap: fix use-after-free in __put_anon_vma") is not 
>>>> help.
>>>>
>>>> From the vmcore, it seems that the page is still mapped(_mapcount=0 and 
>>>> _count=2),
>>>> and the value of mapping is a valid address(mapping = 0x8801b3e2a101),
>>>> but anon_vma has been corrupted.
>>>>
>>>> Any ideas?
>>>
>>> Sorry, no.  I assume that _mapcount has been misaccounted, for example
>>> a pte mapped in on top of another pte; but cannot begin tell you where
>>
>> Hi Hugh,
>>
>> What does "a pte mapped in on top of another pte" mean? Could you give more 
>> info?
> 
> I mean, there are various places in mm/memory.c which decide what they
> intend to do based on orig_pte, then take pte lock, then check that
> pte_same(pte, orig_pte) before taking it any further.  If a pte_same()
> check were missing (I do not know of any such case), then two racing
> tasks might install the same pte, one on top of the other - page
> mapcount being incremented twice, but decremented only once when
> that pte is finally unmapped later.
> 

Hi Hugh,

Do you mean that the ptes from the two racing tasks point to the same page?
Or do the two racing tasks point to two pages, but one covers the other later?
And the first page may be left alone in the LRU list, and it will never be freed
when the process exits.

We got this info before crash.
[26068.316592] BUG: Bad rss-counter state mm:8800a7de2d80 idx:1 val:1

Thanks,
Xishi Qiu

> Please see similar discussion in the earlier thread at
> marc.info/?l=linux-mm=148222656211837=2
> 
> Hugh
> 
>>
>> Thanks,
>> Xishi Qiu
>>
>>> in Red Hat's kernel-3.10.0-229.4.2.el7 that might happen.
>>>
>>> Hugh
> 
> .
> 





Re: mm, something wrong in page_lock_anon_vma_read()?

2017-05-22 Thread Xishi Qiu
On 2017/5/20 10:40, Hugh Dickins wrote:

> On Sat, 20 May 2017, Xishi Qiu wrote:
>>
>> Here is a bug report from redhat: 
>> https://bugzilla.redhat.com/show_bug.cgi?id=1305620
>> And I meet the bug too. However it is hard to reproduce, and 
>> 624483f3ea82598("mm: rmap: fix use-after-free in __put_anon_vma") is not 
>> help.
>>
>> From the vmcore, it seems that the page is still mapped(_mapcount=0 and 
>> _count=2),
>> and the value of mapping is a valid address(mapping = 0x8801b3e2a101),
>> but anon_vma has been corrupted.
>>
>> Any ideas?
> 
> Sorry, no.  I assume that _mapcount has been misaccounted, for example
> a pte mapped in on top of another pte; but cannot begin tell you where

Hi Hugh,

What does "a pte mapped in on top of another pte" mean? Could you give more 
info?

Thanks,
Xishi Qiu

> in Red Hat's kernel-3.10.0-229.4.2.el7 that might happen.
> 
> Hugh
> 
> .
> 





Re: mm, something wrong in page_lock_anon_vma_read()?

2017-05-22 Thread Xishi Qiu
On 2017/5/20 10:40, Hugh Dickins wrote:

> On Sat, 20 May 2017, Xishi Qiu wrote:
>>
>> Here is a bug report from redhat: 
>> https://bugzilla.redhat.com/show_bug.cgi?id=1305620
>> And I meet the bug too. However it is hard to reproduce, and 
>> 624483f3ea82598("mm: rmap: fix use-after-free in __put_anon_vma") is not 
>> help.
>>
>> From the vmcore, it seems that the page is still mapped(_mapcount=0 and 
>> _count=2),
>> and the value of mapping is a valid address(mapping = 0x8801b3e2a101),
>> but anon_vma has been corrupted.
>>
>> Any ideas?
> 
> Sorry, no.  I assume that _mapcount has been misaccounted, for example
> a pte mapped in on top of another pte; but cannot begin tell you where

Hi Hugh,

What does "a pte mapped in on top of another pte" mean? Could you give more 
info?

Thanks,
Xishi Qiu

> in Red Hat's kernel-3.10.0-229.4.2.el7 that might happen.
> 
> Hugh
> 
> .
> 





Re: mm, something wrong in page_lock_anon_vma_read()?

2017-05-19 Thread Xishi Qiu
On 2017/5/20 10:02, Hugh Dickins wrote:

> On Sat, 20 May 2017, Xishi Qiu wrote:
>> On 2017/5/20 6:00, Hugh Dickins wrote:
>>>
>>> You're ignoring the rcu_read_lock() on entry to page_lock_anon_vma_read(),
>>> and the SLAB_DESTROY_BY_RCU (recently renamed SLAB_TYPESAFE_BY_RCU) nature
>>> of the anon_vma_cachep kmem cache.  It is not safe to muck with anon_vma->
>>> root in anon_vma_free(), others could still be looking at it.
>>>
>>> Hugh
>>>
>>
>> Hi Hugh,
>>
>> Thanks for your reply.
>>
>> SLAB_DESTROY_BY_RCU will let it call call_rcu() in free_slab(), but if the
>> anon_vma *reuse* by someone again, access root_anon_vma is not safe, right?
> 
> That is safe, on reuse it is still a struct anon_vma; then the test for
> !page_mapped(page) will show that it's no longer a reliable anon_vma for
> this page, so page_lock_anon_vma_read() returns NULL.
> 
> But of course, if page->_mapcount has been corrupted or misaccounted,
> it may think page_mapped(page) when actually page is not mapped,
> and the anon_vma is not good for it.
> 

Hi Hugh,

Here is a bug report from redhat: 
https://bugzilla.redhat.com/show_bug.cgi?id=1305620
And I hit the bug too. However, it is hard to reproduce, and 
624483f3ea82598("mm: rmap: fix use-after-free in __put_anon_vma") does not help.

>From the vmcore, it seems that the page is still mapped(_mapcount=0 and 
>_count=2),
and the value of mapping is a valid address(mapping = 0x8801b3e2a101),
but anon_vma has been corrupted.

Any ideas?

Thanks,
Xishi Qiu

>>
>> e.g. if I clean the root pointer before free it, then access root_anon_vma
>> in page_lock_anon_vma_read() is NULL pointer access, right?
> 
> Yes, cleaning root pointer before free may result in NULL pointer access.
> 
> Hugh
> 
>>
>> anon_vma_free()
>>  ...
>>  anon_vma->root = NULL;
>>  kmem_cache_free(anon_vma_cachep, anon_vma);
>>  ...
>>
>> Thanks,
>> Xishi Qiu
> 
> .
> 





Re: mm, something wrong in page_lock_anon_vma_read()?

2017-05-19 Thread Xishi Qiu
On 2017/5/20 10:02, Hugh Dickins wrote:

> On Sat, 20 May 2017, Xishi Qiu wrote:
>> On 2017/5/20 6:00, Hugh Dickins wrote:
>>>
>>> You're ignoring the rcu_read_lock() on entry to page_lock_anon_vma_read(),
>>> and the SLAB_DESTROY_BY_RCU (recently renamed SLAB_TYPESAFE_BY_RCU) nature
>>> of the anon_vma_cachep kmem cache.  It is not safe to muck with anon_vma->
>>> root in anon_vma_free(), others could still be looking at it.
>>>
>>> Hugh
>>>
>>
>> Hi Hugh,
>>
>> Thanks for your reply.
>>
>> SLAB_DESTROY_BY_RCU will let it call call_rcu() in free_slab(), but if the
>> anon_vma *reuse* by someone again, access root_anon_vma is not safe, right?
> 
> That is safe, on reuse it is still a struct anon_vma; then the test for
> !page_mapped(page) will show that it's no longer a reliable anon_vma for
> this page, so page_lock_anon_vma_read() returns NULL.
> 
> But of course, if page->_mapcount has been corrupted or misaccounted,
> it may think page_mapped(page) when actually page is not mapped,
> and the anon_vma is not good for it.
> 

Hi Hugh,

Here is a bug report from redhat: 
https://bugzilla.redhat.com/show_bug.cgi?id=1305620
And I hit the bug too. However, it is hard to reproduce, and 
624483f3ea82598("mm: rmap: fix use-after-free in __put_anon_vma") does not help.

>From the vmcore, it seems that the page is still mapped(_mapcount=0 and 
>_count=2),
and the value of mapping is a valid address(mapping = 0x8801b3e2a101),
but anon_vma has been corrupted.

Any ideas?

Thanks,
Xishi Qiu

>>
>> e.g. if I clean the root pointer before free it, then access root_anon_vma
>> in page_lock_anon_vma_read() is NULL pointer access, right?
> 
> Yes, cleaning root pointer before free may result in NULL pointer access.
> 
> Hugh
> 
>>
>> anon_vma_free()
>>  ...
>>  anon_vma->root = NULL;
>>  kmem_cache_free(anon_vma_cachep, anon_vma);
>>  ...
>>
>> Thanks,
>> Xishi Qiu
> 
> .
> 





Re: mm, something wrong in page_lock_anon_vma_read()?

2017-05-19 Thread Xishi Qiu
On 2017/5/20 6:00, Hugh Dickins wrote:

> On Fri, 19 May 2017, Xishi Qiu wrote:
>> On 2017/5/19 16:52, Xishi Qiu wrote:
>>> On 2017/5/18 17:46, Xishi Qiu wrote:
>>>
>>>> Hi, my system triggers this bug, and the vmcore shows the anon_vma seems 
>>>> be freed.
>>>> The kernel is RHEL 7.2, and the bug is hard to reproduce, so I don't know 
>>>> if it
>>>> exists in mainline, any reply is welcome!
>>>>
>>>
>>> When we alloc anon_vma, we will init the value of anon_vma->root,
>>> so can we set anon_vma->root to NULL when calling
>>> anon_vma_free -> kmem_cache_free(anon_vma_cachep, anon_vma);
>>>
>>> anon_vma_free()
>>> ...
>>> anon_vma->root = NULL;
>>> kmem_cache_free(anon_vma_cachep, anon_vma);
>>>
>>> I find if we do this above, system boot failed, why?
>>>
>>
>> If anon_vma was freed, we should not access the root_anon_vma, because it 
>> may also be
>> freed (e.g. anon_vma == root_anon_vma), right?
>>
>> page_lock_anon_vma_read()
>>  ...
>>  anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
>>  root_anon_vma = ACCESS_ONCE(anon_vma->root);
>>  if (down_read_trylock(_anon_vma->rwsem)) {  // it's not safe
>>  ...
>>  if (!atomic_inc_not_zero(_vma->refcount)) {  // check anon_vma was 
>> not freed
>>  ...
>>  anon_vma_lock_read(anon_vma);  // it's safe
>>  ...
> 
> You're ignoring the rcu_read_lock() on entry to page_lock_anon_vma_read(),
> and the SLAB_DESTROY_BY_RCU (recently renamed SLAB_TYPESAFE_BY_RCU) nature
> of the anon_vma_cachep kmem cache.  It is not safe to muck with anon_vma->
> root in anon_vma_free(), others could still be looking at it.
> 
> Hugh
> 

Hi Hugh,

Thanks for your reply.

SLAB_DESTROY_BY_RCU will let it call call_rcu() in free_slab(), but if the
anon_vma *reuse* by someone again, access root_anon_vma is not safe, right?

e.g. if I clean the root pointer before free it, then access root_anon_vma
in page_lock_anon_vma_read() is NULL pointer access, right?

anon_vma_free()
...
anon_vma->root = NULL;
kmem_cache_free(anon_vma_cachep, anon_vma);
...

Thanks,
Xishi Qiu

> .
> 





Re: mm, something wrong in page_lock_anon_vma_read()?

2017-05-19 Thread Xishi Qiu
On 2017/5/20 6:00, Hugh Dickins wrote:

> On Fri, 19 May 2017, Xishi Qiu wrote:
>> On 2017/5/19 16:52, Xishi Qiu wrote:
>>> On 2017/5/18 17:46, Xishi Qiu wrote:
>>>
>>>> Hi, my system triggers this bug, and the vmcore shows the anon_vma seems 
>>>> be freed.
>>>> The kernel is RHEL 7.2, and the bug is hard to reproduce, so I don't know 
>>>> if it
>>>> exists in mainline, any reply is welcome!
>>>>
>>>
>>> When we alloc anon_vma, we will init the value of anon_vma->root,
>>> so can we set anon_vma->root to NULL when calling
>>> anon_vma_free -> kmem_cache_free(anon_vma_cachep, anon_vma);
>>>
>>> anon_vma_free()
>>> ...
>>> anon_vma->root = NULL;
>>> kmem_cache_free(anon_vma_cachep, anon_vma);
>>>
>>> I find if we do this above, system boot failed, why?
>>>
>>
>> If anon_vma was freed, we should not access the root_anon_vma, because it 
>> may also be
>> freed (e.g. anon_vma == root_anon_vma), right?
>>
>> page_lock_anon_vma_read()
>>  ...
>>  anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
>>  root_anon_vma = ACCESS_ONCE(anon_vma->root);
>>  if (down_read_trylock(_anon_vma->rwsem)) {  // it's not safe
>>  ...
>>  if (!atomic_inc_not_zero(_vma->refcount)) {  // check anon_vma was 
>> not freed
>>  ...
>>  anon_vma_lock_read(anon_vma);  // it's safe
>>  ...
> 
> You're ignoring the rcu_read_lock() on entry to page_lock_anon_vma_read(),
> and the SLAB_DESTROY_BY_RCU (recently renamed SLAB_TYPESAFE_BY_RCU) nature
> of the anon_vma_cachep kmem cache.  It is not safe to muck with anon_vma->
> root in anon_vma_free(), others could still be looking at it.
> 
> Hugh
> 

Hi Hugh,

Thanks for your reply.

SLAB_DESTROY_BY_RCU will let it call call_rcu() in free_slab(), but if the
anon_vma *reuse* by someone again, access root_anon_vma is not safe, right?

e.g. if I clean the root pointer before free it, then access root_anon_vma
in page_lock_anon_vma_read() is NULL pointer access, right?

anon_vma_free()
...
anon_vma->root = NULL;
kmem_cache_free(anon_vma_cachep, anon_vma);
...

Thanks,
Xishi Qiu

> .
> 





Re: mm, something wrong in page_lock_anon_vma_read()?

2017-05-19 Thread Xishi Qiu
On 2017/5/19 16:52, Xishi Qiu wrote:

> On 2017/5/18 17:46, Xishi Qiu wrote:
> 
>> Hi, my system triggers this bug, and the vmcore shows the anon_vma seems be 
>> freed.
>> The kernel is RHEL 7.2, and the bug is hard to reproduce, so I don't know if 
>> it
>> exists in mainline, any reply is welcome!
>>
> 
> When we alloc anon_vma, we will init the value of anon_vma->root,
> so can we set anon_vma->root to NULL when calling
> anon_vma_free -> kmem_cache_free(anon_vma_cachep, anon_vma);
> 
> anon_vma_free()
>   ...
>   anon_vma->root = NULL;
>   kmem_cache_free(anon_vma_cachep, anon_vma);
> 
> I find if we do this above, system boot failed, why?
> 

If anon_vma was freed, we should not access the root_anon_vma, because it 
may also be
freed (e.g. anon_vma == root_anon_vma), right?

page_lock_anon_vma_read()
...
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
root_anon_vma = ACCESS_ONCE(anon_vma->root);
if (down_read_trylock(_anon_vma->rwsem)) {  // it's not safe
...
if (!atomic_inc_not_zero(_vma->refcount)) {  // check anon_vma was 
not freed
...
anon_vma_lock_read(anon_vma);  // it's safe
...


> Thanks,
> Xishi Qiu
> 
>> [35030.332666] general protection fault:  [#1] SMP
>> [35030.333016] Modules linked in: veth ipt_MASQUERADE nf_nat_masquerade_ipv4 
>> iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 xt_addrtype 
>> iptable_filter xt_conntrack nf_nat nf_conntrack bridge stp llc dm_thin_pool 
>> dm_persistent_data dm_bio_prison dm_bufio libcrc32c rtos_kbox_panic(OE) 
>> ipmi_devintf ipmi_si ipmi_msghandler signo_catch(O) cirrus syscopyarea 
>> sysfillrect sysimgblt ttm crc32_pclmul ghash_clmulni_intel drm_kms_helper 
>> aesni_intel ppdev drm lrw gf128mul parport_pc glue_helper ablk_helper 
>> serio_raw cryptd i2c_piix4 parport pcspkr sg floppy i2c_core dm_mod 
>> sha512_generic ip_tables sd_mod crc_t10dif crct10dif_generic sr_mod cdrom 
>> virtio_console virtio_scsi virtio_net ata_generic pata_acpi crct10dif_pclmul 
>> crct10dif_common crc32c_intel virtio_pci virtio_ring virtio ata_piix libata 
>> ext4 mbcache
>> [35030.333016]  jbd2
>> [35030.333016] CPU: 3 PID: 48 Comm: kswapd0 Tainted: G   OE   
>> ---   3.10.0-327.36.58.4.x86_64 #1
>> [35030.333016] Hardware name: OpenStack Foundation OpenStack Nova, BIOS 
>> rel-1.8.1-0-g4adadbd-20160826_03-hghoulaslx112 04/01/2014
>> [35030.333016] task: 8801b2d2 ti: 8801b4c38000 task.ti: 
>> 8801b4c38000
>> [35030.333016] RIP: 0010:[]  [] 
>> down_read_trylock+0x5/0x50
>> [35030.333016] RSP: :8801b4c3ba90  EFLAGS: 00010282
>> [35030.333016] RAX:  RBX: 8801b3e2a100 RCX: 
>> 
>> [35030.333016] RDX:  RSI:  RDI: 
>> deb604d497705c5d
>> [35030.333016] RBP: 8801b4c3bab8 R08: ea0002c34460 R09: 
>> 8801b3d7e8a0
>> [35030.333016] R10: 0004 R11: fff0fe00 R12: 
>> 8801b3e2a101
>> [35030.333016] R13: ea0002c34440 R14: deb604d497705c5d R15: 
>> ea0002c34440
>> [35030.333016] FS:  () GS:8801bed8() 
>> knlGS:
>> [35030.333016] CS:  0010 DS:  ES:  CR0: 80050033
>> [35030.333016] CR2: 00c422011080 CR3: 01976000 CR4: 
>> 001407e0
>> [35030.333016] DR0:  DR1:  DR2: 
>> 
>> [35030.333016] DR3:  DR6: 0ff0 DR7: 
>> 0400
>> [35030.333016] Stack:
>> [35030.333016]  811b2795 ea0002c34440  
>> 000f
>> [35030.333016]  0001 8801b4c3bb30 811b2a17 
>> 8800a712d640
>> [35030.333016]  0c4229e2 8801b4c3bb80 0001 
>> 0c41fe38
>> [35030.333016] Call Trace:
>> [35030.333016]  [] ? page_lock_anon_vma_read+0x55/0x110
>> [35030.333016]  [] page_referenced+0x1c7/0x350
>> [35030.333016]  [] shrink_active_list+0x1e4/0x400
>> [35030.333016]  [] shrink_lruvec+0x4bd/0x770
>> [35030.333016]  [] shrink_zone+0x76/0x1a0
>> [35030.333016]  [] balance_pgdat+0x49c/0x610
>> [35030.333016]  [] kswapd+0x173/0x450
>> [35030.333016]  [] ? wake_up_atomic_t+0x30/0x30
>> [35030.333016]  [] ? balance_pgdat+0x610/0x610
>> [35030.333016]  [] kthread+0xcf/0xe0
>> [35030.333016]  [] ? kthread_create_on_node+0x120/0x120
>> [35030.333016]  [] ret_from_fork+0x58/0x90
>> [35030.333016]  [] ? kthread_create_on_node+

Re: mm, something wrong in page_lock_anon_vma_read()?

2017-05-19 Thread Xishi Qiu
On 2017/5/19 16:52, Xishi Qiu wrote:

> On 2017/5/18 17:46, Xishi Qiu wrote:
> 
>> Hi, my system triggers this bug, and the vmcore shows the anon_vma seems be 
>> freed.
>> The kernel is RHEL 7.2, and the bug is hard to reproduce, so I don't know if 
>> it
>> exists in mainline, any reply is welcome!
>>
> 
> When we alloc anon_vma, we will init the value of anon_vma->root,
> so can we set anon_vma->root to NULL when calling
> anon_vma_free -> kmem_cache_free(anon_vma_cachep, anon_vma);
> 
> anon_vma_free()
>   ...
>   anon_vma->root = NULL;
>   kmem_cache_free(anon_vma_cachep, anon_vma);
> 
> I find if we do this above, system boot failed, why?
> 

If anon_vma was freed, we should not access the root_anon_vma, because it 
may also be
freed (e.g. anon_vma == root_anon_vma), right?

page_lock_anon_vma_read()
...
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
root_anon_vma = ACCESS_ONCE(anon_vma->root);
if (down_read_trylock(_anon_vma->rwsem)) {  // it's not safe
...
if (!atomic_inc_not_zero(_vma->refcount)) {  // check anon_vma was 
not freed
...
anon_vma_lock_read(anon_vma);  // it's safe
...


> Thanks,
> Xishi Qiu
> 
>> [35030.332666] general protection fault:  [#1] SMP
>> [35030.333016] Modules linked in: veth ipt_MASQUERADE nf_nat_masquerade_ipv4 
>> iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 xt_addrtype 
>> iptable_filter xt_conntrack nf_nat nf_conntrack bridge stp llc dm_thin_pool 
>> dm_persistent_data dm_bio_prison dm_bufio libcrc32c rtos_kbox_panic(OE) 
>> ipmi_devintf ipmi_si ipmi_msghandler signo_catch(O) cirrus syscopyarea 
>> sysfillrect sysimgblt ttm crc32_pclmul ghash_clmulni_intel drm_kms_helper 
>> aesni_intel ppdev drm lrw gf128mul parport_pc glue_helper ablk_helper 
>> serio_raw cryptd i2c_piix4 parport pcspkr sg floppy i2c_core dm_mod 
>> sha512_generic ip_tables sd_mod crc_t10dif crct10dif_generic sr_mod cdrom 
>> virtio_console virtio_scsi virtio_net ata_generic pata_acpi crct10dif_pclmul 
>> crct10dif_common crc32c_intel virtio_pci virtio_ring virtio ata_piix libata 
>> ext4 mbcache
>> [35030.333016]  jbd2
>> [35030.333016] CPU: 3 PID: 48 Comm: kswapd0 Tainted: G   OE   
>> ---   3.10.0-327.36.58.4.x86_64 #1
>> [35030.333016] Hardware name: OpenStack Foundation OpenStack Nova, BIOS 
>> rel-1.8.1-0-g4adadbd-20160826_03-hghoulaslx112 04/01/2014
>> [35030.333016] task: 8801b2d2 ti: 8801b4c38000 task.ti: 
>> 8801b4c38000
>> [35030.333016] RIP: 0010:[]  [] 
>> down_read_trylock+0x5/0x50
>> [35030.333016] RSP: :8801b4c3ba90  EFLAGS: 00010282
>> [35030.333016] RAX:  RBX: 8801b3e2a100 RCX: 
>> 
>> [35030.333016] RDX:  RSI:  RDI: 
>> deb604d497705c5d
>> [35030.333016] RBP: 8801b4c3bab8 R08: ea0002c34460 R09: 
>> 8801b3d7e8a0
>> [35030.333016] R10: 0004 R11: fff0fe00 R12: 
>> 8801b3e2a101
>> [35030.333016] R13: ea0002c34440 R14: deb604d497705c5d R15: 
>> ea0002c34440
>> [35030.333016] FS:  () GS:8801bed8() 
>> knlGS:
>> [35030.333016] CS:  0010 DS:  ES:  CR0: 80050033
>> [35030.333016] CR2: 00c422011080 CR3: 01976000 CR4: 
>> 001407e0
>> [35030.333016] DR0:  DR1:  DR2: 
>> 
>> [35030.333016] DR3:  DR6: 0ff0 DR7: 
>> 0400
>> [35030.333016] Stack:
>> [35030.333016]  811b2795 ea0002c34440  
>> 000f
>> [35030.333016]  0001 8801b4c3bb30 811b2a17 
>> 8800a712d640
>> [35030.333016]  0c4229e2 8801b4c3bb80 0001 
>> 0c41fe38
>> [35030.333016] Call Trace:
>> [35030.333016]  [] ? page_lock_anon_vma_read+0x55/0x110
>> [35030.333016]  [] page_referenced+0x1c7/0x350
>> [35030.333016]  [] shrink_active_list+0x1e4/0x400
>> [35030.333016]  [] shrink_lruvec+0x4bd/0x770
>> [35030.333016]  [] shrink_zone+0x76/0x1a0
>> [35030.333016]  [] balance_pgdat+0x49c/0x610
>> [35030.333016]  [] kswapd+0x173/0x450
>> [35030.333016]  [] ? wake_up_atomic_t+0x30/0x30
>> [35030.333016]  [] ? balance_pgdat+0x610/0x610
>> [35030.333016]  [] kthread+0xcf/0xe0
>> [35030.333016]  [] ? kthread_create_on_node+0x120/0x120
>> [35030.333016]  [] ret_from_fork+0x58/0x90
>> [35030.333016]  [] ? kthread_create_on_node+

Re: mm, something wrong in page_lock_anon_vma_read()?

2017-05-19 Thread Xishi Qiu
On 2017/5/18 17:46, Xishi Qiu wrote:

> Hi, my system triggers this bug, and the vmcore shows the anon_vma seems be 
> freed.
> The kernel is RHEL 7.2, and the bug is hard to reproduce, so I don't know if 
> it
> exists in mainline, any reply is welcome!
> 

When we alloc anon_vma, we will init the value of anon_vma->root,
so can we set anon_vma->root to NULL when calling
anon_vma_free -> kmem_cache_free(anon_vma_cachep, anon_vma);

anon_vma_free()
...
anon_vma->root = NULL;
kmem_cache_free(anon_vma_cachep, anon_vma);

I find if we do this above, system boot failed, why?

Thanks,
Xishi Qiu

> [35030.332666] general protection fault:  [#1] SMP
> [35030.333016] Modules linked in: veth ipt_MASQUERADE nf_nat_masquerade_ipv4 
> iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 xt_addrtype 
> iptable_filter xt_conntrack nf_nat nf_conntrack bridge stp llc dm_thin_pool 
> dm_persistent_data dm_bio_prison dm_bufio libcrc32c rtos_kbox_panic(OE) 
> ipmi_devintf ipmi_si ipmi_msghandler signo_catch(O) cirrus syscopyarea 
> sysfillrect sysimgblt ttm crc32_pclmul ghash_clmulni_intel drm_kms_helper 
> aesni_intel ppdev drm lrw gf128mul parport_pc glue_helper ablk_helper 
> serio_raw cryptd i2c_piix4 parport pcspkr sg floppy i2c_core dm_mod 
> sha512_generic ip_tables sd_mod crc_t10dif crct10dif_generic sr_mod cdrom 
> virtio_console virtio_scsi virtio_net ata_generic pata_acpi crct10dif_pclmul 
> crct10dif_common crc32c_intel virtio_pci virtio_ring virtio ata_piix libata 
> ext4 mbcache
> [35030.333016]  jbd2
> [35030.333016] CPU: 3 PID: 48 Comm: kswapd0 Tainted: G   OE   
> ---   3.10.0-327.36.58.4.x86_64 #1
> [35030.333016] Hardware name: OpenStack Foundation OpenStack Nova, BIOS 
> rel-1.8.1-0-g4adadbd-20160826_03-hghoulaslx112 04/01/2014
> [35030.333016] task: 8801b2d2 ti: 8801b4c38000 task.ti: 
> 8801b4c38000
> [35030.333016] RIP: 0010:[]  [] 
> down_read_trylock+0x5/0x50
> [35030.333016] RSP: :8801b4c3ba90  EFLAGS: 00010282
> [35030.333016] RAX:  RBX: 8801b3e2a100 RCX: 
> 
> [35030.333016] RDX:  RSI:  RDI: 
> deb604d497705c5d
> [35030.333016] RBP: 8801b4c3bab8 R08: ea0002c34460 R09: 
> 8801b3d7e8a0
> [35030.333016] R10: 0004 R11: fff0fe00 R12: 
> 8801b3e2a101
> [35030.333016] R13: ea0002c34440 R14: deb604d497705c5d R15: 
> ea0002c34440
> [35030.333016] FS:  () GS:8801bed8() 
> knlGS:
> [35030.333016] CS:  0010 DS:  ES:  CR0: 80050033
> [35030.333016] CR2: 00c422011080 CR3: 01976000 CR4: 
> 001407e0
> [35030.333016] DR0:  DR1:  DR2: 
> 
> [35030.333016] DR3:  DR6: 0ff0 DR7: 
> 0400
> [35030.333016] Stack:
> [35030.333016]  811b2795 ea0002c34440  
> 000f
> [35030.333016]  0001 8801b4c3bb30 811b2a17 
> 8800a712d640
> [35030.333016]  0c4229e2 8801b4c3bb80 0001 
> 0c41fe38
> [35030.333016] Call Trace:
> [35030.333016]  [] ? page_lock_anon_vma_read+0x55/0x110
> [35030.333016]  [] page_referenced+0x1c7/0x350
> [35030.333016]  [] shrink_active_list+0x1e4/0x400
> [35030.333016]  [] shrink_lruvec+0x4bd/0x770
> [35030.333016]  [] shrink_zone+0x76/0x1a0
> [35030.333016]  [] balance_pgdat+0x49c/0x610
> [35030.333016]  [] kswapd+0x173/0x450
> [35030.333016]  [] ? wake_up_atomic_t+0x30/0x30
> [35030.333016]  [] ? balance_pgdat+0x610/0x610
> [35030.333016]  [] kthread+0xcf/0xe0
> [35030.333016]  [] ? kthread_create_on_node+0x120/0x120
> [35030.333016]  [] ret_from_fork+0x58/0x90
> [35030.333016]  [] ? kthread_create_on_node+0x120/0x120
> [35030.333016] Code: 00 ba ff ff ff ff 48 89 d8 f0 48 0f c1 10 79 05 e8 31 06 
> 27 00 5b 5d c3 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 <48> 
> 8b 07 48 89 c2 48 83 c2 01 7e 07 f0 48 0f b1 17 75 f0 48 f7
> [35030.333016] RIP  [] down_read_trylock+0x5/0x50
> [35030.333016]  RSP 
> [35030.333016] [ cut here ]
> 
> struct page {
>   flags = 9007194960298056,
>   mapping = 0x8801b3e2a101,
>   {
> {
>   index = 34324593617,
>   freelist = 0x7fde7bbd1,
>   pfmemalloc = 209,
>   thp_mmu_gather = {
> counter = -35144751
>   },
>   pmd_huge_pte = 0x7fde7bbd1
> },
> {
>   counters = 8589934592,
>   {
> {
>   _mapcount = {
> counter = 0
>   },
>   {
> inuse = 0,
> objects = 0,
>  

Re: mm, something wring in page_lock_anon_vma_read()?

2017-05-19 Thread Xishi Qiu
On 2017/5/18 17:46, Xishi Qiu wrote:

> Hi, my system triggers this bug, and the vmcore shows the anon_vma seems be 
> freed.
> The kernel is RHEL 7.2, and the bug is hard to reproduce, so I don't know if 
> it
> exists in mainline, any reply is welcome!
> 

When we alloc anon_vma, we will init the value of anon_vma->root,
so can we set anon_vma->root to NULL when calling
anon_vma_free -> kmem_cache_free(anon_vma_cachep, anon_vma);

anon_vma_free()
...
anon_vma->root = NULL;
kmem_cache_free(anon_vma_cachep, anon_vma);

I find if we do this above, system boot failed, why?

Thanks,
Xishi Qiu

> [35030.332666] general protection fault:  [#1] SMP
> [35030.333016] Modules linked in: veth ipt_MASQUERADE nf_nat_masquerade_ipv4 
> iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 xt_addrtype 
> iptable_filter xt_conntrack nf_nat nf_conntrack bridge stp llc dm_thin_pool 
> dm_persistent_data dm_bio_prison dm_bufio libcrc32c rtos_kbox_panic(OE) 
> ipmi_devintf ipmi_si ipmi_msghandler signo_catch(O) cirrus syscopyarea 
> sysfillrect sysimgblt ttm crc32_pclmul ghash_clmulni_intel drm_kms_helper 
> aesni_intel ppdev drm lrw gf128mul parport_pc glue_helper ablk_helper 
> serio_raw cryptd i2c_piix4 parport pcspkr sg floppy i2c_core dm_mod 
> sha512_generic ip_tables sd_mod crc_t10dif crct10dif_generic sr_mod cdrom 
> virtio_console virtio_scsi virtio_net ata_generic pata_acpi crct10dif_pclmul 
> crct10dif_common crc32c_intel virtio_pci virtio_ring virtio ata_piix libata 
> ext4 mbcache
> [35030.333016]  jbd2
> [35030.333016] CPU: 3 PID: 48 Comm: kswapd0 Tainted: G   OE   
> ---   3.10.0-327.36.58.4.x86_64 #1
> [35030.333016] Hardware name: OpenStack Foundation OpenStack Nova, BIOS 
> rel-1.8.1-0-g4adadbd-20160826_03-hghoulaslx112 04/01/2014
> [35030.333016] task: 8801b2d2 ti: 8801b4c38000 task.ti: 
> 8801b4c38000
> [35030.333016] RIP: 0010:[]  [] 
> down_read_trylock+0x5/0x50
> [35030.333016] RSP: :8801b4c3ba90  EFLAGS: 00010282
> [35030.333016] RAX:  RBX: 8801b3e2a100 RCX: 
> 
> [35030.333016] RDX:  RSI:  RDI: 
> deb604d497705c5d
> [35030.333016] RBP: 8801b4c3bab8 R08: ea0002c34460 R09: 
> 8801b3d7e8a0
> [35030.333016] R10: 0004 R11: fff0fe00 R12: 
> 8801b3e2a101
> [35030.333016] R13: ea0002c34440 R14: deb604d497705c5d R15: 
> ea0002c34440
> [35030.333016] FS:  () GS:8801bed8() 
> knlGS:
> [35030.333016] CS:  0010 DS:  ES:  CR0: 80050033
> [35030.333016] CR2: 00c422011080 CR3: 01976000 CR4: 
> 001407e0
> [35030.333016] DR0:  DR1:  DR2: 
> 
> [35030.333016] DR3:  DR6: 0ff0 DR7: 
> 0400
> [35030.333016] Stack:
> [35030.333016]  811b2795 ea0002c34440  
> 000f
> [35030.333016]  0001 8801b4c3bb30 811b2a17 
> 8800a712d640
> [35030.333016]  0c4229e2 8801b4c3bb80 0001 
> 0c41fe38
> [35030.333016] Call Trace:
> [35030.333016]  [] ? page_lock_anon_vma_read+0x55/0x110
> [35030.333016]  [] page_referenced+0x1c7/0x350
> [35030.333016]  [] shrink_active_list+0x1e4/0x400
> [35030.333016]  [] shrink_lruvec+0x4bd/0x770
> [35030.333016]  [] shrink_zone+0x76/0x1a0
> [35030.333016]  [] balance_pgdat+0x49c/0x610
> [35030.333016]  [] kswapd+0x173/0x450
> [35030.333016]  [] ? wake_up_atomic_t+0x30/0x30
> [35030.333016]  [] ? balance_pgdat+0x610/0x610
> [35030.333016]  [] kthread+0xcf/0xe0
> [35030.333016]  [] ? kthread_create_on_node+0x120/0x120
> [35030.333016]  [] ret_from_fork+0x58/0x90
> [35030.333016]  [] ? kthread_create_on_node+0x120/0x120
> [35030.333016] Code: 00 ba ff ff ff ff 48 89 d8 f0 48 0f c1 10 79 05 e8 31 06 
> 27 00 5b 5d c3 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 <48> 
> 8b 07 48 89 c2 48 83 c2 01 7e 07 f0 48 0f b1 17 75 f0 48 f7
> [35030.333016] RIP  [] down_read_trylock+0x5/0x50
> [35030.333016]  RSP 
> [35030.333016] [ cut here ]
> 
> struct page {
>   flags = 9007194960298056,
>   mapping = 0x8801b3e2a101,
>   {
> {
>   index = 34324593617,
>   freelist = 0x7fde7bbd1,
>   pfmemalloc = 209,
>   thp_mmu_gather = {
> counter = -35144751
>   },
>   pmd_huge_pte = 0x7fde7bbd1
> },
> {
>   counters = 8589934592,
>   {
> {
>   _mapcount = {
> counter = 0
>   },
>   {
> inuse = 0,
> objects = 0,
>  

mm, something wrong in page_lock_anon_vma_read()?

2017-05-18 Thread Xishi Qiu
Hi, my system triggers this bug, and the vmcore shows the anon_vma seems to be 
freed.
The kernel is RHEL 7.2, and the bug is hard to reproduce, so I don't know if it
exists in mainline, any reply is welcome!

[35030.332666] general protection fault:  [#1] SMP
[35030.333016] Modules linked in: veth ipt_MASQUERADE nf_nat_masquerade_ipv4 
iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 xt_addrtype 
iptable_filter xt_conntrack nf_nat nf_conntrack bridge stp llc dm_thin_pool 
dm_persistent_data dm_bio_prison dm_bufio libcrc32c rtos_kbox_panic(OE) 
ipmi_devintf ipmi_si ipmi_msghandler signo_catch(O) cirrus syscopyarea 
sysfillrect sysimgblt ttm crc32_pclmul ghash_clmulni_intel drm_kms_helper 
aesni_intel ppdev drm lrw gf128mul parport_pc glue_helper ablk_helper serio_raw 
cryptd i2c_piix4 parport pcspkr sg floppy i2c_core dm_mod sha512_generic 
ip_tables sd_mod crc_t10dif crct10dif_generic sr_mod cdrom virtio_console 
virtio_scsi virtio_net ata_generic pata_acpi crct10dif_pclmul crct10dif_common 
crc32c_intel virtio_pci virtio_ring virtio ata_piix libata ext4 mbcache
[35030.333016]  jbd2
[35030.333016] CPU: 3 PID: 48 Comm: kswapd0 Tainted: G   OE   
---   3.10.0-327.36.58.4.x86_64 #1
[35030.333016] Hardware name: OpenStack Foundation OpenStack Nova, BIOS 
rel-1.8.1-0-g4adadbd-20160826_03-hghoulaslx112 04/01/2014
[35030.333016] task: 8801b2d2 ti: 8801b4c38000 task.ti: 
8801b4c38000
[35030.333016] RIP: 0010:[]  [] 
down_read_trylock+0x5/0x50
[35030.333016] RSP: :8801b4c3ba90  EFLAGS: 00010282
[35030.333016] RAX:  RBX: 8801b3e2a100 RCX: 
[35030.333016] RDX:  RSI:  RDI: deb604d497705c5d
[35030.333016] RBP: 8801b4c3bab8 R08: ea0002c34460 R09: 8801b3d7e8a0
[35030.333016] R10: 0004 R11: fff0fe00 R12: 8801b3e2a101
[35030.333016] R13: ea0002c34440 R14: deb604d497705c5d R15: ea0002c34440
[35030.333016] FS:  () GS:8801bed8() 
knlGS:
[35030.333016] CS:  0010 DS:  ES:  CR0: 80050033
[35030.333016] CR2: 00c422011080 CR3: 01976000 CR4: 001407e0
[35030.333016] DR0:  DR1:  DR2: 
[35030.333016] DR3:  DR6: 0ff0 DR7: 0400
[35030.333016] Stack:
[35030.333016]  811b2795 ea0002c34440  
000f
[35030.333016]  0001 8801b4c3bb30 811b2a17 
8800a712d640
[35030.333016]  0c4229e2 8801b4c3bb80 0001 
0c41fe38
[35030.333016] Call Trace:
[35030.333016]  [] ? page_lock_anon_vma_read+0x55/0x110
[35030.333016]  [] page_referenced+0x1c7/0x350
[35030.333016]  [] shrink_active_list+0x1e4/0x400
[35030.333016]  [] shrink_lruvec+0x4bd/0x770
[35030.333016]  [] shrink_zone+0x76/0x1a0
[35030.333016]  [] balance_pgdat+0x49c/0x610
[35030.333016]  [] kswapd+0x173/0x450
[35030.333016]  [] ? wake_up_atomic_t+0x30/0x30
[35030.333016]  [] ? balance_pgdat+0x610/0x610
[35030.333016]  [] kthread+0xcf/0xe0
[35030.333016]  [] ? kthread_create_on_node+0x120/0x120
[35030.333016]  [] ret_from_fork+0x58/0x90
[35030.333016]  [] ? kthread_create_on_node+0x120/0x120
[35030.333016] Code: 00 ba ff ff ff ff 48 89 d8 f0 48 0f c1 10 79 05 e8 31 06 
27 00 5b 5d c3 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 <48> 8b 
07 48 89 c2 48 83 c2 01 7e 07 f0 48 0f b1 17 75 f0 48 f7
[35030.333016] RIP  [] down_read_trylock+0x5/0x50
[35030.333016]  RSP 
[35030.333016] [ cut here ]

struct page {
  flags = 9007194960298056,
  mapping = 0x8801b3e2a101,
  {
{
  index = 34324593617,
  freelist = 0x7fde7bbd1,
  pfmemalloc = 209,
  thp_mmu_gather = {
counter = -35144751
  },
  pmd_huge_pte = 0x7fde7bbd1
},
{
  counters = 8589934592,
  {
{
  _mapcount = {
counter = 0
  },
  {
inuse = 0,
objects = 0,
frozen = 0
  },
  units = 0
},
_count = {
  counter = 2
}
  }
}
  },
  {
lru = {
  next = 0xdead00100100,
  prev = 0xdead00200200
},
{
  next = 0xdead00100100,
  pages = 2097664,
  pobjects = -559087616
},
list = {
  next = 0xdead00100100,
  prev = 0xdead00200200
},
slab_page = 0xdead00100100
  },
  {
private = 0,
ptl = {
  {
rlock = {
  raw_lock = {
{
  head_tail = 0,
  tickets = {
head = 0,
tail = 0
  }
}
  }
}
  }
},
slab_cache = 0x0,
first_page = 0x0
  }
}



crash> struct anon_vma 0x8801b3e2a100
struct anon_vma {
  root = 0xdeb604d497705c55,
  rwsem = {
count = -8192007903225070328,
wait_lock = {
  

mm, something wrong in page_lock_anon_vma_read()?

2017-05-18 Thread Xishi Qiu
Hi, my system triggers this bug, and the vmcore shows the anon_vma seems to be 
freed.
The kernel is RHEL 7.2, and the bug is hard to reproduce, so I don't know if it
exists in mainline, any reply is welcome!

[35030.332666] general protection fault:  [#1] SMP
[35030.333016] Modules linked in: veth ipt_MASQUERADE nf_nat_masquerade_ipv4 
iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 xt_addrtype 
iptable_filter xt_conntrack nf_nat nf_conntrack bridge stp llc dm_thin_pool 
dm_persistent_data dm_bio_prison dm_bufio libcrc32c rtos_kbox_panic(OE) 
ipmi_devintf ipmi_si ipmi_msghandler signo_catch(O) cirrus syscopyarea 
sysfillrect sysimgblt ttm crc32_pclmul ghash_clmulni_intel drm_kms_helper 
aesni_intel ppdev drm lrw gf128mul parport_pc glue_helper ablk_helper serio_raw 
cryptd i2c_piix4 parport pcspkr sg floppy i2c_core dm_mod sha512_generic 
ip_tables sd_mod crc_t10dif crct10dif_generic sr_mod cdrom virtio_console 
virtio_scsi virtio_net ata_generic pata_acpi crct10dif_pclmul crct10dif_common 
crc32c_intel virtio_pci virtio_ring virtio ata_piix libata ext4 mbcache
[35030.333016]  jbd2
[35030.333016] CPU: 3 PID: 48 Comm: kswapd0 Tainted: G   OE   
---   3.10.0-327.36.58.4.x86_64 #1
[35030.333016] Hardware name: OpenStack Foundation OpenStack Nova, BIOS 
rel-1.8.1-0-g4adadbd-20160826_03-hghoulaslx112 04/01/2014
[35030.333016] task: 8801b2d2 ti: 8801b4c38000 task.ti: 
8801b4c38000
[35030.333016] RIP: 0010:[]  [] 
down_read_trylock+0x5/0x50
[35030.333016] RSP: :8801b4c3ba90  EFLAGS: 00010282
[35030.333016] RAX:  RBX: 8801b3e2a100 RCX: 
[35030.333016] RDX:  RSI:  RDI: deb604d497705c5d
[35030.333016] RBP: 8801b4c3bab8 R08: ea0002c34460 R09: 8801b3d7e8a0
[35030.333016] R10: 0004 R11: fff0fe00 R12: 8801b3e2a101
[35030.333016] R13: ea0002c34440 R14: deb604d497705c5d R15: ea0002c34440
[35030.333016] FS:  () GS:8801bed8() 
knlGS:
[35030.333016] CS:  0010 DS:  ES:  CR0: 80050033
[35030.333016] CR2: 00c422011080 CR3: 01976000 CR4: 001407e0
[35030.333016] DR0:  DR1:  DR2: 
[35030.333016] DR3:  DR6: 0ff0 DR7: 0400
[35030.333016] Stack:
[35030.333016]  811b2795 ea0002c34440  
000f
[35030.333016]  0001 8801b4c3bb30 811b2a17 
8800a712d640
[35030.333016]  0c4229e2 8801b4c3bb80 0001 
0c41fe38
[35030.333016] Call Trace:
[35030.333016]  [] ? page_lock_anon_vma_read+0x55/0x110
[35030.333016]  [] page_referenced+0x1c7/0x350
[35030.333016]  [] shrink_active_list+0x1e4/0x400
[35030.333016]  [] shrink_lruvec+0x4bd/0x770
[35030.333016]  [] shrink_zone+0x76/0x1a0
[35030.333016]  [] balance_pgdat+0x49c/0x610
[35030.333016]  [] kswapd+0x173/0x450
[35030.333016]  [] ? wake_up_atomic_t+0x30/0x30
[35030.333016]  [] ? balance_pgdat+0x610/0x610
[35030.333016]  [] kthread+0xcf/0xe0
[35030.333016]  [] ? kthread_create_on_node+0x120/0x120
[35030.333016]  [] ret_from_fork+0x58/0x90
[35030.333016]  [] ? kthread_create_on_node+0x120/0x120
[35030.333016] Code: 00 ba ff ff ff ff 48 89 d8 f0 48 0f c1 10 79 05 e8 31 06 
27 00 5b 5d c3 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 <48> 8b 
07 48 89 c2 48 83 c2 01 7e 07 f0 48 0f b1 17 75 f0 48 f7
[35030.333016] RIP  [] down_read_trylock+0x5/0x50
[35030.333016]  RSP 
[35030.333016] [ cut here ]

struct page {
  flags = 9007194960298056,
  mapping = 0x8801b3e2a101,
  {
{
  index = 34324593617,
  freelist = 0x7fde7bbd1,
  pfmemalloc = 209,
  thp_mmu_gather = {
counter = -35144751
  },
  pmd_huge_pte = 0x7fde7bbd1
},
{
  counters = 8589934592,
  {
{
  _mapcount = {
counter = 0
  },
  {
inuse = 0,
objects = 0,
frozen = 0
  },
  units = 0
},
_count = {
  counter = 2
}
  }
}
  },
  {
lru = {
  next = 0xdead00100100,
  prev = 0xdead00200200
},
{
  next = 0xdead00100100,
  pages = 2097664,
  pobjects = -559087616
},
list = {
  next = 0xdead00100100,
  prev = 0xdead00200200
},
slab_page = 0xdead00100100
  },
  {
private = 0,
ptl = {
  {
rlock = {
  raw_lock = {
{
  head_tail = 0,
  tickets = {
head = 0,
tail = 0
  }
}
  }
}
  }
},
slab_cache = 0x0,
first_page = 0x0
  }
}



crash> struct anon_vma 0x8801b3e2a100
struct anon_vma {
  root = 0xdeb604d497705c55,
  rwsem = {
count = -8192007903225070328,
wait_lock = {
  

does anyone have seen this bug? reset_interrupt+0x82/0xa0 [floppy]

2017-05-16 Thread Xishi Qiu
2017-05-12T04:46:36.373001+08:00[  OK  ] Reached target System 
Initialization.
2017-05-12T04:46:36.385253+08:00[  OK  ] Reached target Basic System.
2017-05-12T04:46:43.049936+08:00[   25.839157] BUG: unable to handle kernel [   
25.841509] floppy0: no floppy controllers found
2017-05-12T04:46:43.597615+08:00NULL pointer dereference[   25.841740] work 
still pending
2017-05-12T04:46:44.376251+08:00 at 0010
2017-05-12T04:46:44.621685+08:00[   27.413928] IP: [] 
reset_interrupt+0x82/0xa0 [floppy]
2017-05-12T04:46:45.324827+08:00[   28.117060] PGD 0 
2017-05-12T04:46:45.615632+08:00[   28.407879] Oops:  [#1] SMP 
2017-05-12T04:46:45.710419+08:00[   28.501984] Modules linked in: virtio(+) 
crc32c_intel(+) floppy(+)
2017-05-12T04:46:45.899891+08:00[   28.692147] CPU: 13 PID: 159 Comm: 
kworker/u32:1 Not tainted 3.10.0-327.28.56.6.x86_64 #1
2017-05-12T04:46:46.187787+08:00[   28.980030] Hardware name: QEMU Standard PC 
(i440FX + PIIX, 1996), BIOS 
rel-1.9.1-0-gb3ef39f-20170509_142013-build9a64a246a222 04/01/2014
2017-05-12T04:46:46.597071+08:00[   29.385246] Workqueue: floppy 
reset_interrupt [floppy]
2017-05-12T04:46:46.785518+08:00[   29.577762] task: 880425fe6780 ti: 
88042584 task.ti: 88042584
2017-05-12T04:46:47.059479+08:00[   29.851728] RIP: 0010:[]  
[] reset_interrupt+0x82/0xa0 [floppy]
2017-05-12T04:46:47.473387+08:00[   30.231355] RSP: 0018:880425843e18  
EFLAGS: 00010202
2017-05-12T04:46:47.695370+08:00[   30.487624] RAX:  RBX: 
a000b180 RCX: dead00200200
2017-05-12T04:46:47.886238+08:00[   30.678467] RDX:  RSI: 
00c200c0 RDI: a0009828
2017-05-12T04:46:48.054132+08:00[   30.846395] RBP: 880425843e18 R08: 
a000b188 R09: 00017ffcda00b180
2017-05-12T04:46:48.216490+08:00[   31.008753] R10: 00017ffcda00b180 R11: 
03ff R12: 880428845880
2017-05-12T04:46:48.417821+08:00[   31.210077] R13: 880429178c00 R14: 
880425236a00 R15: 0400
2017-05-12T04:46:48.565850+08:00[   31.358107] FS:  () 
GS:88042f54() knlGS:
2017-05-12T04:46:48.969712+08:00[   31.761961] CS:  0010 DS:  ES:  CR0: 
80050033
2017-05-12T04:46:49.527370+08:00[   32.319621] CR2: 0010 CR3: 
00042569b000 CR4: 001407e0
2017-05-12T04:46:49.996451+08:00[   32.788704] DR0:  DR1: 
 DR2: 
2017-05-12T04:46:50.008769+08:00[   32.801019] DR3:  DR6: 
0ff0 DR7: 0400
2017-05-12T04:46:50.025615+08:00[   32.817840] Stack:
2017-05-12T04:46:50.033198+08:00[   32.825441]  880425843e60 
8109d8ab 000129178c18 
2017-05-12T04:46:50.058276+08:00[   32.850530]  880429178c18 
8804288458b0 880425fe6780 880428845880
2017-05-12T04:46:50.088493+08:00[   32.880752]  880429178c00 
880425843ec0 8109e67b 880425843fd8
2017-05-12T04:46:50.116903+08:00[   32.909158] Call Trace:
2017-05-12T04:46:50.126325+08:00[   32.918578]  [] 
process_one_work+0x17b/0x470
2017-05-12T04:46:50.135293+08:00[   32.927550]  [] 
worker_thread+0x11b/0x400
2017-05-12T04:46:50.144728+08:00[   32.936983]  [] ? 
rescuer_thread+0x400/0x400
2017-05-12T04:46:50.174593+08:00[   32.966841]  [] 
kthread+0xcf/0xe0
2017-05-12T04:46:50.189138+08:00[   32.981394]  [] ? 
finish_task_switch+0x56/0x180
2017-05-12T04:46:50.197718+08:00[   32.989977]  [] ? 
kthread_create_on_node+0x140/0x140
2017-05-12T04:46:50.208049+08:00[   33.000291]  [] 
ret_from_fork+0x58/0x90
2017-05-12T04:46:50.216998+08:00[   33.009262]  [] ? 
kthread_create_on_node+0x140/0x140
2017-05-12T04:46:50.227458+08:00[   33.019717] Code: c7 c2 0d 88 00 a0 48 c7 c6 
10 81 00 a0 48 c7 c7 2b 89 00 a0 31 c0 e8 cc 72 63 e1 eb a9 48 8b 05 ad 8d 00 
00 48 c7 c7 28 98 00 a0 <48> 8b 70 10 31 c0 e8 b1 72 63 e1 48 8b 05 94 8d 00 00 
ff 50 10 
2017-05-12T04:46:51.568674+08:00[   34.360929] RIP  [] 
reset_interrupt+0x82/0xa0 [floppy]
2017-05-12T04:46:52.196079+08:00[   34.988324]  RSP 
2017-05-12T04:46:52.212738+08:00[   35.004995] CR2: 0010
2017-05-12T04:46:52.229520+08:00[   35.021777] ---[ end trace 9881c03b6b437287 
]---
2017-05-12T04:46:52.253266+08:00[   35.045520] Kernel panic - not syncing: 
Fatal exception



does anyone have seen this bug? reset_interrupt+0x82/0xa0 [floppy]

2017-05-16 Thread Xishi Qiu
2017-05-12T04:46:36.373001+08:00[  OK  ] Reached target System 
Initialization.
2017-05-12T04:46:36.385253+08:00[  OK  ] Reached target Basic System.
2017-05-12T04:46:43.049936+08:00[   25.839157] BUG: unable to handle kernel [   
25.841509] floppy0: no floppy controllers found
2017-05-12T04:46:43.597615+08:00NULL pointer dereference[   25.841740] work 
still pending
2017-05-12T04:46:44.376251+08:00 at 0010
2017-05-12T04:46:44.621685+08:00[   27.413928] IP: [] 
reset_interrupt+0x82/0xa0 [floppy]
2017-05-12T04:46:45.324827+08:00[   28.117060] PGD 0 
2017-05-12T04:46:45.615632+08:00[   28.407879] Oops:  [#1] SMP 
2017-05-12T04:46:45.710419+08:00[   28.501984] Modules linked in: virtio(+) 
crc32c_intel(+) floppy(+)
2017-05-12T04:46:45.899891+08:00[   28.692147] CPU: 13 PID: 159 Comm: 
kworker/u32:1 Not tainted 3.10.0-327.28.56.6.x86_64 #1
2017-05-12T04:46:46.187787+08:00[   28.980030] Hardware name: QEMU Standard PC 
(i440FX + PIIX, 1996), BIOS 
rel-1.9.1-0-gb3ef39f-20170509_142013-build9a64a246a222 04/01/2014
2017-05-12T04:46:46.597071+08:00[   29.385246] Workqueue: floppy 
reset_interrupt [floppy]
2017-05-12T04:46:46.785518+08:00[   29.577762] task: 880425fe6780 ti: 
88042584 task.ti: 88042584
2017-05-12T04:46:47.059479+08:00[   29.851728] RIP: 0010:[]  
[] reset_interrupt+0x82/0xa0 [floppy]
2017-05-12T04:46:47.473387+08:00[   30.231355] RSP: 0018:880425843e18  
EFLAGS: 00010202
2017-05-12T04:46:47.695370+08:00[   30.487624] RAX:  RBX: 
a000b180 RCX: dead00200200
2017-05-12T04:46:47.886238+08:00[   30.678467] RDX:  RSI: 
00c200c0 RDI: a0009828
2017-05-12T04:46:48.054132+08:00[   30.846395] RBP: 880425843e18 R08: 
a000b188 R09: 00017ffcda00b180
2017-05-12T04:46:48.216490+08:00[   31.008753] R10: 00017ffcda00b180 R11: 
03ff R12: 880428845880
2017-05-12T04:46:48.417821+08:00[   31.210077] R13: 880429178c00 R14: 
880425236a00 R15: 0400
2017-05-12T04:46:48.565850+08:00[   31.358107] FS:  () 
GS:88042f54() knlGS:
2017-05-12T04:46:48.969712+08:00[   31.761961] CS:  0010 DS:  ES:  CR0: 
80050033
2017-05-12T04:46:49.527370+08:00[   32.319621] CR2: 0010 CR3: 
00042569b000 CR4: 001407e0
2017-05-12T04:46:49.996451+08:00[   32.788704] DR0:  DR1: 
 DR2: 
2017-05-12T04:46:50.008769+08:00[   32.801019] DR3:  DR6: 
0ff0 DR7: 0400
2017-05-12T04:46:50.025615+08:00[   32.817840] Stack:
2017-05-12T04:46:50.033198+08:00[   32.825441]  880425843e60 
8109d8ab 000129178c18 
2017-05-12T04:46:50.058276+08:00[   32.850530]  880429178c18 
8804288458b0 880425fe6780 880428845880
2017-05-12T04:46:50.088493+08:00[   32.880752]  880429178c00 
880425843ec0 8109e67b 880425843fd8
2017-05-12T04:46:50.116903+08:00[   32.909158] Call Trace:
2017-05-12T04:46:50.126325+08:00[   32.918578]  [] 
process_one_work+0x17b/0x470
2017-05-12T04:46:50.135293+08:00[   32.927550]  [] 
worker_thread+0x11b/0x400
2017-05-12T04:46:50.144728+08:00[   32.936983]  [] ? 
rescuer_thread+0x400/0x400
2017-05-12T04:46:50.174593+08:00[   32.966841]  [] 
kthread+0xcf/0xe0
2017-05-12T04:46:50.189138+08:00[   32.981394]  [] ? 
finish_task_switch+0x56/0x180
2017-05-12T04:46:50.197718+08:00[   32.989977]  [] ? 
kthread_create_on_node+0x140/0x140
2017-05-12T04:46:50.208049+08:00[   33.000291]  [] 
ret_from_fork+0x58/0x90
2017-05-12T04:46:50.216998+08:00[   33.009262]  [] ? 
kthread_create_on_node+0x140/0x140
2017-05-12T04:46:50.227458+08:00[   33.019717] Code: c7 c2 0d 88 00 a0 48 c7 c6 
10 81 00 a0 48 c7 c7 2b 89 00 a0 31 c0 e8 cc 72 63 e1 eb a9 48 8b 05 ad 8d 00 
00 48 c7 c7 28 98 00 a0 <48> 8b 70 10 31 c0 e8 b1 72 63 e1 48 8b 05 94 8d 00 00 
ff 50 10 
2017-05-12T04:46:51.568674+08:00[   34.360929] RIP  [] 
reset_interrupt+0x82/0xa0 [floppy]
2017-05-12T04:46:52.196079+08:00[   34.988324]  RSP 
2017-05-12T04:46:52.212738+08:00[   35.004995] CR2: 0010
2017-05-12T04:46:52.229520+08:00[   35.021777] ---[ end trace 9881c03b6b437287 
]---
2017-05-12T04:46:52.253266+08:00[   35.045520] Kernel panic - not syncing: 
Fatal exception



Re: [RESENT PATCH] x86/mem: fix the offset overflow when read/write mem

2017-05-09 Thread Xishi Qiu
On 2017/5/9 23:46, Rik van Riel wrote:

> On Thu, 2017-05-04 at 10:28 +0800, zhong jiang wrote:
>> On 2017/5/4 2:46, Rik van Riel wrote:
> 
>>> However, it is not as easy as simply checking the
>>> end against __pa(high_memory). Some systems have
>>> non-contiguous physical memory ranges, with gaps
>>> of invalid addresses in-between.
>>
>>  The invalid physical address means that it is used as
>>  io mapped. not in system ram region. /dev/mem is not
>>  access to them , is it right?
> 
> Not necessarily. Some systems simply have large
> gaps in physical memory access. Their memory map
> may look like this:
> 
> |MM|IO||..||
> 
> Where M is memory, IO is IO space, and the
> dots are simply a gap in physical address
> space with no valid accesses at all.
>

Hi Rik,

Do you mean IO space is allowed to be accessed via mmap of /dev/mem?

Thanks,
Xishi Qiu

 
>>> At that point, is the complexity so much that it no
>>> longer makes sense to try to protect against root
>>> crashing the system?
>>>
>>
>>  your suggestion is to let the issue along without any protection.
>>  just root user know what they are doing.
> 
> Well, root already has other ways to crash the system.
> 
> Implementing validation on /dev/mem may make sense if
> it can be done in a simple way, but may not be worth
> it if it becomes too complex.
> 





Re: [RESENT PATCH] x86/mem: fix the offset overflow when read/write mem

2017-05-09 Thread Xishi Qiu
On 2017/5/9 23:46, Rik van Riel wrote:

> On Thu, 2017-05-04 at 10:28 +0800, zhong jiang wrote:
>> On 2017/5/4 2:46, Rik van Riel wrote:
> 
>>> However, it is not as easy as simply checking the
>>> end against __pa(high_memory). Some systems have
>>> non-contiguous physical memory ranges, with gaps
>>> of invalid addresses in-between.
>>
>>  The invalid physical address means that it is used as
>>  io mapped. not in system ram region. /dev/mem is not
>>  access to them , is it right?
> 
> Not necessarily. Some systems simply have large
> gaps in physical memory access. Their memory map
> may look like this:
> 
> |MM|IO||..||
> 
> Where M is memory, IO is IO space, and the
> dots are simply a gap in physical address
> space with no valid accesses at all.
>

Hi Rik,

Do you mean IO space is allowed to be accessed via mmap of /dev/mem?

Thanks,
Xishi Qiu

 
>>> At that point, is the complexity so much that it no
>>> longer makes sense to try to protect against root
>>> crashing the system?
>>>
>>
>>  your suggestion is to let the issue along without any protection.
>>  just root user know what they are doing.
> 
> Well, root already has other ways to crash the system.
> 
> Implementing validation on /dev/mem may make sense if
> it can be done in a simple way, but may not be worth
> it if it becomes too complex.
> 





Re: [RFC] dev/mem: "memtester -p 0x6c80000000000 10G" cause crash

2017-05-02 Thread Xishi Qiu
On 2017/5/2 17:16, Michal Hocko wrote:

> On Tue 02-05-17 16:52:00, Xishi Qiu wrote:
>> On 2017/5/2 16:43, Michal Hocko wrote:
>>
>>> On Tue 02-05-17 15:59:23, Xishi Qiu wrote:
>>>> Hi, I use "memtester -p 0x6c800 10G" to test physical address 
>>>> 0x6c800
>>>> Because this physical address is invalid, and valid_mmap_phys_addr_range()
>>>> always return 1, so it causes crash.
>>>>
>>>> My question is that should the user assure the physical address is valid?
>>>
>>> We already seem to be checking range_is_allowed(). What is your
>>> CONFIG_STRICT_DEVMEM setting? The code seems to be rather confusing but
>>> my assumption is that you better know what you are doing when mapping
>>> this file.
>>>
>>
>> HI Michal,
>>
>> CONFIG_STRICT_DEVMEM=y, and range_is_allowed() will skip memory, but
>> 0x6c800 is not memory, it is just an invalid address, so it causes a
>> crash. 
> 
> OK, I only now looked at the value. It is beyond addressable limit
> (for 47b address space). None of the checks seems to stop this because
> range_is_allowed() resp. its devmem_is_allowed() will allow it as a
> non RAM (!page_is_ram check). I am not really sure how to fix this or
> whether even we should try to fix this particular problem. As I've said
> /dev/mem is dangerous and you should better know what you are doing when
> accessing it.
> 

OK, I know, thank you!

Thanks,
Xishi Qiu



Re: [RFC] dev/mem: "memtester -p 0x6c80000000000 10G" cause crash

2017-05-02 Thread Xishi Qiu
On 2017/5/2 17:16, Michal Hocko wrote:

> On Tue 02-05-17 16:52:00, Xishi Qiu wrote:
>> On 2017/5/2 16:43, Michal Hocko wrote:
>>
>>> On Tue 02-05-17 15:59:23, Xishi Qiu wrote:
>>>> Hi, I use "memtester -p 0x6c800 10G" to test physical address 
>>>> 0x6c800
>>>> Because this physical address is invalid, and valid_mmap_phys_addr_range()
>>>> always return 1, so it causes crash.
>>>>
>>>> My question is that should the user assure the physical address is valid?
>>>
>>> We already seem to be checking range_is_allowed(). What is your
>>> CONFIG_STRICT_DEVMEM setting? The code seems to be rather confusing but
>>> my assumption is that you better know what you are doing when mapping
>>> this file.
>>>
>>
>> HI Michal,
>>
>> CONFIG_STRICT_DEVMEM=y, and range_is_allowed() will skip memory, but
>> 0x6c800 is not memory, it is just an invalid address, so it causes a
>> crash. 
> 
> OK, I only now looked at the value. It is beyond addressable limit
> (for 47b address space). None of the checks seems to stop this because
> range_is_allowed() resp. its devmem_is_allowed() will allow it as a
> non RAM (!page_is_ram check). I am not really sure how to fix this or
> whether even we should try to fix this particular problem. As I've said
> /dev/mem is dangerous and you should better know what you are doing when
> accessing it.
> 

OK, I know, thank you!

Thanks,
Xishi Qiu



Re: [RFC] dev/mem: "memtester -p 0x6c80000000000 10G" cause crash

2017-05-02 Thread Xishi Qiu
On 2017/5/2 16:43, Michal Hocko wrote:

> On Tue 02-05-17 15:59:23, Xishi Qiu wrote:
>> Hi, I use "memtester -p 0x6c800 10G" to test physical address 
>> 0x6c800
>> Because this physical address is invalid, and valid_mmap_phys_addr_range()
>> always return 1, so it causes crash.
>>
>> My question is that should the user assure the physical address is valid?
> 
> We already seem to be checking range_is_allowed(). What is your
> CONFIG_STRICT_DEVMEM setting? The code seems to be rather confusing but
> my assumption is that you better know what you are doing when mapping
> this file.
> 

HI Michal,

CONFIG_STRICT_DEVMEM=y, and range_is_allowed() will skip memory, but
0x6c800 is not memory, it is just an invalid address, so it causes a
crash. 
You mean the user should ensure the physical address is valid, right?

Thanks,
Xishi Qiu

>> ...
>> [ 169.147578] ? panic+0x1f1/0x239
>> [ 169.150789] oops_end+0xb8/0xd0
>> [ 169.153910] pgtable_bad+0x8a/0x95
>> [ 169.157294] __do_page_fault+0x3aa/0x4a0
>> [ 169.161194] do_page_fault+0x30/0x80
>> [ 169.164750] ? do_syscall_64+0x175/0x180
>> [ 169.168649] page_fault+0x28/0x30
>>
>> Thanks,
>> Xishi Qiu
> 





Re: [RFC] dev/mem: "memtester -p 0x6c80000000000 10G" cause crash

2017-05-02 Thread Xishi Qiu
On 2017/5/2 16:43, Michal Hocko wrote:

> On Tue 02-05-17 15:59:23, Xishi Qiu wrote:
>> Hi, I use "memtester -p 0x6c800 10G" to test physical address 
>> 0x6c800
>> Because this physical address is invalid, and valid_mmap_phys_addr_range()
>> always return 1, so it causes crash.
>>
>> My question is that should the user assure the physical address is valid?
> 
> We already seem to be checking range_is_allowed(). What is your
> CONFIG_STRICT_DEVMEM setting? The code seems to be rather confusing but
> my assumption is that you better know what you are doing when mapping
> this file.
> 

HI Michal,

CONFIG_STRICT_DEVMEM=y, and range_is_allowed() will skip memory, but
0x6c800 is not memory, it is just an invalid address, so it causes a
crash. 
You mean the user should ensure the physical address is valid, right?

Thanks,
Xishi Qiu

>> ...
>> [ 169.147578] ? panic+0x1f1/0x239
>> [ 169.150789] oops_end+0xb8/0xd0
>> [ 169.153910] pgtable_bad+0x8a/0x95
>> [ 169.157294] __do_page_fault+0x3aa/0x4a0
>> [ 169.161194] do_page_fault+0x30/0x80
>> [ 169.164750] ? do_syscall_64+0x175/0x180
>> [ 169.168649] page_fault+0x28/0x30
>>
>> Thanks,
>> Xishi Qiu
> 





[RFC] dev/mem: "memtester -p 0x6c80000000000 10G" cause crash

2017-05-02 Thread Xishi Qiu
Hi, I use "memtester -p 0x6c800 10G" to test physical address 
0x6c800
Because this physical address is invalid, and valid_mmap_phys_addr_range()
always return 1, so it causes crash.

My question is that should the user assure the physical address is valid?

...
[ 169.147578] ? panic+0x1f1/0x239
[ 169.150789] oops_end+0xb8/0xd0
[ 169.153910] pgtable_bad+0x8a/0x95
[ 169.157294] __do_page_fault+0x3aa/0x4a0
[ 169.161194] do_page_fault+0x30/0x80
[ 169.164750] ? do_syscall_64+0x175/0x180
[ 169.168649] page_fault+0x28/0x30

Thanks,
Xishi Qiu



[RFC] dev/mem: "memtester -p 0x6c80000000000 10G" cause crash

2017-05-02 Thread Xishi Qiu
Hi, I use "memtester -p 0x6c800 10G" to test physical address 
0x6c800
Because this physical address is invalid, and valid_mmap_phys_addr_range()
always return 1, so it causes crash.

My question is that should the user assure the physical address is valid?

...
[ 169.147578] ? panic+0x1f1/0x239
[ 169.150789] oops_end+0xb8/0xd0
[ 169.153910] pgtable_bad+0x8a/0x95
[ 169.157294] __do_page_fault+0x3aa/0x4a0
[ 169.161194] do_page_fault+0x30/0x80
[ 169.164750] ? do_syscall_64+0x175/0x180
[ 169.168649] page_fault+0x28/0x30

Thanks,
Xishi Qiu



Re: NULL pointer dereference in the kernel 3.10

2017-04-10 Thread Xishi Qiu
On 2017/4/10 17:37, Hillf Danton wrote:

> On April 10, 2017 4:57 PM Xishi Qiu wrote: 
>> On 2017/4/10 14:42, Hillf Danton wrote:
>>
>>> On April 08, 2017 9:40 PM zhong Jiang wrote:
>>>>
>>>> when running the stable docker cases in the vm.   The following issue will 
>>>> come up.
>>>>
>>>> #40 [8801b57ffb30] async_page_fault at 8165c9f8
>>>> [exception RIP: down_read_trylock+5]
>>>> RIP: 810aca65  RSP: 8801b57ffbe8  RFLAGS: 00010202
>>>> RAX:   RBX: 88018ae858c1  RCX: 
>>>> RDX:   RSI:   RDI: 0008
>>>> RBP: 8801b57ffc10   R8: ea0006903de0   R9: 8800b3c61810
>>>> R10: 22cb  R11:   R12: 88018ae858c0
>>>> R13: ea0006903dc0  R14: 0008  R15: ea0006903dc0
>>>> ORIG_RAX:   CS: 0010  SS: 
>>>> #41 [8801b57ffbe8] page_lock_anon_vma_read at 811b241c
>>>> #42 [8801b57ffc18] page_referenced at 811b26a7
>>>> #43 [8801b57ffc90] shrink_active_list at 8118d634
>>>> #44 [8801b57ffd48] balance_pgdat at 8118f088
>>>> #45 [8801b57ffe20] kswapd at 8118f633
>>>> #46 [8801b57ffec8] kthread at 810a795f
>>>> #47 [8801b57fff50] ret_from_fork at 81665398
>>>> crash> struct page.mapping ea0006903dc0
>>>>   mapping = 0x88018ae858c1
>>>> crash> struct anon_vma 0x88018ae858c0
>>>> struct anon_vma {
>>>>   root = 0x0,
>>>>   rwsem = {
>>>> count = 0,
>>>> wait_lock = {
>>>>   raw_lock = {
>>>> {
>>>>   head_tail = 1,
>>>>   tickets = {
>>>> head = 1,
>>>> tail = 0
>>>>   }
>>>> }
>>>>   }
>>>> },
>>>> wait_list = {
>>>>   next = 0x0,
>>>>   prev = 0x0
>>>> }
>>>>   },
>>>>   refcount = {
>>>> counter = 0
>>>>   },
>>>>   rb_root = {
>>>> rb_node = 0x0
>>>>   }
>>>> }
>>>>
>>>> This makes me wonder: the anon_vma does not come from a slab structure,
>>>> and the content is abnormal. IMO, at least anon_vma->root should not be NULL.
>>>> The issue can be reproduced every other week.
>>>>
>>> Check please if commit
>>> 624483f3ea8 ("mm: rmap: fix use-after-free in __put_anon_vma")
>>> is included in the 3.10 you are running.
>>>
>> We missed this patch in RHEL 7.2
>> Could you please give more details for how it triggered?
> 
> Sorry, I could not. 
> I guess it is UAF as described in the log of that commit.
> And if it works for you, we know how.
> 
> Hillf
> 

__put_anon_vma|   page_lock_anon_vma_read
  anon_vma_free(root) |
  | root_anon_vma = ACCESS_ONCE(anon_vma->root)
  | down_read_trylock(_anon_vma->rwsem)
  anon_vma_free(anon_vma) |

I find anon_vma was created with SLAB_DESTROY_BY_RCU, so it will not be merged
with other slabs, and free_slab() will not free it during 
page_lock_anon_vma_read(),
because it holds rcu_read_lock(), right?

If root_anon_vma was reused by someone, why does "crash> struct anon_vma"
show almost all zeros?

Thanks,
Xishi Qiu

> 
> 
> 
> .
> 





  1   2   3   4   5   6   7   8   9   10   >