RE: [Makedumpfile PATCH V2 2/4] x86_64: translate all VA to PA using page table values

2016-12-08 Thread Atsushi Kumagai
Hello Pratyush,

>---
> arch/x86_64.c  | 42 --
> makedumpfile.h |  4 ++--
> 2 files changed, 10 insertions(+), 36 deletions(-)
>
>diff --git a/arch/x86_64.c b/arch/x86_64.c
>index eba725e41aac..9afa38fd141a 100644
>--- a/arch/x86_64.c
>+++ b/arch/x86_64.c
>@@ -203,6 +203,12 @@ vtop4_x86_64(unsigned long vaddr)
> {
>   unsigned long page_dir, pml4, pgd_paddr, pgd_pte, pmd_paddr, pmd_pte;
>   unsigned long pte_paddr, pte;
>+  unsigned long phys_base;
>+
>+  if (SYMBOL(phys_base) != NOT_FOUND_SYMBOL)
>+  phys_base = info->phys_base;
>+  else
>+  phys_base = 0;
>
>   if (SYMBOL(init_level4_pgt) == NOT_FOUND_SYMBOL) {
>   ERRMSG("Can't get the symbol of init_level4_pgt.\n");
>@@ -212,9 +218,9 @@ vtop4_x86_64(unsigned long vaddr)
>   /*
>* Get PGD.
>*/
>-  page_dir  = SYMBOL(init_level4_pgt);
>+  page_dir = SYMBOL(init_level4_pgt) - __START_KERNEL_map + phys_base;

I found that this change breaks backward compatibility with kernels
2.6.21 and older, since phys_base was introduced in kernel 2.6.22
by the commit below:

  commit 1ab60e0f72f71ec54831e525a3e1154f1c092408
  Author: Vivek Goyal 
  Date:   Wed May 2 19:27:07 2007 +0200

  [PATCH] x86-64: Relocatable Kernel Support

There would be no problem if phys_base were always 0 on older kernels, but
get_phys_base_x86_64() calculates "phys_base = 0x10" from my vmcore:

  Type   Offset VirtAddr   PhysAddr
 FileSizMemSiz  Flags  Align
  NOTE   0x0190 0x 0x
 0x0590 0x0590 0
  LOAD   0x0720 0x8000 0x0010// 
CONFIG_PHYSICAL_START = 0x10
 0x008b2000 0x008b2000  RWE0
  LOAD   0x008b2720 0x8100 0x
 0x000a 0x000a  RWE0
  LOAD   0x00952720 0x8110 0x0010
 0x00f0 0x00f0  RWE0
  LOAD   0x01852720 0x81000500 0x0500
 0xcaf7 0xcaf7  RWE0
  LOAD   0xcc7c2720 0x8101 0x0001
 0x7000 0x7000  RWE0

Of course we shouldn't use that invalid phys_base:

  crash> sym init_level4_pgt
  80101000 (T) init_level4_pgt
  crash> vtop 80101000
  VIRTUAL   PHYSICAL
  80101000  101000   // just "VIRTUAL - __START_KERNEL_map"

  PML4 DIRECTORY: 80101000
  PAGE DIRECTORY: 103027
 PUD: 103ff0 => 105027
 PMD: 105000 => 1e3
PAGE: 0  (2MB)

  PTE  PHYSICAL  FLAGS
  1e3  0 (PRESENT|RW|ACCESSED|DIRTY|PSE|GLOBAL)

PAGEPHYSICAL  MAPPING   INDEX CNT FLAGS
  81000500483810100000  1 400
  crash>

At first I thought about setting phys_base to 0 if the kernel is
older than 2.6.22, but unfortunately we can't get the kernel version
before getting the correct phys_base, since VtoP is necessary to read
system_utsname.
(And since 2.6.21 doesn't have VMCOREINFO, OSRELEASE can't be used either.)

Do you have any ideas for this issue?


Thanks,
Atsushi Kumagai


>   page_dir += pml4_index(vaddr) * sizeof(unsigned long);
>-  if (!readmem(VADDR, page_dir, &pml4, sizeof pml4)) {
>+  if (!readmem(PADDR, page_dir, &pml4, sizeof pml4)) {
>   ERRMSG("Can't get pml4 (page_dir:%lx).\n", page_dir);
>   return NOT_PADDR;
>   }
>@@ -285,38 +291,6 @@ vtop4_x86_64(unsigned long vaddr)
>   return (pte & ENTRY_MASK) + PAGEOFFSET(vaddr);
> }
>
>-unsigned long long
>-vaddr_to_paddr_x86_64(unsigned long vaddr)
>-{
>-  unsigned long phys_base;
>-  unsigned long long paddr;
>-
>-  /*
>-   * Check the relocatable kernel.
>-   */
>-  if (SYMBOL(phys_base) != NOT_FOUND_SYMBOL)
>-  phys_base = info->phys_base;
>-  else
>-  phys_base = 0;
>-
>-  if (is_vmalloc_addr_x86_64(vaddr)) {
>-  if ((paddr = vtop4_x86_64(vaddr)) == NOT_PADDR) {
>-  ERRMSG("Can't convert a virtual address(%lx) to " \
>-  "physical address.\n", vaddr);
>-  return NOT_PADDR;
>-  }
>-  } else if (vaddr >= __START_KERNEL_map) {
>-  paddr = vaddr - __START_KERNEL_map + phys_base;
>-
>-  } else {
>-  if (is_xen_memory())
>-  paddr = vaddr - PAGE_OFFSET_XEN_DOM0;
>-  else
>-  paddr = vaddr - PAGE_OFFSET;
>-  }
>-  return paddr;
>-}
>-
> /*
>  * for Xen extraction
>  */
>diff --git a/makedumpfile.h b/makedumpfile.h
>index a5955ff750e5..13559651feb6 100644
>--- a/makedumpfile.h

Re: [PATCH v2] kexec: add cond_resched into kimage_alloc_crash_control_pages

2016-12-08 Thread Xunlei Pang
On 12/09/2016 at 01:13 PM, zhong jiang wrote:
> On 2016/12/8 17:41, Xunlei Pang wrote:
>> On 12/08/2016 at 10:37 AM, zhongjiang wrote:
>>> From: zhong jiang 
>>>
>>> A soft lookup will occur when I run trinity in syscall kexec_load.
>>> the corresponding stack information is as follows.
>>>
>>> [  237.235937] BUG: soft lockup - CPU#6 stuck for 22s! [trinity-c6:13859]
>>> [  237.242699] Kernel panic - not syncing: softlockup: hung tasks
>>> [  237.248573] CPU: 6 PID: 13859 Comm: trinity-c6 Tainted: G   O L 
>>> V---   3.10.0-327.28.3.35.zhongjiang.x86_64 #1
>>> [  237.259984] Hardware name: Huawei Technologies Co., Ltd. Tecal BH622 
>>> V2/BC01SRSA0, BIOS RMIBV386 06/30/2014
>>> [  237.269752]  8187626b 18cfde31 88184c803e18 
>>> 81638f16
>>> [  237.277471]  88184c803e98 8163278f 0008 
>>> 88184c803ea8
>>> [  237.285190]  88184c803e48 18cfde31 88184c803e67 
>>> 
>>> [  237.292909] Call Trace:
>>> [  237.295404][] dump_stack+0x19/0x1b
>>> [  237.301352]  [] panic+0xd8/0x214
>>> [  237.306196]  [] watchdog_timer_fn+0x1cc/0x1e0
>>> [  237.312157]  [] ? watchdog_enable+0xc0/0xc0
>>> [  237.317955]  [] __hrtimer_run_queues+0xd2/0x260
>>> [  237.324087]  [] hrtimer_interrupt+0xb0/0x1e0
>>> [  237.329963]  [] ? call_softirq+0x1c/0x30
>>> [  237.335500]  [] local_apic_timer_interrupt+0x37/0x60
>>> [  237.342228]  [] smp_apic_timer_interrupt+0x3f/0x60
>>> [  237.348771]  [] apic_timer_interrupt+0x6d/0x80
>>> [  237.354967][] ? 
>>> kimage_alloc_control_pages+0x80/0x270
>>> [  237.362875]  [] ? kmem_cache_alloc_trace+0x1ce/0x1f0
>>> [  237.369592]  [] ? do_kimage_alloc_init+0x1f/0x90
>>> [  237.375992]  [] kimage_alloc_init+0x12a/0x180
>>> [  237.382103]  [] SyS_kexec_load+0x20a/0x260
>>> [  237.387957]  [] system_call_fastpath+0x16/0x1b
>>>
>>> the first time allocate control pages may take too much time because
>>> crash_res.end can be set to a higher value. we need to add cond_resched
>>> to avoid the issue.
>>>
>>> The patch have been tested and above issue is not appear.
>>>
>>> Signed-off-by: zhong jiang 
>>> ---
>>>  kernel/kexec_core.c | 2 ++
>>>  1 file changed, 2 insertions(+)
>>>
>>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>>> index 5616755..bfc9621 100644
>>> --- a/kernel/kexec_core.c
>>> +++ b/kernel/kexec_core.c
>>> @@ -441,6 +441,8 @@ static struct page 
>>> *kimage_alloc_crash_control_pages(struct kimage *image,
>>> while (hole_end <= crashk_res.end) {
>>> unsigned long i;
>>>  
>>> +   cond_resched();
>>> +
>> I can't see why it would take a long time to loop inside, the job it does is 
>> simply to find a control area
>> not overlapped with image->segment[], you can see the loop "for (i = 0; i < 
>> image->nr_segments; i++)",
>> @hole_end will be advanced to the end of its next nearby segment once 
>> overlap was detected each loop,
>> also there are limited (<=16) segments, so it won't take long to locate the 
>> right area.
>>
>> Am I missing something?
>>
>> Regards,
>> Xunlei
>   if the crashkernel = auto is set in cmdline.  it represent crashk_res.end 
> will exceed to 4G, the first allocate control pages will
>   loop  million times. if we set crashk_res.end to the higher value manually, 
>  you can image

How does "loop million times" happen? See my inlined comments prefixed with 
"pxl".

kimage_alloc_crash_control_pages():
while (hole_end <= crashk_res.end) {
unsigned long i;

if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
break;
/* See if I overlap any of the segments */
for (i = 0; i < image->nr_segments; i++) {  // pxl: max 16 loops, all 
existent segments are not overlapped, though may not sorted.
unsigned long mstart, mend;

mstart = image->segment[i].mem;
mend   = mstart + image->segment[i].memsz - 1;
if ((hole_end >= mstart) && (hole_start <= mend)) {
/* Advance the hole to the end of the segment */
hole_start = (mend + (size - 1)) & ~(size - 1);
hole_end   = hole_start + size - 1;
break;  // pxl: If overlap was found, break for loop, @hole_end 
starts after the overlapped segment area, and will while loop again
}
}
/* If I don't overlap any segments I have found my hole! */
if (i == image->nr_segments) {
pages = pfn_to_page(hole_start >> PAGE_SHIFT);
image->control_page = hole_end;
break;   // pxl: no overlap with all the segments, get the result 
and break the while loop. END.
}   
}

So, in theory, the worst-case number of "while" loop iterations would be
(image->nr_segments + 1), no?

Regards,
Xunlei

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v2] kexec: add cond_resched into kimage_alloc_crash_control_pages

2016-12-08 Thread zhong jiang
On 2016/12/9 13:19, Eric W. Biederman wrote:
> zhong jiang  writes:
>
>> On 2016/12/8 17:41, Xunlei Pang wrote:
>>> On 12/08/2016 at 10:37 AM, zhongjiang wrote:
>>>> From: zhong jiang 
>>>>
> [snip]
>>>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>>>> index 5616755..bfc9621 100644
>>>> --- a/kernel/kexec_core.c
>>>> +++ b/kernel/kexec_core.c
>>>> @@ -441,6 +441,8 @@ static struct page 
>>>> *kimage_alloc_crash_control_pages(struct kimage *image,
>>>>    while (hole_end <= crashk_res.end) {
>>>>    unsigned long i;
>>>>  
>>>> +  cond_resched();
>>>> +
>>> I can't see why it would take a long time to loop inside, the job it does 
>>> is simply to find a control area
>>> not overlapped with image->segment[], you can see the loop "for (i = 0; i < 
>>> image->nr_segments; i++)",
>>> @hole_end will be advanced to the end of its next nearby segment once 
>>> overlap was detected each loop,
>>> also there are limited (<=16) segments, so it won't take long to locate the 
>>> right area.
>>>
>>> Am I missing something?
>>>
>>> Regards,
>>> Xunlei
>>   if the crashkernel = auto is set in cmdline.  it represent crashk_res.end 
>> will exceed to 4G, the first allocate control pages will
>>   loop  million times. if we set crashk_res.end to the higher value
>>   manually,  you can image
> Or in short the cond_resched is about keeping things reasonable when the
> loop has worst case behavior.
>
> Eric
>
>
  Yes, thank you for your reply and comment.

  Regards,
  zhongjiang


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v2] kexec: add cond_resched into kimage_alloc_crash_control_pages

2016-12-08 Thread Eric W. Biederman
zhong jiang  writes:

> On 2016/12/8 17:41, Xunlei Pang wrote:
>> On 12/08/2016 at 10:37 AM, zhongjiang wrote:
>>> From: zhong jiang 
>>>
[snip]
>>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>>> index 5616755..bfc9621 100644
>>> --- a/kernel/kexec_core.c
>>> +++ b/kernel/kexec_core.c
>>> @@ -441,6 +441,8 @@ static struct page 
>>> *kimage_alloc_crash_control_pages(struct kimage *image,
>>> while (hole_end <= crashk_res.end) {
>>> unsigned long i;
>>>  
>>> +   cond_resched();
>>> +
>> I can't see why it would take a long time to loop inside, the job it does is 
>> simply to find a control area
>> not overlapped with image->segment[], you can see the loop "for (i = 0; i < 
>> image->nr_segments; i++)",
>> @hole_end will be advanced to the end of its next nearby segment once 
>> overlap was detected each loop,
>> also there are limited (<=16) segments, so it won't take long to locate the 
>> right area.
>>
>> Am I missing something?
>>
>> Regards,
>> Xunlei
>   if the crashkernel = auto is set in cmdline.  it represent crashk_res.end 
> will exceed to 4G, the first allocate control pages will
>   loop  million times. if we set crashk_res.end to the higher value
>   manually,  you can image

Or in short the cond_resched is about keeping things reasonable when the
loop has worst case behavior.

Eric

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v2] kexec: add cond_resched into kimage_alloc_crash_control_pages

2016-12-08 Thread zhong jiang
On 2016/12/8 17:41, Xunlei Pang wrote:
> On 12/08/2016 at 10:37 AM, zhongjiang wrote:
>> From: zhong jiang 
>>
>> A soft lookup will occur when I run trinity in syscall kexec_load.
>> the corresponding stack information is as follows.
>>
>> [  237.235937] BUG: soft lockup - CPU#6 stuck for 22s! [trinity-c6:13859]
>> [  237.242699] Kernel panic - not syncing: softlockup: hung tasks
>> [  237.248573] CPU: 6 PID: 13859 Comm: trinity-c6 Tainted: G   O L 
>> V---   3.10.0-327.28.3.35.zhongjiang.x86_64 #1
>> [  237.259984] Hardware name: Huawei Technologies Co., Ltd. Tecal BH622 
>> V2/BC01SRSA0, BIOS RMIBV386 06/30/2014
>> [  237.269752]  8187626b 18cfde31 88184c803e18 
>> 81638f16
>> [  237.277471]  88184c803e98 8163278f 0008 
>> 88184c803ea8
>> [  237.285190]  88184c803e48 18cfde31 88184c803e67 
>> 
>> [  237.292909] Call Trace:
>> [  237.295404][] dump_stack+0x19/0x1b
>> [  237.301352]  [] panic+0xd8/0x214
>> [  237.306196]  [] watchdog_timer_fn+0x1cc/0x1e0
>> [  237.312157]  [] ? watchdog_enable+0xc0/0xc0
>> [  237.317955]  [] __hrtimer_run_queues+0xd2/0x260
>> [  237.324087]  [] hrtimer_interrupt+0xb0/0x1e0
>> [  237.329963]  [] ? call_softirq+0x1c/0x30
>> [  237.335500]  [] local_apic_timer_interrupt+0x37/0x60
>> [  237.342228]  [] smp_apic_timer_interrupt+0x3f/0x60
>> [  237.348771]  [] apic_timer_interrupt+0x6d/0x80
>> [  237.354967][] ? 
>> kimage_alloc_control_pages+0x80/0x270
>> [  237.362875]  [] ? kmem_cache_alloc_trace+0x1ce/0x1f0
>> [  237.369592]  [] ? do_kimage_alloc_init+0x1f/0x90
>> [  237.375992]  [] kimage_alloc_init+0x12a/0x180
>> [  237.382103]  [] SyS_kexec_load+0x20a/0x260
>> [  237.387957]  [] system_call_fastpath+0x16/0x1b
>>
>> the first time allocate control pages may take too much time because
>> crash_res.end can be set to a higher value. we need to add cond_resched
>> to avoid the issue.
>>
>> The patch have been tested and above issue is not appear.
>>
>> Signed-off-by: zhong jiang 
>> ---
>>  kernel/kexec_core.c | 2 ++
>>  1 file changed, 2 insertions(+)
>>
>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>> index 5616755..bfc9621 100644
>> --- a/kernel/kexec_core.c
>> +++ b/kernel/kexec_core.c
>> @@ -441,6 +441,8 @@ static struct page 
>> *kimage_alloc_crash_control_pages(struct kimage *image,
>>  while (hole_end <= crashk_res.end) {
>>  unsigned long i;
>>  
>> +cond_resched();
>> +
> I can't see why it would take a long time to loop inside, the job it does is 
> simply to find a control area
> not overlapped with image->segment[], you can see the loop "for (i = 0; i < 
> image->nr_segments; i++)",
> @hole_end will be advanced to the end of its next nearby segment once overlap 
> was detected each loop,
> also there are limited (<=16) segments, so it won't take long to locate the 
> right area.
>
> Am I missing something?
>
> Regards,
> Xunlei
  If crashkernel=auto is set on the command line, crashk_res.end will
  exceed 4G, and the first allocation of control pages will loop millions
  of times. If we set crashk_res.end to an even higher value manually,
  you can imagine the result.

  Thanks
  zhongjiang
>>  if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
>>  break;
>>  /* See if I overlap any of the segments */
>
> .
>



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH] kexec-tools/x86: get_kernel_vaddr_and_size off-by-one fix

2016-12-08 Thread Simon Horman
On Thu, Dec 08, 2016 at 10:52:22AM +0800, Dave Young wrote:
> I got below error while tesing kexec -p:
> "Can't find kernel text map area from kcore"
> 
> The case is the pt_load start addr was same as stext_sym. The checking
> code should really be saddr <= stext_sym so that the right pt_load area
> includes stext_sym can be matched.
> 
> This was not reported by people previously because it will fail over to
> use hardcode X86_64__START_KERNEL_map to match the pt_load areas again
> in later code and it sometimes succeeds because of kernel address
> randomization.
> 
> With this change according to my test stext_sym checking can garantee
> falling into right pt_load area if we get correct stext_sym.
> 
> Signed-off-by: Dave Young 

Thanks, applied.

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v2] kexec: add cond_resched into kimage_alloc_crash_control_pages

2016-12-08 Thread Xunlei Pang
On 12/08/2016 at 10:37 AM, zhongjiang wrote:
> From: zhong jiang 
>
> A soft lookup will occur when I run trinity in syscall kexec_load.
> the corresponding stack information is as follows.
>
> [  237.235937] BUG: soft lockup - CPU#6 stuck for 22s! [trinity-c6:13859]
> [  237.242699] Kernel panic - not syncing: softlockup: hung tasks
> [  237.248573] CPU: 6 PID: 13859 Comm: trinity-c6 Tainted: G   O L 
> V---   3.10.0-327.28.3.35.zhongjiang.x86_64 #1
> [  237.259984] Hardware name: Huawei Technologies Co., Ltd. Tecal BH622 
> V2/BC01SRSA0, BIOS RMIBV386 06/30/2014
> [  237.269752]  8187626b 18cfde31 88184c803e18 
> 81638f16
> [  237.277471]  88184c803e98 8163278f 0008 
> 88184c803ea8
> [  237.285190]  88184c803e48 18cfde31 88184c803e67 
> 
> [  237.292909] Call Trace:
> [  237.295404][] dump_stack+0x19/0x1b
> [  237.301352]  [] panic+0xd8/0x214
> [  237.306196]  [] watchdog_timer_fn+0x1cc/0x1e0
> [  237.312157]  [] ? watchdog_enable+0xc0/0xc0
> [  237.317955]  [] __hrtimer_run_queues+0xd2/0x260
> [  237.324087]  [] hrtimer_interrupt+0xb0/0x1e0
> [  237.329963]  [] ? call_softirq+0x1c/0x30
> [  237.335500]  [] local_apic_timer_interrupt+0x37/0x60
> [  237.342228]  [] smp_apic_timer_interrupt+0x3f/0x60
> [  237.348771]  [] apic_timer_interrupt+0x6d/0x80
> [  237.354967][] ? 
> kimage_alloc_control_pages+0x80/0x270
> [  237.362875]  [] ? kmem_cache_alloc_trace+0x1ce/0x1f0
> [  237.369592]  [] ? do_kimage_alloc_init+0x1f/0x90
> [  237.375992]  [] kimage_alloc_init+0x12a/0x180
> [  237.382103]  [] SyS_kexec_load+0x20a/0x260
> [  237.387957]  [] system_call_fastpath+0x16/0x1b
>
> the first time allocate control pages may take too much time because
> crash_res.end can be set to a higher value. we need to add cond_resched
> to avoid the issue.
>
> The patch have been tested and above issue is not appear.
>
> Signed-off-by: zhong jiang 
> ---
>  kernel/kexec_core.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
> index 5616755..bfc9621 100644
> --- a/kernel/kexec_core.c
> +++ b/kernel/kexec_core.c
> @@ -441,6 +441,8 @@ static struct page 
> *kimage_alloc_crash_control_pages(struct kimage *image,
>   while (hole_end <= crashk_res.end) {
>   unsigned long i;
>  
> + cond_resched();
> +

I can't see why it would take a long time to loop here. The job of the loop is
simply to find a control area that does not overlap with image->segment[]; see
the inner loop "for (i = 0; i < image->nr_segments; i++)". Each time an overlap
is detected, @hole_end is advanced past the end of the overlapping segment.
There is also a limited number (<=16) of segments, so it shouldn't take long to
locate the right area.

Am I missing something?

Regards,
Xunlei

>   if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
>   break;
>   /* See if I overlap any of the segments */


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec