The problem below may or may not be related to problems recently reported here:
- "Opensolaris guest eats 100% cpu"
http://www.opensolaris.org/jive/thread.jspa?threadID=57568&tstart=0
- "Libmicro issue in XVM Guest"
http://www.opensolaris.org/jive/thread.jspa?threadID=57630&tstart=0
I'm observing an *excessively high* number of pagefaults after a process has
forked. I'm observing this both in 32-bit dom0 OpenSolaris kernels and in
32-bit domU OpenSolaris kernels, when running on a 32-bit 3.1.2-xvm
hypervisor. The problem is much worse in the PV domU OpenSolaris kernel.
I'm using current OpenSolaris bits compiled from the mercurial repository
(post snv_88).
Systems where I observed this issue:
1. ASUS M2NPV-VM mainboard, AMD Athlon(tm) 64 X2 Dual Core Processor 6400+
8GB of memory
Note: the xen 3.1.2-xvm hypervisor was booted with option "mem=4G" or "mem=2G"
2. ASUS M2N-SLI deluxe, AMD Athlon(tm) 64 X2 Dual Core Processor 4200+
2GB of memory
3. Toshiba Tecra S1, Pentium M, 2GB of memory
The test case is this:
# cat fork.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/wait.h>

/*
 * Touch n consecutive stack pages by recursing with a page-sized
 * local buffer, so that each call faults in one stack page.
 */
static void
fill(int n, void *p)
{
	char data[4096];

	memset(data, data[0], sizeof(data));
	if (--n > 0)
		fill(n, data);
}

int
main(int argc, char **argv)
{
	pid_t p;
	int status;

	fill(8, NULL);			/* pre-fault the stack pages */
	switch (p = fork()) {
	case -1:
		perror("fork");
		exit(1);
	case 0:
		/*sleep(1);*/
		fill(8, NULL);		/* child: write to the COW stack pages */
		_exit(0);
	default:
		fill(8, NULL);		/* parent: write to the same pages */
		wait(&status);
		break;
	}
	exit(0);
}
# cc -o fork fork.c
# cat pagefault.d
#!/usr/sbin/dtrace -s
#pragma D option flowindent
BEGIN
{
	type[0 /*F_INVAL*/] = "F_INVAL";
	type[1 /*F_PROT*/] = "F_PROT";
	type[2 /*F_SOFTLOCK*/] = "F_SOFTLOCK";
	type[3 /*F_SOFTUNLOCK*/] = "F_SOFTUNLOCK";

	rw[0 /*S_OTHER*/] = "S_OTHER";
	rw[1 /*S_READ*/] = "S_READ";
	rw[2 /*S_WRITE*/] = "S_WRITE";
	rw[3 /*S_EXEC*/] = "S_EXEC";
	rw[4 /*S_CREATE*/] = "S_CREATE";
	rw[5 /*S_READ_NOCOW*/] = "S_READ_NOCOW";

	flt_cnt = 0;
}

fbt::pagefault:entry
{
	this->addr = (caddr_t)arg0;
	this->type = (enum fault_type)arg1;
	this->rw = (enum seg_rw)arg2;
	this->iskernel = arg3;
}

fbt::pagefault:entry
/(uint64_t)this->addr < 0x08048000 && execname == "fork"/
{
	@fault[execname, type[this->type], this->addr, rw[this->rw],
	    uregs[R_PC]] = count();
	printf("prog %s(%d), addr %a %s %s, pc %a trap %x err %x",
	    execname, pid, this->addr, type[this->type], rw[this->rw],
	    uregs[R_PC], uregs[R_TRAPNO], uregs[R_ERR]);
	self->trace = 1;
}

fbt::pagefault:return
/self->trace/
{
	printf("ret: %x", arg1);
	self->trace = 0;
}

END
{
printa("prog %s, type %s, addr %a %s, pc %a, count [EMAIL PROTECTED]",
@fault);
}
/*
fbt::segvn_fault:entry
/self->trace
&& ((uregs[R_PC] & 0xffff) == 0x4a3b || (uregs[R_PC] & 0xffff) == 0x4a3c)
&& flt_cnt < 5/
{
	self->trace_me = 1;
	flt_cnt++;
}

fbt:::entry
/self->trace_me/
{
}

fbt:::return
/self->trace_me/
{
	printf("returns %x", arg1);
}

fbt::segvn_fault:return
/self->trace_me/
{
	self->trace_me = 0;
}

fbt::x86pte_cas:entry
/self->trace_me/
{
	this->ht = (htable_t *)arg0;
	this->entry = (uint_t)arg1;
	this->old = (x86pte_t)((uint32_t)arg2 | (arg3 << 32));
	this->new = (x86pte_t)((uint32_t)arg4 | (arg5 << 32));
	printf("entry %x, %llx -> %llx", this->entry, this->old, this->new);
}

fbt::x86pte_set:entry
/self->trace_me/
{
	this->ht = (htable_t *)arg0;
	this->entry = (uint_t)arg1;
	this->new = (x86pte_t)((uint32_t)arg2 | (arg3 << 32));
	printf("entry %x, new %llx", this->entry, this->new);
}

fbt::x86pte_inval:entry
/self->trace_me/
{
	this->ht = (htable_t *)arg0;
	this->entry = (uint_t)arg1;
	this->expect = (x86pte_t)((uint32_t)arg2 | (arg3 << 32));
	printf("entry %x, expect %llx", this->entry, this->expect);
}

fbt::HYPERVISOR_mmu_update:entry
/self->trace_me/
{
	this->req = (mmu_update_t *)arg0;
	this->count = (int)arg1;
	this->success_count = (int *)arg2;
	this->domain_id = (domid_t)arg3;
	printf("req[0/%d]: ptr %p, val %llx", this->count,
	    this->req[0].ptr, this->req[0].val);
}

fbt::HYPERVISOR_mmuext_op:entry
/self->trace_me/
{
	this->req2 = (struct mmuext_op *)arg0;
	this->count = (int)arg1;
	this->success_count = (int *)arg2;
	this->domain_id = (domid_t)arg3;
	printf("req[0/%d]: cmd %x, addr %p", this->count,
	    this->req2[0].cmd, this->req2[0].arg1.linear_addr);
}

fbt::HYPERVISOR_update_va_mapping:entry
/self->trace_me/
{
	this->va = (ulong_t)arg0;
	this->new_pte = ((uint32_t)arg1 | (arg2 << 32));
	this->flags = (ulong_t)arg3;
	printf("va %p, new_pte %llx, flags %lx", this->va, this->new_pte,
	    this->flags);
}
*/
# dtrace -s pagefault.d -c ./fork
The dtrace script produces a list of stack pagefaults and a summary
at the end; the stack pagefault summary looks something like this:
prog fork, type F_INVAL, addr 0x8047db8 S_READ, pc 0xd2b04a3b, count 1
prog fork, type F_PROT, addr 0x803fd78 S_WRITE, pc 0x8050a80, count 1
prog fork, type F_PROT, addr 0x8040d88 S_WRITE, pc 0x8050a80, count 1
prog fork, type F_PROT, addr 0x8041d98 S_WRITE, pc 0x8050a80, count 1
prog fork, type F_PROT, addr 0x8042da8 S_WRITE, pc 0x8050a80, count 1
prog fork, type F_PROT, addr 0x8043db8 S_WRITE, pc 0x8050a80, count 1
prog fork, type F_PROT, addr 0x8044dc8 S_WRITE, pc 0x8050a80, count 1
prog fork, type F_PROT, addr 0x8045dd8 S_WRITE, pc 0x8050a80, count 1
prog fork, type F_PROT, addr 0x8046de8 S_WRITE, pc 0x8050a80, count 1
prog fork, type F_INVAL, addr 0x803fd7c S_READ, pc 0x8050a79, count 2
prog fork, type F_INVAL, addr 0x8040d8c S_READ, pc 0x8050a79, count 2
prog fork, type F_INVAL, addr 0x8041d9c S_READ, pc 0x8050a79, count 2
prog fork, type F_INVAL, addr 0x8042dac S_READ, pc 0x8050a79, count 2
prog fork, type F_INVAL, addr 0x8043dbc S_READ, pc 0x8050a79, count 2
prog fork, type F_INVAL, addr 0x8044dcc S_READ, pc 0x8050a79, count 2
prog fork, type F_INVAL, addr 0x8045ddc S_READ, pc 0x8050a79, count 2
prog fork, type F_PROT, addr 0x8047dbc S_WRITE, pc 0xd2b04a3c, count 2
prog fork, type F_INVAL, addr 0x8047dbc S_WRITE, pc 0xd2b04a3c, count 2647
The problem is the 2647 pagefaults that we get in the forked child at
PC 0xd2b04a3c (which translates to libc.so.1`__forkx+0xc), trying to
write to the stack at address 0x8047dbc and failing with a page-not-present
pagefault (trap 0xe, error code 0x6).
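For reference, error code 0x6 follows from the architecturally defined x86
#PF error code bits. A minimal C sketch (the PF_* names and the helper are
mine, purely for illustration) shows why 0x6 means a user-mode write to a
non-present page:

#include <stdio.h>

/* x86 #PF (trap 0xe) error code bits */
#define	PF_P	0x1	/* 0 = page not present, 1 = protection violation */
#define	PF_WR	0x2	/* 1 = write access, 0 = read access */
#define	PF_US	0x4	/* 1 = fault taken in user mode */

static void
decode_pf_err(unsigned int err)
{
	printf("err %x: %s, %s access, %s mode\n", err,
	    (err & PF_P) ? "protection violation" : "page not present",
	    (err & PF_WR) ? "write" : "read",
	    (err & PF_US) ? "user" : "kernel");
}

int
main(void)
{
	decode_pf_err(0x6);	/* the error code reported at __forkx+0xc */
	return (0);
}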
> 0xd2b04a3c::dis
libc.so.1`__forkx: popl %ecx
libc.so.1`__forkx+1: pushl $0x0
libc.so.1`__forkx+3: pushl %ecx
libc.so.1`__forkx+4: movl $0x8e,%eax
libc.so.1`__forkx+9: int $0x91
libc.so.1`__forkx+0xb:  popl   %ecx            <<<< 0xd2b04a3b [1]
libc.so.1`__forkx+0xc:  movl   %ecx,0x0(%esp)  <<<< 0xd2b04a3c [2]
libc.so.1`__forkx+0x10: jb -0x80326 <libc.so.1`__cerror>
libc.so.1`__forkx+0x16: testl %edx,%edx
libc.so.1`__forkx+0x18: je +0x2 <libc.so.1`__forkx+0x1c>
libc.so.1`__forkx+0x1a: xorl %eax,%eax
libc.so.1`__forkx+0x1c: ret
We get one F_INVAL / S_READ stack page fault at __forkx+0xb [1],
where the popl %ecx tries to read something from the stack.
This apparently installs a readonly shared page from the
parent's address space.
One of the two F_PROT / S_WRITE stack page faults at __forkx+0xc [2]
does the copy-on-write operation and installs a private writable
copy of the stack page.
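To make the expected end state concrete, here is a minimal sketch of the
two mappings involved, assuming the standard x86 PTE permission bits (the
helper and names are illustrative only, not taken from the kernel sources):

#include <stdio.h>
#include <stdint.h>

/* standard x86 PTE permission bits */
#define	PTE_P	0x1ULL	/* present */
#define	PTE_RW	0x2ULL	/* writable */
#define	PTE_US	0x4ULL	/* user-accessible */

/* hypothetical helper: describe what a stack PTE permits */
static void
describe(const char *when, uint64_t pte)
{
	printf("%s: %spresent, read%s, %s\n", when,
	    (pte & PTE_P) ? "" : "not ",
	    (pte & PTE_RW) ? "/write" : "-only",
	    (pte & PTE_US) ? "user" : "kernel");
}

int
main(void)
{
	/* after fault [1]: page shared read-only with the parent (COW) */
	describe("after [1]", PTE_P | PTE_US);
	/* after fault [2]: private writable copy installed */
	describe("after [2]", PTE_P | PTE_RW | PTE_US);
	/* from here on, writes to this page should not fault at all */
	return (0);
}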
Now I would expect that we can finally write to the stack in the
child process. But instead we now get *lots* of F_INVAL / S_WRITE
stack page faults at PC [2]. That doesn't seem correct. The
pagefault handler doesn't change anything in the MMU for these
pagefaults, because everything appears to be set up correctly; all
that is done is an INVLPG for the fault address on the stack, issued
through the hypervisor via a call to
HYPERVISOR_mmuext_op(MMUEXT_INVLPG_LOCAL). This
repeats a few thousand times.
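In terms of the Xen 3.x public interface, the hypercall issued for each of
these faults looks roughly like the sketch below. The wrapper function is
my own illustration, not the actual OpenSolaris code path; the header path
and the PAGEMASK usage are assumptions as well:

#include <sys/types.h>
#include <sys/param.h>		/* PAGEMASK (assumption) */
#include <xen/public/xen.h>	/* header path is an assumption; provides
				   struct mmuext_op, MMUEXT_INVLPG_LOCAL,
				   DOMID_SELF */

/*
 * Hypothetical sketch: flush one linear address from the local TLB
 * through the hypervisor. Per the trace, this is effectively all the
 * fault handler does for these repeated F_INVAL faults.
 */
static void
invlpg_local(caddr_t addr)
{
	struct mmuext_op op;

	op.cmd = MMUEXT_INVLPG_LOCAL;
	op.arg1.linear_addr = (ulong_t)addr & PAGEMASK;
	(void) HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
}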
After a while the problem disappears automagically and the
forked child process starts to run.
Of course, this high number of pagefaults is really bad for the
system's performance; the kernel uses almost 100% of system
cpu time to handle them.
Btw., the problem cannot be reproduced when running a
32-bit PV domU on a 64-bit dom0, or when running
both dom0 and domU in 64-bit mode.