Re: freebsd-5.4-stable panics

2005-10-15 Thread Rob Watt
On Thu, 13 Oct 2005, Rob Watt wrote:

 The test machine did panic. Unfortunately I was not running with
 BREAK_TO_DEBUGGER. I will re-run the tests with all of the debugging
 options we were using before, and then send you the trace info.

Unfortunately I was not able to reproduce the panics with
BREAK_TO_DEBUGGER compiled into the kernel. I have no log messages or
core info from the first panic/reboot. We've been having other problems
with our 5 machines, so it's possible (and likely) that the reboot I
saw was unrelated to your patch. I ran our stress tests for about 14 more
hours without incident. Previously, we were able to trigger the kern_proc bug
on a 6.0 machine within 0-2 hours.

We've only experienced the kern_proc bug in 6.0. The bugs we've
experienced in 5.4 are multicast/network-threading related. So
unfortunately the only thing we can really test for is whether the machine
is still stable with your patch.

I've had to put all of my test machines back into production running 6.0.
If I can free a 5 SMP machine for more tests, I will do so, but at this
point it would be good if someone else could test the 5.4 patch. I can't
post our simulations, but something similar to Antoine Pelisse's test
program should be sufficient to see if the patch works.

thanks for your help.

-
Rob Watt


___
freebsd-hackers@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-hackers
To unsubscribe, send any mail to [EMAIL PROTECTED]


Re: freebsd-5.4-stable panics

2005-10-12 Thread Rob Watt
  On Fri, 7 Oct 2005, Don Lewis wrote:
 I MFC'ed the fix to RELENG_6 last week, but the patch didn't apply
 cleanly to RELENG_5.  I tweaked the patch for RELENG_5 and tested it on
 a UP box.  I'd like to get some testing on SMP hardware before I commit
 it to RELENG_5, just to make sure that I don't destabilize -STABLE.  I
 do want to get the fix into RELENG_5, since this thread originated with
 a complaint about 5.4-STABLE.

I should be able to have a 5.4 machine available to test this tonight. Can
you send me the tweaked patch?

-
Rob Watt
___
freebsd-hackers@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-hackers
To unsubscribe, send any mail to [EMAIL PROTECTED]


Re: freebsd-5.4-stable panics

2005-10-12 Thread Don Lewis
On 12 Oct, Rob Watt wrote:
  On Fri, 7 Oct 2005, Don Lewis wrote:
 I MFC'ed the fix to RELENG_6 last week, but the patch didn't apply
 cleanly to RELENG_5.  I tweaked the patch for RELENG_5 and tested it on
 a UP box.  I'd like to get some testing on SMP hardware before I commit
 it to RELENG_5, just to make sure that I don't destabilize -STABLE.  I
 do want to get the fix into RELENG_5, since this thread originated with
 a complaint about 5.4-STABLE.
 
 I should be able to have a 5.4 machine available to test this tonight. Can
 you send me the tweaked patch?

I found a couple little nits that I fixed in this version:

Index: sys/kern/kern_proc.c
===
RCS file: /home/ncvs/src/sys/kern/kern_proc.c,v
retrieving revision 1.215.2.6
diff -u -r1.215.2.6 kern_proc.c
--- sys/kern/kern_proc.c22 Mar 2005 13:40:23 -  1.215.2.6
+++ sys/kern/kern_proc.c12 Oct 2005 19:13:14 -
@@ -72,6 +72,8 @@
 
 static void doenterpgrp(struct proc *, struct pgrp *);
 static void orphanpg(struct pgrp *pg);
+static void fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp);
+static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp);
 static void pgadjustjobc(struct pgrp *pgrp, int entering);
 static void pgdelete(struct pgrp *);
 static int proc_ctor(void *mem, int size, void *arg, int flags);
@@ -601,33 +603,22 @@
}
 }
 #endif /* DDB */
-void
-fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp);
 
 /*
- * Fill in a kinfo_proc structure for the specified process.
+ * Clear kinfo_proc and fill in any information that is common
+ * to all threads in the process.
  * Must be called with the target process locked.
  */
-void
-fill_kinfo_proc(struct proc *p, struct kinfo_proc *kp)
-{
-   fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), kp);
-}
-
-void
-fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp)
+static void
+fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp)
 {
-   struct proc *p;
struct thread *td0;
-   struct ksegrp *kg;
struct tty *tp;
struct session *sp;
struct timeval tv;
struct ucred *cred;
struct sigacts *ps;
 
-   p = td-td_proc;
-
bzero(kp, sizeof(*kp));
 
kp-ki_structsize = sizeof(*kp);
@@ -685,7 +676,8 @@
kp-ki_tsize = vm-vm_tsize;
kp-ki_dsize = vm-vm_dsize;
kp-ki_ssize = vm-vm_ssize;
-   }
+   } else if (p-p_state == PRS_ZOMBIE)
+   kp-ki_stat = SZOMB;
if ((p-p_sflag  PS_INMEM)  p-p_stats) {
kp-ki_start = p-p_stats-p_start;
timevaladd(kp-ki_start, boottime);
@@ -704,71 +696,6 @@
kp-ki_nice = p-p_nice;
bintime2timeval(p-p_runtime, tv);
kp-ki_runtime = tv.tv_sec * (u_int64_t)100 + tv.tv_usec;
-   if (p-p_state != PRS_ZOMBIE) {
-#if 0
-   if (td == NULL) {
-   /* XXXKSE: This should never happen. */
-   printf(fill_kinfo_proc(): pid %d has no threads!\n,
-   p-p_pid);
-   mtx_unlock_spin(sched_lock);
-   return;
-   }
-#endif
-   if (td-td_wmesg != NULL) {
-   strlcpy(kp-ki_wmesg, td-td_wmesg,
-   sizeof(kp-ki_wmesg));
-   }
-   if (TD_ON_LOCK(td)) {
-   kp-ki_kiflag |= KI_LOCKBLOCK;
-   strlcpy(kp-ki_lockname, td-td_lockname,
-   sizeof(kp-ki_lockname));
-   }
-
-   if (p-p_state == PRS_NORMAL) { /*  XXXKSE very approximate */
-   if (TD_ON_RUNQ(td) ||
-   TD_CAN_RUN(td) ||
-   TD_IS_RUNNING(td)) {
-   kp-ki_stat = SRUN;
-   } else if (P_SHOULDSTOP(p)) {
-   kp-ki_stat = SSTOP;
-   } else if (TD_IS_SLEEPING(td)) {
-   kp-ki_stat = SSLEEP;
-   } else if (TD_ON_LOCK(td)) {
-   kp-ki_stat = SLOCK;
-   } else {
-   kp-ki_stat = SWAIT;
-   }
-   } else {
-   kp-ki_stat = SIDL;
-   }
-
-   kg = td-td_ksegrp;
-
-   /* things in the KSE GROUP */
-   kp-ki_estcpu = kg-kg_estcpu;
-   kp-ki_slptime = kg-kg_slptime;
-   kp-ki_pri.pri_user = kg-kg_user_pri;
-   kp-ki_pri.pri_class = kg-kg_pri_class;
-
-   /* Things in the thread */
-   kp-ki_wchan = td-td_wchan;
-   kp-ki_pri.pri_level = td-td_priority;
-   kp-ki_pri.pri_native = td-td_base_pri;
-   kp-ki_lastcpu = td-td_lastcpu;
-   kp-ki_oncpu = 

Re: freebsd-5.4-stable panics

2005-10-11 Thread Rob Watt
On Mon, 10 Oct 2005, Rob Watt wrote:

 Don,

 On Fri, 7 Oct 2005, Don Lewis wrote:

  Both HEAD and RELENG_6 have been patched.  I've tested the following
  patch for RELENG_5 on a uniprocessor sparc64 box.  I'd appreciate it if
  anyone who was running into this problem on RELENG_5 with SMP hardware
  could test it before I do the MFC.

 We have a machine running with those patches applied. We need to do some
 other tests on it today, but tonight we will run our threaded applications
 that trigger the kern_proc problem in top. We should have results tomorrow
 morning.

Don,

I had misunderstood what you had asked. I tested this on a 6.0 machine. I
could not crash an amd64 SMP box running 6.0-BETA5 with this patch. I do
not have a test box running RELENG_5 to try this patch on right now. If I
can set up a test box I will let you know our results, but that may take a
day or two.

-
Rob Watt
___
freebsd-hackers@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-hackers
To unsubscribe, send any mail to [EMAIL PROTECTED]


Re: freebsd-5.4-stable panics

2005-10-11 Thread Don Lewis
On 11 Oct, Rob Watt wrote:
 On Mon, 10 Oct 2005, Rob Watt wrote:
 
 Don,

 On Fri, 7 Oct 2005, Don Lewis wrote:

  Both HEAD and RELENG_6 have been patched.  I've tested the following
  patch for RELENG_5 on a uniprocessor sparc64 box.  I'd appreciate it if
  anyone who was running into this problem on RELENG_5 with SMP hardware
  could test it before I do the MFC.

 We have a machine running with those patches applied. We need to do some
 other tests on it today, but tonight we will run our threaded applications
 that trigger the kern_proc problem in top. We should have results tomorrow
 morning.
 
 Don,
 
 I had misunderstood what you had asked. I tested this on a 6.0 machine. I
 could not crash an amd64 SMP box running 6.0-BETA5 with this patch. I do
 not have a test box running RELENG_5 to try this patch on right now. If I
 can set up a test box I will let you know our results, but that may take a
 day or two.

I MFC'ed the fix to RELENG_6 last week, but the patch didn't apply
cleanly to RELENG_5.  I tweaked the patch for RELENG_5 and tested it on
a UP box.  I'd like to get some testing on SMP hardware before I commit
it to RELENG_5, just to make sure that I don't destabilize -STABLE.  I
do want to get the fix into RELENG_5, since this thread originated with
a complaint about 5.4-STABLE.

___
freebsd-hackers@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-hackers
To unsubscribe, send any mail to [EMAIL PROTECTED]


Re: freebsd-5.4-stable panics

2005-10-10 Thread Rob Watt
Don,

On Fri, 7 Oct 2005, Don Lewis wrote:

 Both HEAD and RELENG_6 have been patched.  I've tested the following
 patch for RELENG_5 on a uniprocessor sparc64 box.  I'd appreciate it if
 anyone who was running into this problem on RELENG_5 with SMP hardware
 could test it before I do the MFC.

We have a machine running with those patches applied. We need to do some
other tests on it today, but tonight we will run our threaded applications
that trigger the kern_proc problem in top. We should have results tomorrow
morning.

-
Rob Watt
___
freebsd-hackers@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-hackers
To unsubscribe, send any mail to [EMAIL PROTECTED]


Re: freebsd-5.4-stable panics

2005-10-08 Thread Don Lewis
On  3 Oct, Rob Watt wrote:

 We noticed the patches from Don Lewis, but have not tested them yet. We
 weren't sure if we could just apply those patches against 6.0-BETA5, or
 whether we should wait for them to be MFC'd.

Both HEAD and RELENG_6 have been patched.  I've tested the following
patch for RELENG_5 on a uniprocessor sparc64 box.  I'd appreciate it if
anyone who was running into this problem on RELENG_5 with SMP hardware
could test it before I do the MFC.


Index: sys/kern/kern_proc.c
===
RCS file: /home/ncvs/src/sys/kern/kern_proc.c,v
retrieving revision 1.215.2.6
diff -u -r1.215.2.6 kern_proc.c
--- sys/kern/kern_proc.c22 Mar 2005 13:40:23 -  1.215.2.6
+++ sys/kern/kern_proc.c7 Oct 2005 23:17:26 -
@@ -72,6 +72,8 @@
 
 static void doenterpgrp(struct proc *, struct pgrp *);
 static void orphanpg(struct pgrp *pg);
+static void fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp);
+static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp);
 static void pgadjustjobc(struct pgrp *pgrp, int entering);
 static void pgdelete(struct pgrp *);
 static int proc_ctor(void *mem, int size, void *arg, int flags);
@@ -601,33 +603,22 @@
}
 }
 #endif /* DDB */
-void
-fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp);
 
 /*
- * Fill in a kinfo_proc structure for the specified process.
+ * Clear kinfo_proc and fill in any information that is common
+ * to all threads in the process.
  * Must be called with the target process locked.
  */
-void
-fill_kinfo_proc(struct proc *p, struct kinfo_proc *kp)
-{
-   fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), kp);
-}
-
-void
-fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp)
+static void
+fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp)
 {
-   struct proc *p;
struct thread *td0;
-   struct ksegrp *kg;
struct tty *tp;
struct session *sp;
struct timeval tv;
struct ucred *cred;
struct sigacts *ps;
 
-   p = td-td_proc;
-
bzero(kp, sizeof(*kp));
 
kp-ki_structsize = sizeof(*kp);
@@ -685,7 +676,8 @@
kp-ki_tsize = vm-vm_tsize;
kp-ki_dsize = vm-vm_dsize;
kp-ki_ssize = vm-vm_ssize;
-   }
+   } else if (p-p_state != PRS_ZOMBIE)
+   kp-ki_stat = SZOMB;
if ((p-p_sflag  PS_INMEM)  p-p_stats) {
kp-ki_start = p-p_stats-p_start;
timevaladd(kp-ki_start, boottime);
@@ -704,71 +696,6 @@
kp-ki_nice = p-p_nice;
bintime2timeval(p-p_runtime, tv);
kp-ki_runtime = tv.tv_sec * (u_int64_t)100 + tv.tv_usec;
-   if (p-p_state != PRS_ZOMBIE) {
-#if 0
-   if (td == NULL) {
-   /* XXXKSE: This should never happen. */
-   printf(fill_kinfo_proc(): pid %d has no threads!\n,
-   p-p_pid);
-   mtx_unlock_spin(sched_lock);
-   return;
-   }
-#endif
-   if (td-td_wmesg != NULL) {
-   strlcpy(kp-ki_wmesg, td-td_wmesg,
-   sizeof(kp-ki_wmesg));
-   }
-   if (TD_ON_LOCK(td)) {
-   kp-ki_kiflag |= KI_LOCKBLOCK;
-   strlcpy(kp-ki_lockname, td-td_lockname,
-   sizeof(kp-ki_lockname));
-   }
-
-   if (p-p_state == PRS_NORMAL) { /*  XXXKSE very approximate */
-   if (TD_ON_RUNQ(td) ||
-   TD_CAN_RUN(td) ||
-   TD_IS_RUNNING(td)) {
-   kp-ki_stat = SRUN;
-   } else if (P_SHOULDSTOP(p)) {
-   kp-ki_stat = SSTOP;
-   } else if (TD_IS_SLEEPING(td)) {
-   kp-ki_stat = SSLEEP;
-   } else if (TD_ON_LOCK(td)) {
-   kp-ki_stat = SLOCK;
-   } else {
-   kp-ki_stat = SWAIT;
-   }
-   } else {
-   kp-ki_stat = SIDL;
-   }
-
-   kg = td-td_ksegrp;
-
-   /* things in the KSE GROUP */
-   kp-ki_estcpu = kg-kg_estcpu;
-   kp-ki_slptime = kg-kg_slptime;
-   kp-ki_pri.pri_user = kg-kg_user_pri;
-   kp-ki_pri.pri_class = kg-kg_pri_class;
-
-   /* Things in the thread */
-   kp-ki_wchan = td-td_wchan;
-   kp-ki_pri.pri_level = td-td_priority;
-   kp-ki_pri.pri_native = td-td_base_pri;
-   kp-ki_lastcpu = td-td_lastcpu;
-   kp-ki_oncpu = td-td_oncpu;
-   kp-ki_tdflags = td-td_flags;
-   kp-ki_tid = td-td_tid;
-   kp-ki_numthreads = p-p_numthreads;
-   

re: freebsd-5.4-stable panics

2005-10-04 Thread Rob Watt
 It turns out that the sysctl buffer is already wired in one of the two
 cases
 that this function is called, so I moved the wiring up to the upper
layer
 in
 the other case and cut out a bunch of the locking gymnastics as a
result.
 Can you try this patch?

 Index: kern_proc.c
 ===
 RCS file: /usr/cvs/src/sys/kern/kern_proc.c,v
 retrieving revision 1.231
 diff -u -r1.231 kern_proc.c
 --- kern_proc.c 27 Sep 2005 18:03:15 - 1.231
 +++ kern_proc.c 30 Sep 2005 17:04:57 -
 @@ -875,22 +875,16 @@

 if (flags  KERN_PROC_NOTHREADS) {
 fill_kinfo_proc(p, kinfo_proc);
 - PROC_UNLOCK(p);
 error = SYSCTL_OUT(req, (caddr_t)kinfo_proc,
 sizeof(kinfo_proc));
 - PROC_LOCK(p);
 } else {
 - _PHOLD(p);
 FOREACH_THREAD_IN_PROC(p, td) {
 fill_kinfo_thread(td, kinfo_proc);
 - PROC_UNLOCK(p);
 error = SYSCTL_OUT(req, (caddr_t)kinfo_proc,
 sizeof(kinfo_proc));
 - PROC_LOCK(p);
 if (error)
 break;
 }
 - _PRELE(p);
 }
 PROC_UNLOCK(p);
 if (error)
 @@ -932,6 +926,9 @@
 if (oid_number == KERN_PROC_PID) {
 if (namelen != 1)
 return (EINVAL);
 + error = sysctl_wire_old_buffer(req, 0);
 + if (error)
 + return (error);
 p = pfind((pid_t)name[0]);
 if (!p)
 return (ESRCH);

John,

We tried this patch and were able to run our simulations (and top) for 3
days straight without crashing. Since we were panicking every 3-6 hours
before when running top, this seems to have fixed the problem.

We noticed the patches from Don Lewis, but have not tested them yet. We
weren't sure if we could just apply those patches against 6.0-BETA5, or
whether we should wait for them to be MFC'd.

-
Rob Watt
___
freebsd-hackers@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-hackers
To unsubscribe, send any mail to [EMAIL PROTECTED]


Re: freebsd-5.4-stable panics

2005-10-04 Thread Don Lewis
On  3 Oct, Rob Watt wrote:
 It turns out that the sysctl buffer is already wired in one of the two
 cases
 that this function is called, so I moved the wiring up to the upper
 layer
 in
 the other case and cut out a bunch of the locking gymnastics as a
 result.
 Can you try this patch?

 Index: kern_proc.c
 ===
 RCS file: /usr/cvs/src/sys/kern/kern_proc.c,v
 retrieving revision 1.231
 diff -u -r1.231 kern_proc.c
 --- kern_proc.c 27 Sep 2005 18:03:15 - 1.231
 +++ kern_proc.c 30 Sep 2005 17:04:57 -
 @@ -875,22 +875,16 @@

 if (flags  KERN_PROC_NOTHREADS) {
 fill_kinfo_proc(p, kinfo_proc);
 - PROC_UNLOCK(p);
 error = SYSCTL_OUT(req, (caddr_t)kinfo_proc,
 sizeof(kinfo_proc));
 - PROC_LOCK(p);
 } else {
 - _PHOLD(p);
 FOREACH_THREAD_IN_PROC(p, td) {
 fill_kinfo_thread(td, kinfo_proc);
 - PROC_UNLOCK(p);
 error = SYSCTL_OUT(req, (caddr_t)kinfo_proc,
 sizeof(kinfo_proc));
 - PROC_LOCK(p);
 if (error)
 break;
 }
 - _PRELE(p);
 }
 PROC_UNLOCK(p);
 if (error)
 @@ -932,6 +926,9 @@
 if (oid_number == KERN_PROC_PID) {
 if (namelen != 1)
 return (EINVAL);
 + error = sysctl_wire_old_buffer(req, 0);
 + if (error)
 + return (error);
 p = pfind((pid_t)name[0]);
 if (!p)
 return (ESRCH);
 
 John,
 
 We tried this patch and were able to run our simulations (and top) for 3
 days straight without crashing. Since we were panicking every 3-6 hours
 before when running top, this seems to have fixed the problem.
 
 We noticed the patches from Don Lewis, but have not tested them yet. We
 weren't sure if we could just apply those patches against 6.0-BETA5, or
 whether we should wait for them to be MFC'd.

I haven't tried applying my patch to RELENG_5 yet, but hope to do so in
the next few days in preparation for doing a MFC.  If any changes are
required, I can send you a copy of the patch.


___
freebsd-hackers@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-hackers
To unsubscribe, send any mail to [EMAIL PROTECTED]


Re: freebsd-5.4-stable panics

2005-10-02 Thread Don Lewis
On  1 Oct, Don Lewis wrote:
 On 30 Sep, John Baldwin wrote:

 It turns out that the sysctl buffer is already wired in one of the two cases 
 that this function is called, so I moved the wiring up to the upper layer in 
 the other case and cut out a bunch of the locking gymnastics as a result.  

 
 Index: kern_proc.c
 ===
 RCS file: /usr/cvs/src/sys/kern/kern_proc.c,v
 retrieving revision 1.231
 diff -u -r1.231 kern_proc.c
 --- kern_proc.c  27 Sep 2005 18:03:15 -  1.231
 +++ kern_proc.c  30 Sep 2005 17:04:57 -
 @@ -875,22 +875,16 @@
  
  if (flags  KERN_PROC_NOTHREADS) {
  fill_kinfo_proc(p, kinfo_proc);
 -PROC_UNLOCK(p);
  error = SYSCTL_OUT(req, (caddr_t)kinfo_proc,
 sizeof(kinfo_proc));
 -PROC_LOCK(p);
  } else {
 -_PHOLD(p);
  FOREACH_THREAD_IN_PROC(p, td) {
  fill_kinfo_thread(td, kinfo_proc);
 -PROC_UNLOCK(p);
  error = SYSCTL_OUT(req, (caddr_t)kinfo_proc,
 sizeof(kinfo_proc));
 -PROC_LOCK(p);
  if (error)
  break;
  }
 -_PRELE(p);
  }
  PROC_UNLOCK(p);
  if (error)
 @@ -932,6 +926,9 @@
  if (oid_number == KERN_PROC_PID) {
  if (namelen != 1) 
  return (EINVAL);
 +error = sysctl_wire_old_buffer(req, 0);
 +if (error)
 +return (error); 
  p = pfind((pid_t)name[0]);
  if (!p)
  return (ESRCH);
 
 
 sched_lock needs to be grabbed before the FOREACH_THREAD_IN_PROC loop.
 
 Can _PHOLD()/_PRELE() be dropped?

It turns out that fill_kinfo_thread() grabs a bunch of locks to grab
things out of struct proc, which breaks badly if sched_lock is grabbed
before calling fill_kinfo_thread().

I refactored fill_kinfo_thread() into two functions, one of which
doesn't need any additional locks and only gathers per-thread data, and
a new function, fill_kinfo_proc_only(), which gathers the data that is
common to all threads and can be called before grabbing sched_lock.  This
should be more efficient if there is more than one thread because the
per-process data is only gathered once, and only the per-thread data in
kinfo_proc is overwritten for each thread.

Index: kern_proc.c
===
RCS file: /home/ncvs/src/sys/kern/kern_proc.c,v
retrieving revision 1.231
diff -u -r1.231 kern_proc.c
--- kern_proc.c 27 Sep 2005 18:03:15 -  1.231
+++ kern_proc.c 2 Oct 2005 08:48:56 -
@@ -73,6 +73,8 @@
 
 static void doenterpgrp(struct proc *, struct pgrp *);
 static void orphanpg(struct pgrp *pg);
+static void fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp);
+static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp);
 static void pgadjustjobc(struct pgrp *pgrp, int entering);
 static void pgdelete(struct pgrp *);
 static int proc_ctor(void *mem, int size, void *arg, int flags);
@@ -596,33 +598,22 @@
}
 }
 #endif /* DDB */
-void
-fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp);
 
 /*
- * Fill in a kinfo_proc structure for the specified process.
+ * Clear kinfo_proc and fill in any information that is common
+ * to all threads in the process.
  * Must be called with the target process locked.
  */
-void
-fill_kinfo_proc(struct proc *p, struct kinfo_proc *kp)
-{
-   fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), kp);
-}
-
-void
-fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp)
+static void
+fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp)
 {
-   struct proc *p;
struct thread *td0;
-   struct ksegrp *kg;
struct tty *tp;
struct session *sp;
struct timeval tv;
struct ucred *cred;
struct sigacts *ps;
 
-   p = td-td_proc;
-
bzero(kp, sizeof(*kp));
 
kp-ki_structsize = sizeof(*kp);
@@ -684,78 +675,14 @@
kp-ki_tsize = vm-vm_tsize;
kp-ki_dsize = vm-vm_dsize;
kp-ki_ssize = vm-vm_ssize;
-   }
+   } else if (p-p_state == PRS_ZOMBIE)
+   kp-ki_stat = SZOMB;
kp-ki_sflag = p-p_sflag;
kp-ki_swtime = p-p_swtime;
kp-ki_pid = p-p_pid;
kp-ki_nice = p-p_nice;
bintime2timeval(p-p_rux.rux_runtime, tv);
kp-ki_runtime = tv.tv_sec * (u_int64_t)100 + tv.tv_usec;
-   if (p-p_state != PRS_ZOMBIE) {
-#if 0
-   if (td == NULL) {
-   /* XXXKSE: This should never happen. */
-   printf(fill_kinfo_proc(): pid %d has no threads!\n,
-   p-p_pid);
-   mtx_unlock_spin(sched_lock);
-   return;
-   }

Re: freebsd-5.4-stable panics

2005-10-02 Thread Don Lewis
On  2 Oct, Don Lewis wrote:

 It turns out that fill_kinfo_thread() grabs a bunch of locks to grab
 things out of struct proc, which breaks badly if sched_lock is grabbed
 before calling fill_kinfo_thread().
 
 I refactored fill_kinfo_thread() into two functions, one of which
 doesn't need any additional locks and only gathers per-thread data, and
 a new function, fill_kinfo_proc_only(), which gathers the data that is
 common to all threads and can be called before grabbing sched_lock.  This
 should be more efficient if there is more than one thread because the
 per-process data is only gathered once, and only the per-thread data in
 kinfo_proc is overwritten for each thread.

[ snip ]

After fixing a few whitespace nits and one minor buglet, I committed my
patch to HEAD, in kern_proc.c 1.232.  I hope to be able to MFC it soon.

___
freebsd-hackers@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-hackers
To unsubscribe, send any mail to [EMAIL PROTECTED]


Re: freebsd-5.4-stable panics

2005-10-01 Thread Antoine Pelisse
On 9/30/05, John Baldwin [EMAIL PROTECTED] wrote:

 On Friday 30 September 2005 11:25 am, Antoine Pelisse wrote:
  On 9/30/05, John Baldwin [EMAIL PROTECTED] wrote:
   On Friday 30 September 2005 05:24 am, Antoine Pelisse wrote:
Hi Robert,
I don't think your patch is correct, the total linked list can be
broken
   
while the lock is released, thus just passing the link may not be
enough I have submitted a PR[1] for this a month ago but nobody took
care of it yet Regards,
Antoine Pelisse
   
[1] http://www.freebsd.org/cgi/query-pr.cgi?pr=kern/84684
  
   I think this patch looks ok. Robert, can you get the original panic on
   this
   thread tested against this patch?
 
  I had a small program which could reproduce this panic in 10 seconds, it
  was basically creating empty threads and calling kvm_getprocs() in the
 same
  time. Anyway the patch was able to stop the program from panicking.
  The panic is also reproducible in RELENG_6 and HEAD IIRC.

 It turns out that the sysctl buffer is already wired in one of the two
 cases
 that this function is called, so I moved the wiring up to the upper layer
 in
 the other case and cut out a bunch of the locking gymnastics as a result.
 Can you try this patch?

 Index: kern_proc.c
 ===
 RCS file: /usr/cvs/src/sys/kern/kern_proc.c,v
 retrieving revision 1.231
 diff -u -r1.231 kern_proc.c
 --- kern_proc.c 27 Sep 2005 18:03:15 - 1.231
 +++ kern_proc.c 30 Sep 2005 17:04:57 -
 @@ -875,22 +875,16 @@

 if (flags  KERN_PROC_NOTHREADS) {
 fill_kinfo_proc(p, kinfo_proc);
 - PROC_UNLOCK(p);
 error = SYSCTL_OUT(req, (caddr_t)kinfo_proc,
 sizeof(kinfo_proc));
 - PROC_LOCK(p);
 } else {
 - _PHOLD(p);
 FOREACH_THREAD_IN_PROC(p, td) {
 fill_kinfo_thread(td, kinfo_proc);
 - PROC_UNLOCK(p);
 error = SYSCTL_OUT(req, (caddr_t)kinfo_proc,
 sizeof(kinfo_proc));
 - PROC_LOCK(p);
 if (error)
 break;
 }
 - _PRELE(p);
 }
 PROC_UNLOCK(p);
 if (error)
 @@ -932,6 +926,9 @@
 if (oid_number == KERN_PROC_PID) {
 if (namelen != 1)
 return (EINVAL);
 + error = sysctl_wire_old_buffer(req, 0);
 + if (error)
 + return (error);
 p = pfind((pid_t)name[0]);
 if (!p)
 return (ESRCH);

 --
 John Baldwin [EMAIL PROTECTED]  http://www.FreeBSD.org/~jhb/
 Power Users Use the Power to Serve = http://www.FreeBSD.org

 Hi John,
 I'm sorry I can't test it right now, I'm in a foreign country for three
months and can only connect to the internet through the university
connection.
I'll be back home mid-december. Maybe Rob should take care of testing it.
 Regards,
Antoine Pelisse
___
freebsd-hackers@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-hackers
To unsubscribe, send any mail to [EMAIL PROTECTED]


Re: freebsd-5.4-stable panics

2005-10-01 Thread Don Lewis
On 30 Sep, Antoine Pelisse wrote:
  Hi Robert,
 I don't think your patch is correct, the total linked list can be broken
 while the lock is released, thus just passing the link may not be enough
 I have submitted a PR[1] for this a month ago but nobody took care of it yet

There are two problems with your patch:

sched_lock needs to be held while iterating over the threads

sysctl_kern_proc() calls sysctl_out_proc() multiple times in a
loop in the !KERN_PROC_PID case, so the buffer needs to be wired
before calling sysctl_out_proc().

Is _PHOLD()/_PRELE() needed if we don't drop PROC_LOCK?

Passing a size estimate to sysctl_wire_old_buffer() is desirable, but
sysctl_out_proc() would need some restructuring to do this correctly.

___
freebsd-hackers@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-hackers
To unsubscribe, send any mail to [EMAIL PROTECTED]


Re: freebsd-5.4-stable panics

2005-10-01 Thread Don Lewis
On 30 Sep, John Baldwin wrote:
 On Friday 30 September 2005 11:25 am, Antoine Pelisse wrote:
 On 9/30/05, John Baldwin [EMAIL PROTECTED] wrote:
  On Friday 30 September 2005 05:24 am, Antoine Pelisse wrote:
   Hi Robert,
   I don't think your patch is correct, the total linked list can be
   broken
  
   while the lock is released, thus just passing the link may not be
   enough I have submitted a PR[1] for this a month ago but nobody took
   care of it yet Regards,
   Antoine Pelisse
  
   [1] http://www.freebsd.org/cgi/query-pr.cgi?pr=kern/84684
 
  I think this patch looks ok. Robert, can you get the original panic on
  this
  thread tested against this patch?

  I had a small program which could reproduce this panic in 10 seconds, it
 was basically creating empty threads and calling kvm_getprocs() in the same
 time. Anyway the patch was able to stop the program from panicking.
 The panic is also reproducible in RELENG_6 and HEAD IIRC.
 
 It turns out that the sysctl buffer is already wired in one of the two cases 
 that this function is called, so I moved the wiring up to the upper layer in 
 the other case and cut out a bunch of the locking gymnastics as a result.  
 Can you try this patch?
 
 Index: kern_proc.c
 ===
 RCS file: /usr/cvs/src/sys/kern/kern_proc.c,v
 retrieving revision 1.231
 diff -u -r1.231 kern_proc.c
 --- kern_proc.c   27 Sep 2005 18:03:15 -  1.231
 +++ kern_proc.c   30 Sep 2005 17:04:57 -
 @@ -875,22 +875,16 @@
  
   if (flags  KERN_PROC_NOTHREADS) {
   fill_kinfo_proc(p, kinfo_proc);
 - PROC_UNLOCK(p);
   error = SYSCTL_OUT(req, (caddr_t)kinfo_proc,
  sizeof(kinfo_proc));
 - PROC_LOCK(p);
   } else {
 - _PHOLD(p);
   FOREACH_THREAD_IN_PROC(p, td) {
   fill_kinfo_thread(td, kinfo_proc);
 - PROC_UNLOCK(p);
   error = SYSCTL_OUT(req, (caddr_t)kinfo_proc,
  sizeof(kinfo_proc));
 - PROC_LOCK(p);
   if (error)
   break;
   }
 - _PRELE(p);
   }
   PROC_UNLOCK(p);
   if (error)
 @@ -932,6 +926,9 @@
   if (oid_number == KERN_PROC_PID) {
   if (namelen != 1) 
   return (EINVAL);
 + error = sysctl_wire_old_buffer(req, 0);
 + if (error)
 + return (error); 
   p = pfind((pid_t)name[0]);
   if (!p)
   return (ESRCH);
 

sched_lock needs to be grabbed before the FOREACH_THREAD_IN_PROC loop.

Can _PHOLD()/_PRELE() be dropped?

___
freebsd-hackers@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-hackers
To unsubscribe, send any mail to [EMAIL PROTECTED]


freebsd-5.4-stable panics

2005-09-30 Thread Antoine Pelisse
 Hi Robert,
I don't think your patch is correct: the total linked list can be broken
while the lock is released, so just passing the link may not be enough.
I submitted a PR[1] for this a month ago, but nobody has taken care of it yet.
  Regards,
Antoine Pelisse

[1] http://www.freebsd.org/cgi/query-pr.cgi?pr=kern/84684
  On 9/29/05, Robert Watson [EMAIL PROTECTED] wrote:

 On Thu, 29 Sep 2005, Rob Watt wrote:

  On Thu, 29 Sep 2005, Robert Watson wrote:
 
  Could you dump the contents of *td and *td->td_proc for me? I'm quite
  interested to know what the value in td->td_proc->p_state is, among
 other
  things. If I could also have you generate a dump of the KSE group
  structures in td->td_proc->p_ksegrps and the threads in
  td->td_proc->p_threads.
 
  I've attached a file with many of the values you have asked for. We
  looked at some of the threads referenced by td->td_proc->p_threads, but
  we weren't sure we were walking the list correctly. Do you have any tips

  for walking those thread lists?
 
  Could you tell me if the program named by p->p_comm is linked against a
  threading library? If it's a custom app, you may already know, and if
  not, you can run ldd on the application to see what it is linked
  against.
 
  The program named by p->p_comm is linked against the pthreads library.

 This seems to be enough information to at least track this down a bit:
 td_ksegrp is NULL, rather than a corrupt value, which suggests that the
 thread is incompletely initialized. Other hints that this is the case
 are that td_critnest is 1 (as is set when it is allocated), and the state
 is TDS_INACTIVE. Some other fields are set though, such as td_oncpu,
 which is normally initialized to NOCPU.

  (kgdb) p *td
  $1 = {td_proc = 0xff004aa9f000, td_ksegrp = 0x0, td_plist =
  {tqe_next = 0xff 00b4798000,
  tqe_prev = 0xff00a97ae010}, td_kglist = {tqe_next =
  0xff00b4798000,
  tqe_prev = 0xff00a97ae020}, td_slpq = {tqe_next = 0x0, tqe_prev
  = 0x ff001fac7c10}, td_lockq = {
  tqe_next = 0xff00a97ae000, tqe_prev = 0xb6797a70},
  td_runq = {tq e_next = 0x0,
  tqe_prev = 0x80608180}, td_selq = {tqh_first = 0x0, tqh_last
  = 0xfff fff00633112c0},
  td_sleepqueue = 0xff00382b0400, td_turnstile = 0xff00c1712900,
  td_umtx q = 0xff00d1207080,
  td_tid = 100253, td_flags = 16777216, td_inhibitors = 0, td_pflags =
  128, td_d upfd = 0, td_wchan = 0x0,
  td_wmesg = 0x0, td_lastcpu = 2 '\002', td_oncpu = 2 '\002',
  td_owepreempt = 0 '\0', td_locks = 0,
  td_blocked = 0x0, td_ithd = 0x0, td_lockname = 0x0, td_contested =
  {lh_first =
  0x0}, td_sleeplocks = 0x0,
  td_intr_nesting_level = 0, td_pinned = 0, td_mailbox = 0x0, td_ucred =
  0xf f00ad18f200,
  td_standin = 0x0, td_upcall = 0x0, td_sticks = 0, td_uuticks = 0,
  td_usticks =
  0, td_intrval = 0,
  td_oldsigmask = {__bits = {0, 0, 0, 0}}, td_sigmask = {__bits =
  {4294967295, 4 294967295, 4294967295,
  4294967295}}, td_siglist = {__bits = {0, 0, 0, 0}}, td_generation
  = 14, td _sigstk = {ss_sp = 0x0,
  ss_size = 0, ss_flags = 0}, td_kflags = 0, td_xsig = 0,
  td_profil_addr = 0, td_profil_ticks = 0,
  td_base_pri = 182 '\u', td_priority = 182 '\u', td_pcb =
  0xb68 dcd10, td_state = TDS_INACTIVE,
  td_retval = {1, 29309280}, td_slpcallout = {c_links = {sle = {sle_next
  = 0x0},
  tqe = {tqe_next = 0x0,
  tqe_prev = 0xff001fac7d80}}, c_time = 55907602, c_arg =
  0xff0063 311260,
  c_func = 0x802e32a0 sleepq_timeout, c_mtx = 0x0, c_flags =
  16}, td _frame = 0xb68dcc40,
  td_kstack_obj = 0xff0087f93d20, td_kstack = 18446744072477315072,
  td_kstac k_pages = 4,
  td_altkstack_obj = 0x0, td_altkstack = 0, td_altkstack_pages = 0,
  td_critnest = 1, td_md = {
  md_spinlock_count = 1, md_saved_flags = 582}, td_sched =
  0xff0063311488}

 I'm not familiar with the internals of the thread and KSE life cycle here,

 so I think we'll need to look to those more familiar with this to
 understand what of two things may be going on:

 (1) Is the fact that td_ksegrp != NULL an invariant for a connected
 thread, and that kern_proc is relying on that but the thread code is
 failing to implement it safely?

 (2) Is td_ksegrp sometimes left legitimately as NULL as part of the thread
 life cycle, and that kern_proc incorrectly assumes that it is never
 NULL when hooked up to a thread.

 This suggests a possible work-around of simply testing td_ksegrp for NULL
 in kern_proc in order to avoid this, while attempting to resolve whether
 an invariant is violated (or incorrectly assumed), which might require
 some serious thinking and a solution that is non-trivial. Something like
 the following might work in the mean time:

 Index: kern_proc.c
 ===
 RCS file: /home/ncvs/src/sys/kern/kern_proc.c,v
 retrieving revision 1.231
 diff -u -r1.231 kern_proc.c
 --- kern_proc.c 27 Sep 2005 18:03:15 - 1.231
 +++ kern_proc.c 29 Sep 

Re: freebsd-5.4-stable panics

2005-09-30 Thread Rob Watt
Robert,

We have gotten some more information from our type1 crash:

sh lockedvnods
Locked vnodes

sh alllocks
Process 2204 (dataplay) thread 0xff00b1726a000 (100214)
exclusive sleep mutex inp (udpinp) f = 0 (0xff00cc90fcc8) locked @
/usr/src/sys/netinet/udp_usrreq.c:762
Process 62 (pagedaemon) thread 0xff00e358c280 (100049)
exclusive sleep mutex UMA lock r = 0 (0x8068bf80) locked @
/usr/src/sys/vm/uma_core.c:1491
exclusive sleep mutex Giant r = 0 (0x8062ed80) locked @
/usr/src/sys/vm/vm_pageout.c:717
Process 48 (swi1:net) thread 0xff00e3597780 (100027)
exclusive sleep mutex IPFW static rules r = 0 (0x8067ae50) locked
@ /usr/src/sys/netinet/ip_fw2.c:149

sh pcpu
cpuid=0
currthread  = 0xff00e358c280: pid 63 pagedaemon
currpcb = 0xb34e3d10
fpcurrthread= none
idle thread = 0xff00e35b6000: pid 14 (idle cpu0)
spin locks held =

sh pcpu 1
cpuid=1
currthread  = 0xff00e358b3c80: pid 13 idle cpu1
currpcb = 0xffb34e7d10
fpcurrthread= none
idle thread = 0xff00e358b3c80: pid 13 (idle cpu1)
spin locks held =

sh pcpu 2
cpuid=2
currthread  = 0xff00e35e4000: pid 2715 bonnie
currpcb = 0xffb636dd10
fpcurrthread= none
idle thread = 0xff00e35b3a00: pid 12 (idle cpu2)
spin locks held =

sh pcpu 3
cpuid=3
currthread  = 0xff00e35aea00: pid 40 irq27: em1 em2
currpcb = 0xffb34b6d10
fpcurrthread= none
idle thread = 0xff00e35b3780: pid 11 (idle cpu0)
spin locks held =


I have attached the core output as type1-core.2.txt, but unfortunately it
does not help us determine the area of code that triggered the exception.

If I can get more DDB output from the type2 crash I will post it.

There is some encouraging news: since we stopped running top, our
6.0-BETA5 test machine has not crashed (it's been running tests now for
over 26 hours).

We also started running tests on a dual single-core machine running
5-STABLE. That machine has been running for 50 hours without crashing.

This means that we are now only hitting these bugs with dual dual-core
machines running 5-STABLE.

-
Rob Watt

DDB:
sh lockedvnods
Locked vnodes

sh alllocks
Process 2204 (dataplay) thread 0xff00b1726a000 (100214)
exclusive sleep mutex inp (udpinp) f = 0 (0xff00cc90fcc8) locked @ 
/usr/src/sys/netinet/udp_usrreq.c:762
Process 62 (pagedaemon) thread 0xff00e358c280 (100049)
exclusive sleep mutex UMA lock r = 0 (0x8068bf80) locked @ 
/usr/src/sys/vm/uma_core.c:1491
exclusive sleep mutex Giant r = 0 (0x8062ed80) locked @ 
/usr/src/sys/vm/vm_pageout.c:717
Process 48 (swi1:net) thread 0xff00e3597780 (100027)
exclusive sleep mutex IPFW static rules r = 0 (0x8067ae50) locked @ 
/usr/src/sys/netinet/ip_fw2.c:149

sh pcpu
cpuid=0
currthread  = 0xff00e358c280: pid 63 pagedaemon
currpcb = 0xb34e3d10
fpcurrthread= none
idle thread = 0xff00e35b6000: pid 14 (idle cpu0)
spin locks held =

sh pcpu 1
cpuid=1
currthread  = 0xff00e358b3c80: pid 13 idle cpu1
currpcb = 0xffb34e7d10
fpcurrthread= none
idle thread = 0xff00e358b3c80: pid 13 idle cpu1
spin locks held =

sh pcpu 2
cpuid=2
currthread  = 0xff00e35e4000: pid 2715 bonnie
currpcb = 0xffb636dd10
fpcurrthread= none
idle thread = 0xff00e35b3a00: pid 12 idle cpu2
spin locks held =

sh pcpu 3 
cpuid=3
currthread  = 0xff00e35aea00: pid 40 irq27: em1 em2
currpcb = 0xffb34b6d10
fpcurrthread= none
idle thread = 0xff00e35b3780: pid 11 idle cpu0
spin locks held =



KGDB:
Unread portion of the kernel message buffer:
panic: No TID bitmap?
cpuid = 0
KDB: enter: panic

#0  doadump () at pcpu.h:167
167 pcpu.h: No such file or directory.
in pcpu.h
(kgdb) bt
#0  doadump () at pcpu.h:167
#1  0x801924f6 in db_fncall (dummy1=0, dummy2=0, dummy3=0, dummy4=0x0) 
at /usr/src/sys/ddb/db_command.c:531
#2  0x80192985 in db_command_loop () at 
/usr/src/sys/ddb/db_command.c:349
#3  0x80194833 in db_trap (type=-1286719648, code=0) at 
/usr/src/sys/ddb/db_main.c:221
#4  0x802cb8f0 in kdb_trap (type=3, code=0, tf=0x0) at 
/usr/src/sys/kern/subr_kdb.c:470
#5  0x804169dc in trap (frame=
  {tf_rdi = 0, tf_rsi = -2136928256, tf_rdx = 0, tf_rcx = 523776, tf_r8 = 
-1286719440, tf_r9 = 10, tf_rax = 18, tf_rbx = -2142686258, tf_rbp = 
-1286719200, tf_r10 = 20765, tf_r11 = 0, tf_r12 = 0, tf_r13 = 256, tf_r14 = 
-1095697382784, tf_r15 = 768605, tf_trapno = 3, tf_addr = 0, tf_flags = 256, 
tf_err = 0, tf_rip = -2144554161, tf_cs = 8, tf_rflags = 642, tf_rsp = 
-1286719200, tf_ss = 16}) at /usr/src/sys/amd64/amd64/trap.c:431
#6  0x804046fb in calltrap () at 
/usr/src/sys/amd64/amd64/exception.S:171
#7  0x in ?? ()
#8  0x80a11000 in ?? ()
#9  0x in ?? ()
#10 0x0007fe00 in ?? ()
#11 0xb34e3830 in 

Re: freebsd-5.4-stable panics

2005-09-30 Thread Rob Watt
On Thu, 29 Sep 2005, Robert Watson wrote:

 Could you dump the contents of *td and *td-td_proc for me?  I'm quite
 interested to know what the value in td-td_proc-p_state is, among other
 things.  If I could also have you generate a dump of the KSE group
 structures in td-td_proc-p_ksegrps and the threads in
 td-td_proc-p_threads.

I've attached a file with many of the values you have asked for. We
looked at some of the threads referenced by td-td_proc-p_threads, but we
weren't sure we were walking the list correctly. Do you have any tips for
walking those thread lists?


 Could you tell me if the program named by p-p_comm is linked against a
 threading library?  If it's a custom app, you may already know, and if
 not, you can run ldd on the application to see what it is linked against.


The program named by p->p_comm is linked against the pthreads library.

 Depending on how much time you have available, it might make sense for me
 to grab from you a copy of your source tree, compiled kernel with debug
 symbols, and core dump.

We can upload the source, kernel etc somewhere, but uncompressed that is
about 5G of data. What is the best way to get that to you?

Thanks.

-
Rob Watt

6.0-BETA5.kgdb.out
Description: Binary data
___
freebsd-hackers@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-hackers
To unsubscribe, send any mail to [EMAIL PROTECTED]

Re: freebsd-5.4-stable panics

2005-09-30 Thread John Baldwin
On Friday 30 September 2005 05:24 am, Antoine Pelisse wrote:
  Hi Robert,
 I don't think your patch is correct, the total linked list can be broken
 while the lock is released, thus just passing the link may not be enough
 I have submitted a PR[1] for this a month ago but nobody took care of it
 yet Regards,
 Antoine Pelisse

 [1] http://www.freebsd.org/cgi/query-pr.cgi?pr=kern/84684

I think this patch looks ok.  Robert, can you get the original panic on this 
thread tested against this patch?

   On 9/29/05, Robert Watson [EMAIL PROTECTED] wrote:
  On Thu, 29 Sep 2005, Rob Watt wrote:
   On Thu, 29 Sep 2005, Robert Watson wrote:
   Could you dump the contents of *td and *td-td_proc for me? I'm quite
   interested to know what the value in td-td_proc-p_state is, among
 
  other
 
   things. If I could also have you generate a dump of the KSE group
   structures in td-td_proc-p_ksegrps and the threads in
   td-td_proc-p_threads.
  
   I've attached a file with many of the values you have asked for. We
   looked at some of the threads referenced by td-td_proc-p_threads, but
   we weren't sure we were walking the list correctly. Do you have any
   tips
  
   for walking those thread lists?
  
   Could you tell me if the program named by p-p_comm is linked against
   a threading library? If it's a custom app, you may already know, and
   if not, you can run ldd on the application to see what it is linked
   against.
  
   The programs named by p-p_comm is linked against the pthreads library.
 
  This seems to be enough information to at least track this down a bit:
  td_ksegrp is NULL, rather than a corrupt value, which suggests that the
  thread is incompletely initialized. Other hints that this are the case
  are that td_critnest is 1 (as is set when it is allocated), and the state
  is TDS_INACTIVE. Some other fields are set though, such as td_oncpu,
  which is normally initialized to NOCPU.
 
   (kgdb) p *td
   $1 = {td_proc = 0xff004aa9f000, td_ksegrp = 0x0, td_plist =
   {tqe_next = 0xff 00b4798000,
   tqe_prev = 0xff00a97ae010}, td_kglist = {tqe_next =
   0xff00b4798000,
   tqe_prev = 0xff00a97ae020}, td_slpq = {tqe_next = 0x0, tqe_prev
   = 0x ff001fac7c10}, td_lockq = {
   tqe_next = 0xff00a97ae000, tqe_prev = 0xb6797a70},
   td_runq = {tq e_next = 0x0,
   tqe_prev = 0x80608180}, td_selq = {tqh_first = 0x0, tqh_last
   = 0xfff fff00633112c0},
   td_sleepqueue = 0xff00382b0400, td_turnstile = 0xff00c1712900,
   td_umtx q = 0xff00d1207080,
   td_tid = 100253, td_flags = 16777216, td_inhibitors = 0, td_pflags =
   128, td_d upfd = 0, td_wchan = 0x0,
   td_wmesg = 0x0, td_lastcpu = 2 '\002', td_oncpu = 2 '\002',
   td_owepreempt = 0 '\0', td_locks = 0,
   td_blocked = 0x0, td_ithd = 0x0, td_lockname = 0x0, td_contested =
   {lh_first =
   0x0}, td_sleeplocks = 0x0,
   td_intr_nesting_level = 0, td_pinned = 0, td_mailbox = 0x0, td_ucred =
   0xf f00ad18f200,
   td_standin = 0x0, td_upcall = 0x0, td_sticks = 0, td_uuticks = 0,
   td_usticks =
   0, td_intrval = 0,
   td_oldsigmask = {__bits = {0, 0, 0, 0}}, td_sigmask = {__bits =
   {4294967295, 4 294967295, 4294967295,
   4294967295}}, td_siglist = {__bits = {0, 0, 0, 0}}, td_generation
   = 14, td _sigstk = {ss_sp = 0x0,
   ss_size = 0, ss_flags = 0}, td_kflags = 0, td_xsig = 0,
   td_profil_addr = 0, td_profil_ticks = 0,
   td_base_pri = 182 '\u', td_priority = 182 '\u', td_pcb =
   0xb68 dcd10, td_state = TDS_INACTIVE,
   td_retval = {1, 29309280}, td_slpcallout = {c_links = {sle = {sle_next
   = 0x0},
   tqe = {tqe_next = 0x0,
   tqe_prev = 0xff001fac7d80}}, c_time = 55907602, c_arg =
   0xff0063 311260,
   c_func = 0x802e32a0 sleepq_timeout, c_mtx = 0x0, c_flags =
   16}, td _frame = 0xb68dcc40,
   td_kstack_obj = 0xff0087f93d20, td_kstack = 18446744072477315072,
   td_kstac k_pages = 4,
   td_altkstack_obj = 0x0, td_altkstack = 0, td_altkstack_pages = 0,
   td_critnest = 1, td_md = {
   md_spinlock_count = 1, md_saved_flags = 582}, td_sched =
   0xff0063311488}
 
  I'm not familiar with the internals of the thread and KSE life cycle
  here,
 
  so I think we'll need to look to those more familiar with this to
  understand what of two things may be going on:
 
  (1) Is the fact that td_ksegrp != NULL an invariant for a connected
  thread, and that kern_proc is relying on that but the thread code is
  failing to implement it safely?
 
  (2) Is td_ksegrp sometimes left legitimately as NULL as part of the
  thread life cycle, and that kern_proc incorrectly assumes that it is
  never NULL when hooked up to a thread.
 
  This suggests a possible work-around of simply testing td_ksegrp for NULL
  in kern_proc in order to avoid this, while attempting to resolve whether
  an invariant is violated (or incorrectly assumed), which might require
  some serious thinking and a solution that is non-trivial. Something like
  the following might work in the 

freebsd-5.4-stable panics

2005-09-30 Thread Antoine Pelisse
On 9/30/05, John Baldwin [EMAIL PROTECTED] wrote:

 On Friday 30 September 2005 05:24 am, Antoine Pelisse wrote:
  Hi Robert,
  I don't think your patch is correct, the total linked list can be broken

  while the lock is released, thus just passing the link may not be enough
  I have submitted a PR[1] for this a month ago but nobody took care of it
  yet Regards,
  Antoine Pelisse
 
  [1] http://www.freebsd.org/cgi/query-pr.cgi?pr=kern/84684

 I think this patch looks ok. Robert, can you get the original panic on
 this
 thread tested against this patch?

 I had a small program which could reproduce this panic in 10 seconds; it
was basically creating empty threads and calling kvm_getprocs() at the same
time. Anyway, the patch was able to stop the program from panicking.
The panic is also reproducible in RELENG_6 and HEAD IIRC.

 On 9/29/05, Robert Watson [EMAIL PROTECTED] wrote:
   On Thu, 29 Sep 2005, Rob Watt wrote:
On Thu, 29 Sep 2005, Robert Watson wrote:
Could you dump the contents of *td and *td-td_proc for me? I'm
 quite
interested to know what the value in td-td_proc-p_state is, among

  
   other
  
things. If I could also have you generate a dump of the KSE group
structures in td-td_proc-p_ksegrps and the threads in
td-td_proc-p_threads.
   
I've attached a file with many of the values you have asked for. We
looked at some of the threads referenced by td-td_proc-p_threads,
 but
we weren't sure we were walking the list correctly. Do you have any
tips
   
for walking those thread lists?
   
Could you tell me if the program named by p-p_comm is linked
 against
a threading library? If it's a custom app, you may already know,
 and
if not, you can run ldd on the application to see what it is linked
against.
   
The programs named by p-p_comm is linked against the pthreads
 library.
  
   This seems to be enough information to at least track this down a bit:
   td_ksegrp is NULL, rather than a corrupt value, which suggests that
 the
   thread is incompletely initialized. Other hints that this are the case
   are that td_critnest is 1 (as is set when it is allocated), and the
 state
   is TDS_INACTIVE. Some other fields are set though, such as td_oncpu,
   which is normally initialized to NOCPU.
  
(kgdb) p *td
$1 = {td_proc = 0xff004aa9f000, td_ksegrp = 0x0, td_plist =
{tqe_next = 0xff 00b4798000,
tqe_prev = 0xff00a97ae010}, td_kglist = {tqe_next =
0xff00b4798000,
tqe_prev = 0xff00a97ae020}, td_slpq = {tqe_next = 0x0, tqe_prev
= 0x ff001fac7c10}, td_lockq = {
tqe_next = 0xff00a97ae000, tqe_prev = 0xb6797a70},
td_runq = {tq e_next = 0x0,
tqe_prev = 0x80608180}, td_selq = {tqh_first = 0x0, tqh_last
= 0xfff fff00633112c0},
td_sleepqueue = 0xff00382b0400, td_turnstile =
 0xff00c1712900,
td_umtx q = 0xff00d1207080,
td_tid = 100253, td_flags = 16777216, td_inhibitors = 0, td_pflags =

128, td_d upfd = 0, td_wchan = 0x0,
td_wmesg = 0x0, td_lastcpu = 2 '\002', td_oncpu = 2 '\002',
td_owepreempt = 0 '\0', td_locks = 0,
td_blocked = 0x0, td_ithd = 0x0, td_lockname = 0x0, td_contested =
{lh_first =
0x0}, td_sleeplocks = 0x0,
td_intr_nesting_level = 0, td_pinned = 0, td_mailbox = 0x0, td_ucred
 =
0xf f00ad18f200,
td_standin = 0x0, td_upcall = 0x0, td_sticks = 0, td_uuticks = 0,
td_usticks =
0, td_intrval = 0,
td_oldsigmask = {__bits = {0, 0, 0, 0}}, td_sigmask = {__bits =
{4294967295, 4 294967295, 4294967295,
4294967295}}, td_siglist = {__bits = {0, 0, 0, 0}}, td_generation
= 14, td _sigstk = {ss_sp = 0x0,
ss_size = 0, ss_flags = 0}, td_kflags = 0, td_xsig = 0,
td_profil_addr = 0, td_profil_ticks = 0,
td_base_pri = 182 '\u', td_priority = 182 '\u', td_pcb =
0xb68 dcd10, td_state = TDS_INACTIVE,
td_retval = {1, 29309280}, td_slpcallout = {c_links = {sle =
 {sle_next
= 0x0},
tqe = {tqe_next = 0x0,
tqe_prev = 0xff001fac7d80}}, c_time = 55907602, c_arg =
0xff0063 311260,
c_func = 0x802e32a0 sleepq_timeout, c_mtx = 0x0, c_flags =
16}, td _frame = 0xb68dcc40,
td_kstack_obj = 0xff0087f93d20, td_kstack =
 18446744072477315072,
td_kstac k_pages = 4,
td_altkstack_obj = 0x0, td_altkstack = 0, td_altkstack_pages = 0,
td_critnest = 1, td_md = {
md_spinlock_count = 1, md_saved_flags = 582}, td_sched =
0xff0063311488}
  
   I'm not familiar with the internals of the thread and KSE life cycle
   here,
  
   so I think we'll need to look to those more familiar with this to
   understand what of two things may be going on:
  
   (1) Is the fact that td_ksegrp != NULL an invariant for a connected
   thread, and that kern_proc is relying on that but the thread code is
   failing to implement it safely?
  
   (2) Is td_ksegrp sometimes left legitimately as NULL as part of the
   thread 

Re: freebsd-5.4-stable panics

2005-09-30 Thread John Baldwin
On Friday 30 September 2005 11:25 am, Antoine Pelisse wrote:
 On 9/30/05, John Baldwin [EMAIL PROTECTED] wrote:
  On Friday 30 September 2005 05:24 am, Antoine Pelisse wrote:
   Hi Robert,
   I don't think your patch is correct, the total linked list can be
   broken
  
   while the lock is released, thus just passing the link may not be
   enough I have submitted a PR[1] for this a month ago but nobody took
   care of it yet Regards,
   Antoine Pelisse
  
   [1] http://www.freebsd.org/cgi/query-pr.cgi?pr=kern/84684
 
  I think this patch looks ok. Robert, can you get the original panic on
  this
  thread tested against this patch?

  I had a small program which could reproduce this panic in 10 seconds, it
 was basically creating empty threads and calling kvm_getprocs() in the same
 time. Anyway the patch was able to stop the program from panicing.
 The panic is also reproducible in RELENG_6 and HEAD IIRC.

It turns out that the sysctl buffer is already wired in one of the two cases 
that this function is called, so I moved the wiring up to the upper layer in 
the other case and cut out a bunch of the locking gymnastics as a result.  
Can you try this patch?

Index: kern_proc.c
===
RCS file: /usr/cvs/src/sys/kern/kern_proc.c,v
retrieving revision 1.231
diff -u -r1.231 kern_proc.c
--- kern_proc.c 27 Sep 2005 18:03:15 -  1.231
+++ kern_proc.c 30 Sep 2005 17:04:57 -
@@ -875,22 +875,16 @@
 
if (flags & KERN_PROC_NOTHREADS) {
fill_kinfo_proc(p, kinfo_proc);
-   PROC_UNLOCK(p);
error = SYSCTL_OUT(req, (caddr_t)kinfo_proc,
   sizeof(kinfo_proc));
-   PROC_LOCK(p);
} else {
-   _PHOLD(p);
FOREACH_THREAD_IN_PROC(p, td) {
fill_kinfo_thread(td, kinfo_proc);
-   PROC_UNLOCK(p);
error = SYSCTL_OUT(req, (caddr_t)kinfo_proc,
   sizeof(kinfo_proc));
-   PROC_LOCK(p);
if (error)
break;
}
-   _PRELE(p);
}
PROC_UNLOCK(p);
if (error)
@@ -932,6 +926,9 @@
if (oid_number == KERN_PROC_PID) {
if (namelen != 1) 
return (EINVAL);
+   error = sysctl_wire_old_buffer(req, 0);
+   if (error)
+   return (error); 
p = pfind((pid_t)name[0]);
if (!p)
return (ESRCH);

-- 
John Baldwin [EMAIL PROTECTED]http://www.FreeBSD.org/~jhb/
Power Users Use the Power to Serve  =  http://www.FreeBSD.org
___
freebsd-hackers@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-hackers
To unsubscribe, send any mail to [EMAIL PROTECTED]


Re: freebsd-5.4-stable panics

2005-09-29 Thread Rob Watt
Robert,

On Tue, 27 Sep 2005, Robert Watson wrote:

 Great.  As mentioned I'll be offline for about the next 48 hours, but back
 after then.  If we can get a nice clean crash out of this, would really be
 best.  If it's top panicking, it could well be due to a bug in the process
 monitoring code, in kern_proc.  We've run into bugs a few times there in
 the past, generally associated with threading or races in process
 creation/teardown, in which partially initialized (or torn down) processes
 are accessed by another thread and are in an unexpected state.

We re-compiled the kernel with 'options KDB_STOP_NMI', and were able to
get a much more full analysis of what was happening on the 6-BETA5 crash.

We crashed in top again, and it does look like we may have hit a kern_proc
bug.

in the attached file type3-core.txt you can see that it hits an exception
in:

0x802b897a is in fill_kinfo_thread
(/usr/src/sys/kern/kern_proc.c:736).
731 }
732
733 kg = td->td_ksegrp;
734
735 /* things in the KSE GROUP */
736 kp->ki_estcpu = kg->kg_estcpu;
737 kp->ki_slptime = kg->kg_slptime;
738 kp->ki_pri.pri_user = kg->kg_user_pri;
739 kp->ki_pri.pri_class = kg->kg_pri_class;
740
(kgdb) frame 8
#8  0x802b897a in fill_kinfo_thread (td=0xff0063311260,
kp=0xb62d8510)
at /usr/src/sys/kern/kern_proc.c:733
733 kg = td->td_ksegrp;
(kgdb) p kg->kg_estcpu
Cannot access memory at address 0x173
(kgdb) p td->td_ksegrp
$1 = (struct ksegrp *) 0x0
(kgdb) p kp->ki_estcpu
$2 = 0
(kgdb) p kg
$4 = (struct ksegrp *) 0x12b

it seems that kg is an invalid pointer.

We have started our tests again without running top.

Hope you have a great vacation.

-
Rob Watt

type3-core.txt
Description: Binary data
Copyright (c) 1992-2005 The FreeBSD Project.
Copyright (c) 1979, 1980, 1983, 1986, 1988, 1989, 1991, 1992, 1993, 1994
The Regents of the University of California. All rights reserved.
FreeBSD 6.0-BETA5 #1: Tue Sep 27 17:38:32 EDT 2005
[EMAIL PROTECTED]:/usr/obj/usr/src/sys/LOCAL-DEBUG-NMI
WARNING: WITNESS option enabled, expect reduced performance.
Timecounter i8254 frequency 1193182 Hz quality 0
CPU: Dual Core AMD Opteron(tm) Processor 275 (2190.05-MHz K8-class CPU)
  Origin = AuthenticAMD  Id = 0x20f12  Stepping = 2
  
Features=0x178bfbff<FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CLFLUSH,MMX,FXSR,SSE,SSE2,HTT>
  Features2=0x1<SSE3>
  AMD Features=0xe2500800<SYSCALL,NX,MMX+,<b25>,LM,3DNow+,3DNow>
  Hyperthreading: 2 logical CPUs
real memory  = 3942580224 (3759 MB)
avail memory = 3807399936 (3631 MB)
ACPI APIC Table: A M I  OEMAPIC 
FreeBSD/SMP: Multiprocessor System Detected: 4 CPUs
 cpu0 (BSP): APIC ID:  0
 cpu1 (AP): APIC ID:  1
 cpu2 (AP): APIC ID:  2
 cpu3 (AP): APIC ID:  3
MADT: Forcing active-low polarity and level trigger for SCI
ioapic0 Version 1.1 irqs 0-23 on motherboard
ioapic1 Version 1.1 irqs 24-27 on motherboard
ioapic2 Version 1.1 irqs 28-31 on motherboard
acpi0: A M I OEMRSDT on motherboard
acpi0: Power Button (fixed)
pci_link0: ACPI PCI Link LNKA irq 10 on acpi0
pci_link1: ACPI PCI Link LNKB irq 5 on acpi0
pci_link2: ACPI PCI Link LNKC irq 11 on acpi0
pci_link3: ACPI PCI Link LNKD irq 9 on acpi0
Timecounter ACPI-fast frequency 3579545 Hz quality 1000
acpi_timer0: 24-bit timer at 3.579545MHz port 0x1008-0x100b on acpi0
cpu0: ACPI CPU on acpi0
acpi_throttle0: ACPI CPU Throttling on cpu0
cpu1: ACPI CPU on acpi0
cpu2: ACPI CPU on acpi0
cpu3: ACPI CPU on acpi0
pcib0: ACPI Host-PCI bridge port 0xcf8-0xcff on acpi0
pci0: ACPI PCI bus on pcib0
pcib1: ACPI PCI-PCI bridge at device 6.0 on pci0
pci3: ACPI PCI bus on pcib1
ohci0: OHCI (generic) USB controller mem 0xfeafc000-0xfeafcfff irq 19 at 
device 0.0 on pci3
ohci0: [GIANT-LOCKED]
usb0: OHCI version 1.0, legacy support
usb0: OHCI (generic) USB controller on ohci0
usb0: USB revision 1.0
uhub0: AMD OHCI root hub, class 9/0, rev 1.00/1.00, addr 1
uhub0: 3 ports with 3 removable, self powered
ohci1: OHCI (generic) USB controller mem 0xfeafd000-0xfeafdfff irq 19 at 
device 0.1 on pci3
ohci1: [GIANT-LOCKED]
usb1: OHCI version 1.0, legacy support
usb1: OHCI (generic) USB controller on ohci1
usb1: USB revision 1.0
uhub1: AMD OHCI root hub, class 9/0, rev 1.00/1.00, addr 1
uhub1: 3 ports with 3 removable, self powered
pci3: display, VGA at device 6.0 (no driver attached)
fxp0: Intel 82551 Pro/100 Ethernet port 0xbc00-0xbc3f mem 
0xfeafb000-0xfeafbfff,0xfeaa-0xfeab irq 18 at device 8.0 on pci3
miibus0: MII bus on fxp0
inphy0: i82555 10/100 media interface on miibus0
inphy0:  10baseT, 10baseT-FDX, 100baseTX, 100baseTX-FDX, auto
fxp0: Ethernet address: 00:e0:81:31:89:1c
isab0: PCI-ISA bridge at device 7.0 on pci0
isa0: ISA bus on isab0
atapci0: AMD 8111 UDMA133 controller port 
0x1f0-0x1f7,0x3f6,0x170-0x177,0x376,0xffa0-0xffaf at device 7.1 on pci0
ata0: ATA channel 0 on atapci0
ata1: 

Re: freebsd-5.4-stable panics

2005-09-29 Thread Robert Watson


On Wed, 28 Sep 2005, Rob Watt wrote:

We re-compiled the kernel with 'options KDB_STOP_NMI', and were able to 
get a much more full analysis of what was happening on the 6-BETA5 
crash.


Great.

We crashed in top again, and it does look like we may have hit a 
kern_proc bug.


This sounds good, or at least, promising.

in the attached file type3-core.txt you can see that it hits an 
exception in:


0x802b897a is in fill_kinfo_thread
(/usr/src/sys/kern/kern_proc.c:736).
731 }
732
733 kg = td-td_ksegrp;
734
735 /* things in the KSE GROUP */
736 kp-ki_estcpu = kg-kg_estcpu;
737 kp-ki_slptime = kg-kg_slptime;
738 kp-ki_pri.pri_user = kg-kg_user_pri;
739 kp-ki_pri.pri_class = kg-kg_pri_class;
740
(kgdb) frame 8
#8  0x802b897a in fill_kinfo_thread (td=0xff0063311260,
kp=0xb62d8510)
   at /usr/src/sys/kern/kern_proc.c:733
733 kg = td-td_ksegrp;
(kgdb) p kg-kg_estcpu
Cannot access memory at address 0x173
(kgdb) p td-td_ksegrp
$1 = (struct ksegrp *) 0x0
(kgdb) p kp-ki_estcpu
$2 = 0
(kgdb) p kg
$4 = (struct ksegrp *) 0x12b

it seems that kg is an invalid pointer.


Could you dump the contents of *td and *td->td_proc for me?  I'm quite
interested to know what the value in td->td_proc->p_state is, among other
things.  Could I also have you generate a dump of the KSE group
structures in td->td_proc->p_ksegrps and the threads in
td->td_proc->p_threads?


Could you tell me if the program named by p->p_comm is linked against a
threading library?  If it's a custom app, you may already know, and if 
not, you can run ldd on the application to see what it is linked against.


Depending on how much time you have available, it might make sense for me 
to grab from you a copy of your source tree, compiled kernel with debug 
symbols, and core dump.



We have started our tests again without running top.

Hope you have a great vacation.


It was brief but very enjoyable, and quite disconnected :-).

Thanks,

Robert
___
freebsd-hackers@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-hackers
To unsubscribe, send any mail to [EMAIL PROTECTED]


Re: freebsd-5.4-stable panics

2005-09-29 Thread Robert Watson

On Thu, 29 Sep 2005, Rob Watt wrote:


On Thu, 29 Sep 2005, Robert Watson wrote:


Could you dump the contents of *td and *td-td_proc for me?  I'm quite
interested to know what the value in td-td_proc-p_state is, among other
things.  If I could also have you generate a dump of the KSE group
structures in td-td_proc-p_ksegrps and the threads in
td-td_proc-p_threads.


I've attached a file with many of the values you have asked for. We 
looked at some of the threads referenced by td-td_proc-p_threads, but 
we weren't sure we were walking the list correctly. Do you have any tips 
for walking those thread lists?


Could you tell me if the program named by p-p_comm is linked against a 
threading library?  If it's a custom app, you may already know, and if 
not, you can run ldd on the application to see what it is linked 
against.


The programs named by p-p_comm is linked against the pthreads library.


This seems to be enough information to at least track this down a bit: 
td_ksegrp is NULL, rather than a corrupt value, which suggests that the 
thread is incompletely initialized.  Other hints that this is the case
are that td_critnest is 1 (as is set when it is allocated), and the state 
is TDS_INACTIVE.  Some other fields are set though, such as td_oncpu, 
which is normally initialized to NOCPU.



(kgdb) p *td
$1 = {td_proc = 0xff004aa9f000, td_ksegrp = 0x0, td_plist = 
{tqe_next = 0xff 00b4798000,
tqe_prev = 0xff00a97ae010}, td_kglist = {tqe_next = 
0xff00b4798000,
tqe_prev = 0xff00a97ae020}, td_slpq = {tqe_next = 0x0, tqe_prev 
= 0x ff001fac7c10}, td_lockq = {
tqe_next = 0xff00a97ae000, tqe_prev = 0xb6797a70}, 
td_runq = {tq e_next = 0x0,
tqe_prev = 0x80608180}, td_selq = {tqh_first = 0x0, tqh_last 
= 0xfff fff00633112c0},
  td_sleepqueue = 0xff00382b0400, td_turnstile = 0xff00c1712900, 
td_umtx q = 0xff00d1207080,
  td_tid = 100253, td_flags = 16777216, td_inhibitors = 0, td_pflags = 
128, td_d upfd = 0, td_wchan = 0x0,
  td_wmesg = 0x0, td_lastcpu = 2 '\002', td_oncpu = 2 '\002', 
td_owepreempt = 0 '\0', td_locks = 0,
  td_blocked = 0x0, td_ithd = 0x0, td_lockname = 0x0, td_contested = 
{lh_first =

 0x0}, td_sleeplocks = 0x0,
  td_intr_nesting_level = 0, td_pinned = 0, td_mailbox = 0x0, td_ucred = 
0xf f00ad18f200,
  td_standin = 0x0, td_upcall = 0x0, td_sticks = 0, td_uuticks = 0, 
td_usticks =

 0, td_intrval = 0,
  td_oldsigmask = {__bits = {0, 0, 0, 0}}, td_sigmask = {__bits = 
{4294967295, 4 294967295, 4294967295,
  4294967295}}, td_siglist = {__bits = {0, 0, 0, 0}}, td_generation 
= 14, td _sigstk = {ss_sp = 0x0,
ss_size = 0, ss_flags = 0}, td_kflags = 0, td_xsig = 0, 
td_profil_addr = 0, td_profil_ticks = 0,
  td_base_pri = 182 '\u', td_priority = 182 '\u', td_pcb = 
0xb68 dcd10, td_state = TDS_INACTIVE,
  td_retval = {1, 29309280}, td_slpcallout = {c_links = {sle = {sle_next 
= 0x0},

 tqe = {tqe_next = 0x0,
tqe_prev = 0xff001fac7d80}}, c_time = 55907602, c_arg = 
0xff0063 311260,
c_func = 0x802e32a0 sleepq_timeout, c_mtx = 0x0, c_flags = 
16}, td _frame = 0xb68dcc40,
  td_kstack_obj = 0xff0087f93d20, td_kstack = 18446744072477315072, 
td_kstac k_pages = 4,
  td_altkstack_obj = 0x0, td_altkstack = 0, td_altkstack_pages = 0, 
td_critnest = 1, td_md = {
md_spinlock_count = 1, md_saved_flags = 582}, td_sched = 
0xff0063311488}


I'm not familiar with the internals of the thread and KSE life cycle here, 
so I think we'll need to look to those more familiar with this to 
understand which of two things may be going on:


(1) Is the fact that td_ksegrp != NULL an invariant for a connected
thread, and that kern_proc is relying on that but the thread code is
failing to implement it safely?

(2) Is td_ksegrp sometimes left legitimately as NULL as part of the thread
life cycle, and that kern_proc incorrectly assumes that it is never
NULL when hooked up to a thread.

This suggests a possible work-around of simply testing td_ksegrp for NULL 
in kern_proc in order to avoid this, while attempting to resolve whether 
an invariant is violated (or incorrectly assumed), which might require 
some serious thinking and a solution that is non-trivial.  Something like 
the following might work in the mean time:


Index: kern_proc.c
===
RCS file: /home/ncvs/src/sys/kern/kern_proc.c,v
retrieving revision 1.231
diff -u -r1.231 kern_proc.c
--- kern_proc.c 27 Sep 2005 18:03:15 -  1.231
+++ kern_proc.c 29 Sep 2005 20:50:33 -
@@ -882,6 +882,8 @@
} else {
_PHOLD(p);
FOREACH_THREAD_IN_PROC(p, td) {
+   if (td-td_ksegrp == NULL)
+   continue;
fill_kinfo_thread(td, kinfo_proc);
PROC_UNLOCK(p);
error = SYSCTL_OUT(req, 

Re: freebsd-5.4-stable panics

2005-09-28 Thread Rob Watt
On 9/27/05, Robert Watson [EMAIL PROTECTED] wrote:

 On Tue, 27 Sep 2005, Rob Watt wrote:


 Is this an SMP box?  If so, could you try compiling options KDB_STOP_NMI
 into your kernel -- you'll also need to set debug.kdb.stop_cpus_with_nmi=1
 in either loader.conf or at runtime with sysctls.

This is a dual-core dual 275 processor box. I have compiled the nmi
options into the kernel and we are now using that to test.


 The trap information you've provided indicates that it is likely a data
 NULL pointer dereference in the kernel (faulting address is a small
 increment above NULL).  The instruction pointer looks valid -- if you have
 a debugging copy of the kernel, could you load it into gdb and show me
 what line number / piece of code it's in?  you can use l
 *803b88ca to generate that, even without a live debugger session


this is the piece of code that was referenced by the ip:

(gdb) l *0x803b88ca
0x803b88ca is in nfsrv_lookup (/usr/src/sys/nfsserver/nfs_serv.c:670).
665 NFSD_UNLOCK();
666 mtx_lock(&Giant);   /* VFS */
667 if (dirp)
668 vrele(dirp);
669 NDFREE(&nd, NDF_ONLY_PNBUF);
670 if (ndp->ni_startdir)
671 vrele(ndp->ni_startdir);
672 if (ndp->ni_vp)
673 vput(ndp->ni_vp);
674 mtx_unlock(&Giant); /* VFS */

we are not running nfsd (although we do use nfs and nfsiod), and none
of our processes should have been accessing nfs. Our processes are run
from an nfs mount but do not access any nfs mounted files.



 Do you have a testbed or set of test hosts set up so you can
 non-disruptively test change sets, btw?


yes we have 3 dual dual-core machines and 1 dual single-core machine
that we can use to test with.

Thanks!

-
Rob Watt
___
freebsd-hackers@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-hackers
To unsubscribe, send any mail to [EMAIL PROTECTED]


Re: freebsd-5.4-stable panics

2005-09-28 Thread Rob Watt
On Sun, 25 Sep 2005, Robert Watson wrote:


 On Fri, 23 Sep 2005, Jason Carroll wrote:
 5B
  There seem to be 2 types of crashes we see with pretty different stack
  traces.  What I'll call a type 1 crash, I believe, is often caused by
  one of the triggers I mention above.  A type 2 crash appears to happen
  spontaneously after the machine has been running for a while.
 
  I poked around using kgdb in a core file from a type 2 crash, and it
  appeared the system hung closing sockets (specifically cleaning up
  multicast state i think) while cleaning up one of our multicast
  applications (note the trace through sys_exit).  There's no reason this
  application should have been exiting unless it encountered some kind of
  error.

 Sounds nasty.  It's possible the two panics are related, especially if
 they involve a race in the multicast code, which could result in treading
 on other kernel memory, potentially leading to the thread related panic.
 My leaning would be that they are unrelated, but since we may be able to
 eliminate the multicast one (see below), that would be a good starting
 point.

 There are some other known stability nits in 6.x which are being worked
 on, but in general the network stack stability is higher in 6.x than 5.x
 when it comes to multicast due to the work I reference above.  If you run
 into any stability problems relating to the file system, set
 debug.mpsafevfs=0 in loader.conf -- there are a few bug fixes relating to
 running out of disk space or hitting quota limits that are fixed in HEAD,
 but not yet backported to 6.x.

Robert,

Thanks for your quick response and suggestions. We have now experienced
an additional type of crash. Type 3 is from 6.0-BETA5, it did not enter
the debugger at all and we could not generate a core.

Unfortunately the 6-BETA crash was completely different from everything
we've seen so far. The panic was related to a page fault and 'top' was the
active process. We are trying again to run our tests on 6.0, but if we
keep encountering other bugs, then those other bugs may prevent us
from determining if multicast is the problem.

We also ran our applications in 5-STABLE without reading from or writing
to disk (ie we ran the multicast data streams on a remote machine, and we
told our listener/rebroadcaster apps not to write to disk). In this
configuration we were able to run for 4 days without crashing. A few
hours before the crash we had introduced disk activity (bonnie
in a constant loop with 1G test file size). This crash was a type 1,
and we were not able to save a core. The longest we had gone before
without a crash was 6 hours, so it is possible that either load, or disk
activity help trigger the bugs we have seen.

files attached:
kernel-conf.txt (6.0 kernel)
type3-core.txt (copy of panic output to console)

We will update you with more info from our 6.0 tests when we have it.

We are in a bind right now. All modern hardware (i.e. EM64T/amd64) only
seems to work with versions of freebsd that aren't stable when running our
applications. Many vendors do not even sell server hardware that is purely
i386. We never encountered these types of problems on freebsd 4.x, and
many of our 120+ i386 class machines that are running 4.x are showing
their age and need to be replaced. Assuming that the problems we are
experiencing are purely related to the OS, we now don't have an OS to run
on the newer hardware we've been buying. We really need to find a way to
patch these problems or find a version of freebsd that supports our
platform and is stable. Obviously we appreciate the hard work that all of
you on the freebsd team do, and we are happy to do whatever we can to help
squash these bugs.

-
Rob Watt

#
# GENERIC -- Generic kernel configuration file for FreeBSD/amd64
#
# For more information on this file, please read the handbook section on
# Kernel Configuration Files:
#
#    http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html
#
# The handbook is also available locally in /usr/share/doc/handbook
# if you've installed the doc distribution, otherwise always see the
# FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the
# latest information.
#
# An exhaustive list of options and more detailed explanations of the
# device lines is also present in the ../../conf/NOTES and NOTES files.
# If you are in doubt as to the purpose or necessity of a line, check first
# in NOTES.
#
# $FreeBSD: src/sys/amd64/conf/GENERIC,v 1.421.2.11.2.1 2005/04/09 17:28:37 kensmith Exp $

machine amd64
cpu HAMMER
ident   CUSTOM

# To statically compile in device wiring instead of /boot/device.hints
#hints  GENERIC.hints # Default places to look for devices.
makeoptions DEBUG=-g
options KDB
options DDB
options BREAK_TO_DEBUGGER
options INVARIANTS
options INVARIANT_SUPPORT
options WITNESS
options WITNESS_SKIPSPIN
#makeoptions COPTFLAGS=-O 

Re: freebsd-5.4-stable panics

2005-09-27 Thread Robert Watson


On Tue, 27 Sep 2005, Rob Watt wrote:

Thanks for your quick response and suggestions. We have now experienced 
an additional type of crash. Type 3 is from 6.0-BETA5, it did not enter 
the debugger at all and we could not generate a core.


Is this an SMP box?  If so, could you try compiling options KDB_STOP_NMI 
into your kernel -- you'll also need to set debug.kdb.stop_cpus_with_nmi=1 
in either loader.conf or at runtime with sysctls.  This will probably 
become the default at some point -- in the mean time, the default when 
entering the debugger on one CPU is to generate an IPI to the other CPUs 
telling them to go into the debugger.  This works fine unless the CPU has 
interrupts disabled, such as if it's holding a spin lock in the scheduler, 
in which case the system will deadlock because that CPU won't acknowledge 
the IPI.  With the above option, a non-maskable interrupt is used to 
signal the other CPUs into the debugger, which gets into the debugger much 
more reliably.


The trap information you've provided indicates that it is likely a data 
NULL pointer dereference in the kernel (faulting address is a small 
increment above NULL).  The instruction pointer looks valid -- if you have 
a debugging copy of the kernel, could you load it into gdb and show me 
what line number / piece of code it's in?  You can use 
"l *0x803b88ca" to generate that, even without a live debugger session 
or core.  If you can get into DDB with the above, generally good starting 
point debugging information (ideally gathered with a serial console) is:


  trace # current thread trace
  show pcpu # current CPU data
  show pcpu 0   # CPU 0 data
  show pcpu 1   # CPU 1 data
  ...   # Any other CPUs
  ps                # process listing
  show lockedvnods  # VFS locking information

If you have WITNESS compiled in, also:

  show alllocks

Unfortunately the 6-BETA crash was completely different from everything 
we've seen so far. The panic was related to a page fault and 'top' was 
the active process. We are trying again to run our tests on 6.0, but if 
we keep encountering other bugs, then those other bugs may prevent us 
from determining if multicast is the problem.


Let's see if we can get whatever this first bug you're hitting is fixed 
and see if we can get to the next original problems.


We also ran our applications in 5-STABLE without reading from or writing 
to disk (ie we ran the multicast data streams on a remote machine, and 
we told our listener/rebroadcaster apps not to write to disk). In this 
configuration we were able to run for 4 days without crashing. A few 
hours before the crash we had introduced disk activity (bonnie in a 
constant loop with 1G test file size). This crash was a type 1, and we 
were not able to save a core. The longest we had gone before without a 
crash was 6 hours, so it is possible that either load, or disk activity 
help trigger the bugs we have seen.


I'm heading off on a vacation for two days, and will be offline for that 
period, but if we can't easily get through solving 6.x problems on the 
host, I can backport a subset of the multicast fixes to 5.x and we can see 
if that fixes things up.  It may make sense to do this anyway, but I may 
not have an opportunity to go through the development and testing on that 
until after 6.0 is out the door.



files attached:
kernel-conf.txt (6.0 kernel)
type3-core.txt (copy of panic output to console)

We will update you with more info from our 6.0 tests when we have it.

We are in a bind right now. All modern hardware (ie emt64/amd64) only 
seems to work with versions of freebsd that aren't stable when running 
our applications. Many vendors do not even sell server hardware that is 
purely i386. We never encountered these types of problems on freebsd 
4.x, and many of our 120+ i386 class machines that are running 4.x are 
showing their age and need to be replaced. Assuming that the problems we 
are experiencing are purely related to the OS, we now don't have an OS 
to run on the newer hardware we've been buying. We really need to find a 
way to patch these problems or find a version of freebsd that supports 
our platform and is stable. Obviously we appreciate the hard work that 
all of you on the freebsd team do, and we are happy to do whatever we 
can to help squash these bugs.


Hopefully we can get this fixed up as soon as possible.

Do you have a testbed or set of test hosts set up so you can 
non-disruptively test change sets, btw?


Thanks,

Robert N M Watson
___
freebsd-hackers@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-hackers
To unsubscribe, send any mail to [EMAIL PROTECTED]


Re: freebsd-5.4-stable panics

2005-09-27 Thread Robert Watson

On Tue, 27 Sep 2005, Rob Watt wrote:


this is the piece of code that was referenced by the ip:

(gdb) l *0x803b88ca
0x803b88ca is in nfsrv_lookup (/usr/src/sys/nfsserver/nfs_serv.c:670).
665 NFSD_UNLOCK();
666 mtx_lock(&Giant);   /* VFS */
667 if (dirp)
668 vrele(dirp);
669 NDFREE(&nd, NDF_ONLY_PNBUF);
670 if (ndp->ni_startdir)
671 vrele(ndp->ni_startdir);
672 if (ndp->ni_vp)
673 vput(ndp->ni_vp);
674 mtx_unlock(&Giant); /* VFS */

we are not running nfsd (although we do use nfs and nfsiod), and none of 
our processes should have been accessing nfs. Our processes are run from 
an nfs mount but do not access any nfs mounted files.


That code is in the NFS server lookup code, so should be called as a 
result of a lookup by a remote client.  If the NFS server is not in use on 
the machine, this is most likely a quirk of gdb and instruction 
pointers, a run-time kernel/compile-time kernel mismatch, or something 
really nasty.  ndp should really never be NULL there, as it's used 
frequently prior to that point.  Let's hope for one of the former few 
options.


Do you have a testbed or set of test hosts set up so you can 
non-disruptively test change sets, btw?


yes we have 3 dual dual-core machines and 1 dual single-core machine 
that we can use to test with.


Great.  As mentioned I'll be offline for about the next 48 hours, but back 
after then.  If we can get a nice clean crash out of this, that would really be 
best.  If it's top panicking, it could well be due to a bug in the process 
monitoring code, in kern_proc.  We've run into bugs a few times there in 
the past, generally associated with threading or races in process 
creation/teardown, in which partially initialized (or torn down) processes 
are accessed by another thread and are in an unexpected state.


Thanks,

Robert N M Watson
___
freebsd-hackers@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-hackers
To unsubscribe, send any mail to [EMAIL PROTECTED]


Re: freebsd-5.4-stable panics

2005-09-25 Thread Robert Watson


On Fri, 23 Sep 2005, Jason Carroll wrote:

There seem to be 2 types of crashes we see with pretty different stack 
traces.  What I'll call a type 1 crash, I believe, is often caused by 
one of the triggers I mention above.  A type 2 crash appears to happen 
spontaneously after the machine has been running for a while.


I poked around using kgdb in a core file from a type 2 crash, and it 
appeared the system hung closing sockets (specifically cleaning up 
multicast state i think) while cleaning up one of our multicast 
applications (note the trace through sys_exit).  There's no reason this 
application should have been exiting unless it encountered some kind of 
error.


I'm attaching:
dmesg.txt
kernel-conf.txt (kernel config file)
type1-core.txt (a kgdb bt from a type1/triggered crash)
type2-core.txt (a kgdb bt from a type2/spontaneous crash)

I'm happy to dig for more information, recompile with different options, 
apply patches, or do anything else that might help get this problem 
diagnosed and fixed!


Hi there Jason!

Sounds nasty.  It's possible the two panics are related, especially if 
they involve a race in the multicast code, which could result in treading 
on other kernel memory, potentially leading to the thread related panic. 
My leaning would be that they are unrelated, but since we may be able to 
eliminate the multicast one (see below), that would be a good starting 
point.


In the 6.x branch, quite a bit of work has been done to improve locking in 
the multicast code, and several important races have been fixed relating 
to IP multicast.  These races tended to turn up on the following sorts of 
situations:


(1) Multi-threaded applications changing the multicast properties, such
as membership, of a particular socket in parallel.

(2) Changes to multicast membership during high multicast I/O load on the
socket.  For example, adding or deleting multicast groups on a socket on
CPU 0 while a packet is delivered to the same socket on CPU 1.

(3) Removal of real or synthetic interfaces involved in active multicast,
such as removal of pccards, vlans, etc during multicast I/O, or with
sockets bound to the interfaces.

These changes are not currently scheduled for a backport to 5.x, because 
they change the kernel network device driver API and ABI, requiring 
changes to and recompiling of third party device drivers.  A subset could 
be backported, subject to some limitations, but it would be good to 
confirm whether these changes actually affect the problems you're seeing 
before working through that. All the changes should appear in the most 
recent snapshot, BETA5.  Make sure to turn off extra kernel debugging 
features, such as WITNESS, INVARIANTS, and user space malloc debugging, if 
you start running into performance problems -- they have a big performance 
impact, although can be quite helpful in testing.  Normally we turn these 
off during the release candidate portion of the release cycle.


There are some other known stability nits in 6.x which are being worked 
on, but in general the network stack stability is higher in 6.x than 5.x 
when it comes to multicast due to the work I reference above.  If you run 
into any stability problems relating to the file system, set 
debug.mpsafevfs=0 in loader.conf -- there are a few bug fixes relating to 
running out of disk space or hitting quota limits that are fixed in HEAD, 
but not yet backported to 6.x.


Thanks,

Robert N M Watson
___
freebsd-hackers@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-hackers
To unsubscribe, send any mail to [EMAIL PROTECTED]


freebsd-5.4-stable panics

2005-09-23 Thread Jason Carroll
Hi--

I've been working on setting up a dual cpu, dual-core Opteron 275 with
freebsd-5.4-stable, but have been getting panics and reboots fairly
consistently.   I think the problem I'm seeing might be related to
this discussion:

http://groups.google.com/group/lucky.freebsd.current/browse_thread/thread/6abaddffadebfdfe/f251a4874c2be3b1?lnk=st&q=freebsd+kernel+%22trap+9%22+closef&rnum=3&hl=en#f251a4874c2be3b1

but I can't be sure.

I have several applications (on the order of 10) that each receive and
send multicast data (each listens to 6-12 multicast streams and
broadcasts 1).  They also log to disk the data they broadcast.  These
applications each join all the groups they listen to at startup, and
never explicitly leave these groups.  These applications process
500-5000 packets per second between them in our environment.  The
machine usually panics after these applications have been up and
running for 30 min to 6 hours.  Several times the panic/reboot seems
to have been triggered by an independent operation from these
applications (copying a large file off the machine or moving a
directory that contained the log files)

After the first few panics, we rebuilt the kernel with trace and debug
options and have saved a few core files.

There seem to be 2 types of crashes we see with pretty different stack
traces.  What I'll call a type 1 crash, I believe, is often caused by
one of the triggers I mention above.  A type 2 crash appears to happen
spontaneously after the machine has been running for a while.

I poked around using kgdb in a core file from a type 2 crash, and it
appeared the system hung closing sockets (specifically cleaning up
multicast state i think) while cleaning up one of our multicast
applications (note the trace through sys_exit).  There's no reason
this application should have been exiting unless it encountered some
kind of error.

I'm attaching:
dmesg.txt
kernel-conf.txt (kernel config file)
type1-core.txt (a kgdb bt from a type1/triggered crash)
type2-core.txt (a kgdb bt from a type2/spontaneous crash)

I'm happy to dig for more information, recompile with different
options, apply patches, or do anything else that might help get this
problem diagnosed and fixed!

Thanks,
Jason Carroll
Copyright (c) 1992-2005 The FreeBSD Project.
Copyright (c) 1979, 1980, 1983, 1986, 1988, 1989, 1991, 1992, 1993, 1994
The Regents of the University of California. All rights reserved.
FreeBSD 5.4-STABLE #1: Wed Sep 21 16:25:57 EDT 2005
[EMAIL PROTECTED]:/usr/obj/usr/src/sys/LOCAL-DEBUG
WARNING: WITNESS option enabled, expect reduced performance.
Timecounter i8254 frequency 1193182 Hz quality 0
CPU: Dual Core AMD Opteron(tm) Processor 275 (2190.66-MHz K8-class CPU)
  Origin = AuthenticAMD  Id = 0x20f12  Stepping = 2
  
  Features=0x178bfbff<FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CLFLUSH,MMX,FXSR,SSE,SSE2,HTT>
  Features2=0x1<SSE3>
  AMD Features=0xe2500800<SYSCALL,NX,MMX+,b25,LM,3DNow+,3DNow>
  Hyperthreading: 2 logical CPUs
real memory  = 3942580224 (3759 MB)
avail memory = 3805609984 (3629 MB)
ACPI APIC Table: A M I  OEMAPIC 
FreeBSD/SMP: Multiprocessor System Detected: 4 CPUs
 cpu0 (BSP): APIC ID:  0
 cpu1 (AP): APIC ID:  1
 cpu2 (AP): APIC ID:  2
 cpu3 (AP): APIC ID:  3
MADT: Forcing active-low polarity and level trigger for SCI
ioapic0 Version 1.1 irqs 0-23 on motherboard
ioapic1 Version 1.1 irqs 24-27 on motherboard
ioapic2 Version 1.1 irqs 28-31 on motherboard
acpi0: A M I OEMRSDT on motherboard
acpi0: Power Button (fixed)
Timecounter ACPI-fast frequency 3579545 Hz quality 1000
acpi_timer0: 24-bit timer at 3.579545MHz port 0x1008-0x100b on acpi0
cpu0: ACPI CPU on acpi0
acpi_throttle0: ACPI CPU Throttling on cpu0
cpu1: ACPI CPU on acpi0
cpu2: ACPI CPU on acpi0
cpu3: ACPI CPU on acpi0
pcib0: ACPI Host-PCI bridge port 0xcf8-0xcff on acpi0
pci0: ACPI PCI bus on pcib0
pcib1: ACPI PCI-PCI bridge at device 6.0 on pci0
pci3: ACPI PCI bus on pcib1
ohci0: OHCI (generic) USB controller mem 0xfeafc000-0xfeafcfff irq 19 at device 0.0 on pci3
usb0: OHCI version 1.0, legacy support
usb0: OHCI (generic) USB controller on ohci0
usb0: USB revision 1.0
uhub0: AMD OHCI root hub, class 9/0, rev 1.00/1.00, addr 1
uhub0: 3 ports with 3 removable, self powered
ohci1: OHCI (generic) USB controller mem 0xfeafd000-0xfeafdfff irq 19 at device 0.1 on pci3
usb1: OHCI version 1.0, legacy support
usb1: OHCI (generic) USB controller on ohci1
usb1: USB revision 1.0
uhub1: AMD OHCI root hub, class 9/0, rev 1.00/1.00, addr 1
uhub1: 3 ports with 3 removable, self powered
pci3: display, VGA at device 6.0 (no driver attached)
fxp0: Intel 82551 Pro/100 Ethernet port 0xbc00-0xbc3f mem 0xfeaa-0xfeab,0xfeafb000-0xfeafbfff irq 18 at device 8.0 on pci3
miibus0: MII bus on fxp0
inphy0: i82555 10/100 media interface on miibus0
inphy0:  10baseT, 10baseT-FDX, 100baseTX, 100baseTX-FDX, auto
fxp0: Ethernet address: 00:e0:81:31:89:1b
isab0: PCI-ISA bridge at device 7.0 on pci0
isa0: ISA bus on isab0
atapci0: