TLS bug?

2011-06-16 Thread Nathaniel W Filardo
I have a few applications (bonnie++ and mysql, specifically, both from
ports) which trip over the assertion in
lib/libc/stdlib/malloc.c:/^_malloc_thread_cleanup that
   assert(tcache != (void *)(uintptr_t)1);

I have patched malloc.c thus:

 --- a/lib/libc/stdlib/malloc.c
 +++ b/lib/libc/stdlib/malloc.c
 @@ -1108,7 +1108,7 @@ static __thread arena_t   *arenas_map TLS_MODEL;
  
  #ifdef MALLOC_TCACHE
  /* Map of thread-specific caches. */
 -static __thread tcache_t   *tcache_tls TLS_MODEL;
 +__thread tcache_t  *tcache_tls TLS_MODEL;
  
  /*
   * Number of cache slots for each bin in the thread cache, or 0 if tcache
   * is
 @@ -6184,10 +6184,17 @@ _malloc_thread_cleanup(void)
  #ifdef MALLOC_TCACHE
 tcache_t *tcache = tcache_tls;
  
 +fprintf(stderr, _m_t_c for %d:%lu with %p\n, 
 +   getpid(),
 +   (unsigned long) _pthread_self(),
 +   tcache);
 +
 if (tcache != NULL) {
 -   assert(tcache != (void *)(uintptr_t)1);
 -   tcache_destroy(tcache);
 -   tcache_tls = (void *)(uintptr_t)1;
 +   /* assert(tcache != (void *)(uintptr_t)1); */
 +   if((uintptr_t)tcache != (uintptr_t)1) {
 +   tcache_destroy(tcache);
 +   tcache_tls = (void *)(uintptr_t)1;
 +   }

and libthr/thread/thr_create.c thus:

 --- a/lib/libthr/thread/thr_create.c
 +++ b/lib/libthr/thread/thr_create.c
 @@ -243,6 +243,8 @@ create_stack(struct pthread_attr *pattr)
 return (ret);
  }
  
 +extern __thread void *tcache_tls;
 +
  static void
  thread_start(struct pthread *curthread)
  {
 @@ -280,6 +282,11 @@ thread_start(struct pthread *curthread)
 curthread-attr.stacksize_attr;
  #endif
  
 +fprintf(stderr, t_s for %d:%lu with %p\n,
 +getpid(),
 +(unsigned long) _pthread_self(),
 +tcache_tls);
 +
 /* Run the current thread's start routine with argument: */
 _pthread_exit(curthread-start_routine(curthread-arg));
  

to attempt to debug this issue.  With those changes in place, bonnie++'s
execution looks like this:

[...]
 Writing a byte at a time...done
 Writing intelligently...done
 Rewriting...done
 Reading a byte at a time...done
 Reading intelligently...done
 t_s for 79654:1086343168 with 0x0
 t_s for 79654:1086345216 with 0x0
 t_s for 79654:1086346240 with 0x0
 t_s for 79654:1086347264 with 0x0
 t_s for 79654:1086344192 with 0x0
 start 'em...done...done...done...done..._m_t_c for 79654:1086344192 with
 0x41404400
 _m_t_c for 79654:1086346240 with 0x40d2c400
 _m_t_c for 79654:1086343168 with 0x41404200
 _m_t_c for 79654:1086345216 with 0x41804200
 done...
 _m_t_c for 79654:1086347264 with 0x41004200
 Create files in sequential order...done.
 Stat files in sequential order...done.
 Delete files in sequential order...done.
 Create files in random order...done.
 Stat files in random order...done.
 Delete files in random order...done.
 1.96,1.96,hydra.priv.oc.ietfng.org,1,1308217772,10M,,7,81,2644,7,3577,14,34,93,+,+++,773.7,61,16,,,
 ,,2325,74,13016,99,2342,86,3019,91,11888,99,2184,89,16397ms,1237ms,671ms,2009ms,177us,1305ms,489ms,1029
 us,270ms,140ms,53730us,250ms
 Writing a byte at a time...done
 Writing intelligently...done
 Rewriting...done
 Reading a byte at a time...done
 Reading intelligently...done
 t_s for 79654:1086343168 with 0x1
 t_s for 79654:1086346240 with 0x1
 t_s for 79654:1086345216 with 0x1
 t_s for 79654:1086347264 with 0x1
 t_s for 79654:1086344192 with 0x1
 start 'em...done...done...done...done...done...
 _m_t_c for 79654:1086347264 with 0x1
 _m_t_c for 79654:1086344192 with 0x1
 _m_t_c for 79654:1086343168 with 0x1
[...]

So what seems to be happening is that the TLS area is being set up
incorrectly, eventually: rather than zeroing the tcache_tls value, it is
being set to 1, which means no tcache is ever allocated, so when we get
around to exiting, the assert trips.

Unfortunately, setting a breakpoint on __libc_allocate_tls seems to do bad
things to the kernel (inducing a SIR without any panic message).  I am
somewhat at a loss; help?

Thanks in advance!
--nwf;


pgpyJIoqnx75o.pgp
Description: PGP signature


Re: TLS bug?

2011-06-16 Thread Nathaniel W Filardo
Atcht; it's late.  I forgot to mention that this system is a sparc64 V240
2-way SMP machine.  It's running a kernel from 9.0-CURRENT r222833+262af52:
Tue Jun  7 18:47:35 EDT 2011 and a userland from a little later.

Sorry about that.
--nwf;

On Thu, Jun 16, 2011 at 03:31:38AM -0400, Nathaniel W Filardo wrote:
 I have a few applications (bonnie++ and mysql, specifically, both from
 ports) which trip over the assertion in
 lib/libc/stdlib/malloc.c:/^_malloc_thread_cleanup that
assert(tcache != (void *)(uintptr_t)1);
 
 I have patched malloc.c thus:
 
  --- a/lib/libc/stdlib/malloc.c
  +++ b/lib/libc/stdlib/malloc.c
  @@ -1108,7 +1108,7 @@ static __thread arena_t   *arenas_map 
  TLS_MODEL;
   
   #ifdef MALLOC_TCACHE
   /* Map of thread-specific caches. */
  -static __thread tcache_t   *tcache_tls TLS_MODEL;
  +__thread tcache_t  *tcache_tls TLS_MODEL;
   
   /*
* Number of cache slots for each bin in the thread cache, or 0 if tcache
* is
  @@ -6184,10 +6184,17 @@ _malloc_thread_cleanup(void)
   #ifdef MALLOC_TCACHE
  tcache_t *tcache = tcache_tls;
   
  +fprintf(stderr, _m_t_c for %d:%lu with %p\n, 
  +   getpid(),
  +   (unsigned long) _pthread_self(),
  +   tcache);
  +
  if (tcache != NULL) {
  -   assert(tcache != (void *)(uintptr_t)1);
  -   tcache_destroy(tcache);
  -   tcache_tls = (void *)(uintptr_t)1;
  +   /* assert(tcache != (void *)(uintptr_t)1); */
  +   if((uintptr_t)tcache != (uintptr_t)1) {
  +   tcache_destroy(tcache);
  +   tcache_tls = (void *)(uintptr_t)1;
  +   }
 
 and libthr/thread/thr_create.c thus:
 
  --- a/lib/libthr/thread/thr_create.c
  +++ b/lib/libthr/thread/thr_create.c
  @@ -243,6 +243,8 @@ create_stack(struct pthread_attr *pattr)
  return (ret);
   }
   
  +extern __thread void *tcache_tls;
  +
   static void
   thread_start(struct pthread *curthread)
   {
  @@ -280,6 +282,11 @@ thread_start(struct pthread *curthread)
  curthread-attr.stacksize_attr;
   #endif
   
  +fprintf(stderr, t_s for %d:%lu with %p\n,
  +getpid(),
  +(unsigned long) _pthread_self(),
  +tcache_tls);
  +
  /* Run the current thread's start routine with argument: */
  _pthread_exit(curthread-start_routine(curthread-arg));
   
 
 to attempt to debug this issue.  With those changes in place, bonnie++'s
 execution looks like this:
 
 [...]
  Writing a byte at a time...done
  Writing intelligently...done
  Rewriting...done
  Reading a byte at a time...done
  Reading intelligently...done
  t_s for 79654:1086343168 with 0x0
  t_s for 79654:1086345216 with 0x0
  t_s for 79654:1086346240 with 0x0
  t_s for 79654:1086347264 with 0x0
  t_s for 79654:1086344192 with 0x0
  start 'em...done...done...done...done..._m_t_c for 79654:1086344192 with
  0x41404400
  _m_t_c for 79654:1086346240 with 0x40d2c400
  _m_t_c for 79654:1086343168 with 0x41404200
  _m_t_c for 79654:1086345216 with 0x41804200
  done...
  _m_t_c for 79654:1086347264 with 0x41004200
  Create files in sequential order...done.
  Stat files in sequential order...done.
  Delete files in sequential order...done.
  Create files in random order...done.
  Stat files in random order...done.
  Delete files in random order...done.
  1.96,1.96,hydra.priv.oc.ietfng.org,1,1308217772,10M,,7,81,2644,7,3577,14,34,93,+,+++,773.7,61,16,,,
  ,,2325,74,13016,99,2342,86,3019,91,11888,99,2184,89,16397ms,1237ms,671ms,2009ms,177us,1305ms,489ms,1029
  us,270ms,140ms,53730us,250ms
  Writing a byte at a time...done
  Writing intelligently...done
  Rewriting...done
  Reading a byte at a time...done
  Reading intelligently...done
  t_s for 79654:1086343168 with 0x1
  t_s for 79654:1086346240 with 0x1
  t_s for 79654:1086345216 with 0x1
  t_s for 79654:1086347264 with 0x1
  t_s for 79654:1086344192 with 0x1
  start 'em...done...done...done...done...done...
  _m_t_c for 79654:1086347264 with 0x1
  _m_t_c for 79654:1086344192 with 0x1
  _m_t_c for 79654:1086343168 with 0x1
 [...]
 
 So what seems to be happening is that the TLS area is being set up
 incorrectly, eventually: rather than zeroing the tcache_tls value, it is
 being set to 1, which means no tcache is ever allocated, so when we get
 around to exiting, the assert trips.
 
 Unfortunately, setting a breakpoint on __libc_allocate_tls seems to do bad
 things to the kernel (inducing a SIR without any panic message).  I am
 somewhat at a loss; help?
 
 Thanks in advance!
 --nwf;




pgppZBaMjAyJt.pgp
Description: PGP signature


Re: ZFS panic with concurrent recv and read-heavy workload

2011-06-03 Thread Nathaniel W Filardo
I just got this on another machine, no heavy workload needed, just booting
and starting some jails.  Of interest, perhaps, both this and the machine
triggering the below panic are SMP V240s with 1.5GHz CPUs (though I will
confess that the machine in the original report may have had bad RAM).  I
have run a UP 1.2GHz V240 for months and never seen this panic.

This time the kernel is
 FreeBSD 9.0-CURRENT #9: Fri Jun  3 02:32:13 EDT 2011
csup'd immediately before building.  The full panic this time is
 panic: Lock buf_hash_table.ht_locks[i].ht_lock not exclusively locked @
 /usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c:4659

 cpuid = 1
 KDB: stack backtrace:
 panic() at panic+0x1c8
 _sx_assert() at _sx_assert+0xc4
 _sx_xunlock() at _sx_xunlock+0x98
 l2arc_feed_thread() at l2arc_feed_thread+0xeac
 fork_exit() at fork_exit+0x9c
 fork_trampoline() at fork_trampoline+0x8

 SC Alert: SC Request to send Break to host.
 KDB: enter: Line break on console
 [ thread pid 27 tid 100121 ]
 Stopped at  kdb_enter+0x80: ta  %xcc, 1
 db reset
 ttiimmeeoouutt  sshhuuiinngg  ddoowwnn  CCPPUUss..

Half of the memory in this machine is new (well, came with the machine) and
half is from the aforementioned UP V240 which seemed to work fine (I was
attempting an upgrade when this happened); none of it (or indeed any of the
hardware save the disk controller and disks) are common between this and the
machine reporting below.

Thoughts?  Any help would be greatly appreciated.
Thanks.
--nwf;

On Wed, Apr 06, 2011 at 04:00:43AM -0400, Nathaniel W Filardo wrote:
[...]
 panic: Lock buf_hash_table.ht_locks[i].ht_lock not exclusively locked @ 
 /usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c:1869

 cpuid = 1
 KDB: stack backtrace:
 panic() at panic+0x1c8
 _sx_assert() at _sx_assert+0xc4
 _sx_xunlock() at _sx_xunlock+0x98
 arc_evict() at arc_evict+0x614
 arc_get_data_buf() at arc_get_data_buf+0x360
 arc_buf_alloc() at arc_buf_alloc+0x94
 dmu_buf_will_fill() at dmu_buf_will_fill+0xfc
 dmu_write() at dmu_write+0xec
 dmu_recv_stream() at dmu_recv_stream+0x8a8
 zfs_ioc_recv() at zfs_ioc_recv+0x354
 zfsdev_ioctl() at zfsdev_ioctl+0xe0
 devfs_ioctl_f() at devfs_ioctl_f+0xe8
 kern_ioctl() at kern_ioctl+0x294
 ioctl() at ioctl+0x198
 syscallenter() at syscallenter+0x270
 syscall() at syscall+0x74
 -- syscall (54, FreeBSD ELF64, ioctl) %o7=0x40c13e24 --
 userland() at 0x40e72cc8
 user trace: trap %o7=0x40c13e24
 pc 0x40e72cc8, sp 0x7fd4641
 pc 0x40c158f4, sp 0x7fd4721
 pc 0x40c1e878, sp 0x7fd47f1
 pc 0x40c1ce54, sp 0x7fd8b01
 pc 0x40c1dbe0, sp 0x7fd9431
 pc 0x40c1f718, sp 0x7fdd741
 pc 0x10731c, sp 0x7fdd831
 pc 0x10c90c, sp 0x7fdd8f1
 pc 0x103ef0, sp 0x7fde1d1
 pc 0x4021aff4, sp 0x7fde291
 done
[...]


pgpz83vKmukl9.pgp
Description: PGP signature


ZFS panic with concurrent recv and read-heavy workload

2011-04-06 Thread Nathaniel W Filardo
When racing two workloads, one doing
  zfs recv -v -d testpool
and the other
  find /testpool -type f -print0 | xargs -0 sha1
I can (seemingly reliably) trigger this panic:

panic: Lock buf_hash_table.ht_locks[i].ht_lock not exclusively locked @ 
/usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c:1869
  

   
cpuid = 1   
   
KDB: stack backtrace:   
   
panic() at panic+0x1c8  
   
_sx_assert() at _sx_assert+0xc4 
   
_sx_xunlock() at _sx_xunlock+0x98   
   
arc_evict() at arc_evict+0x614  
   
arc_get_data_buf() at arc_get_data_buf+0x360
   
arc_buf_alloc() at arc_buf_alloc+0x94   
   
dmu_buf_will_fill() at dmu_buf_will_fill+0xfc
dmu_write() at dmu_write+0xec
dmu_recv_stream() at dmu_recv_stream+0x8a8  
   
zfs_ioc_recv() at zfs_ioc_recv+0x354
   
zfsdev_ioctl() at zfsdev_ioctl+0xe0 
   
devfs_ioctl_f() at devfs_ioctl_f+0xe8   
   
kern_ioctl() at kern_ioctl+0x294
   
ioctl() at ioctl+0x198
syscallenter() at syscallenter+0x270
syscall() at syscall+0x74   
   
-- syscall (54, FreeBSD ELF64, ioctl) %o7=0x40c13e24 -- 
   
userland() at 0x40e72cc8
   
user trace: trap %o7=0x40c13e24 
   
pc 0x40e72cc8, sp 0x7fd4641
pc 0x40c158f4, sp 0x7fd4721 
   
pc 0x40c1e878, sp 0x7fd47f1 
   
pc 0x40c1ce54, sp 0x7fd8b01 
   
pc 0x40c1dbe0, sp 0x7fd9431 
   
pc 0x40c1f718, sp 0x7fdd741 
   
pc 0x10731c, sp 0x7fdd831   
   
pc 0x10c90c, sp 0x7fdd8f1   
   
pc 0x103ef0, sp 0x7fde1d1   
   
pc 0x4021aff4, sp 0x7fde291 
   
done

The machine is a freshly installed and built sparc64 2-way SMP, running
today's -CURRENT with
http://people.freebsd.org/~mm/patches/zfs/zfs_ioctl_compat_bugfix.patch
applied.  Of note, it has only 1G of RAM in it, so kmem_max = 512M.

Thoughts?  More information?  Thanks in advance.
--nwf;


pgpo8tXy31jgF.pgp
Description: PGP signature


[sparc64] [panic] cheetah_ipi_selected: CPU can't IPI itself

2010-06-28 Thread Nathaniel W Filardo
Well, I'm back in the same town as my sparc64 and so csup'd, built, and
rebooted, trying to get more information about the vm object not owned
panic I reported a while ago.  To my dismay, I now get this panic, also late
enough in the boot process to be starting up jails:

panic: cheetah_ipi_selected: CPU can't IPI itself
cpuid = 0
KDB: stack backtrace:
panic() at panic+0x1c8
cheetah_ipi_selected() at cheetah_ipi_selected+0x48
tlb_page_demap() at tlb_page_demap+0xdc
pmap_copy_page() at pmap_copy_page+0x4c4
vm_fault() at vm_fault+0x13ec
trap_pfault() at trap_pfault+0x190
trap() at trap+0xd0
-- data access protection tar=0x224b93 sfar=0x224550 sfsr=0x85
%o7=0x4063398c --
userland() at 0x40633830
user trace: trap %o7=0x4063398c
...

And the system hangs; I had to use the ALOM to reboot it.
Sorry to not have more useful news.
--nwf;


pgpdTuFjRko2M.pgp
Description: PGP signature


[sparc64] [panic] mutex vm object not owned

2010-06-09 Thread Nathaniel W Filardo
Attempting to boot on (2-way SMP; SUN Fire V240) sparc64 a 9.0-CURRENT
kernel built on Jun 9 at 14:41, and fully csup'd before building (I don't
have the SVN revision number, sorry) yields, surprisingly late in the boot
process, this panic:

panic: mutex vm object not owned at /systank/src/sys/vm/vm_object.c:1692
cpuid = 0
KDB: stack backtrace:
panic() at panic+0x1c8
_mtx_assert() at _mtx_assert+0xb0
vm_object_collapse() at vm_object_collapse+0x28
vm_object_deallocate() at vm_object_deallocate+0x538
_vm_map_unlock() at _vm_map_unlock+0x64
vm_map_remove() at vm_map_remove+0x64
vmspace_exit() at vmspace_exit+0x100
exit1() at exit1+0x788
sys_exit() at sys_exit+0x10
syscallenter() at syscallenter+0x268
syscall() at syscall+0x74
-- syscall (1, FreeBSD ELF64, sys_exit) %o7=0x11980c --
userland() at 0x406fe8c8
user trace: trap %o7=0x11980c
pc 0x406fe8c8, sp 0x7fd7611
done
Uptime: 4m7s

The system was, at the time, attempting to bring up its jails.

Anything else that would be helpful to know?
Thanks.
--nwf;


pgp9SXSZG14Ki.pgp
Description: PGP signature