Revision: 75567
http://sourceforge.net/p/brlcad/code/75567
Author: starseeker
Date: 2020-04-23 03:34:04 +0000 (Thu, 23 Apr 2020)
Log Message:
-----------
Stash the two backtraces we have of the FreeBSD semaphore acquisition failure
bug in action - doubt they have enough information to be diagnostic but they're
the best starting point available...
Modified Paths:
--------------
brlcad/trunk/doc/CMakeLists.txt
Added Paths:
-----------
brlcad/trunk/doc/bsd_semaphore_bug.txt
Modified: brlcad/trunk/doc/CMakeLists.txt
===================================================================
--- brlcad/trunk/doc/CMakeLists.txt 2020-04-23 03:13:03 UTC (rev 75566)
+++ brlcad/trunk/doc/CMakeLists.txt 2020-04-23 03:34:04 UTC (rev 75567)
@@ -228,6 +228,7 @@
README.OSCON-2014
STARTERS
STRATEGY
+ bsd_semaphore_bug.txt
cmake_vars.txt
csv_to_comgeom.txt
implicit_constraints.txt
Added: brlcad/trunk/doc/bsd_semaphore_bug.txt
===================================================================
--- brlcad/trunk/doc/bsd_semaphore_bug.txt (rev 0)
+++ brlcad/trunk/doc/bsd_semaphore_bug.txt 2020-04-23 03:34:04 UTC (rev
75567)
@@ -0,0 +1,155 @@
+The below backtraces were able to catch the infamous "fatal semaphore
+acquisition failure" bug on BSD. This is a bit tricky to catch in the act, so
+the below backtraces are added here to serve as references. They document both
+the gdb backtrace and the trick used to successfully attach to it it (a long
+sleep call compiled in to the key failure area prior to the run, allowing us to
+get a debugger on the program before it actually vanishes.) This case is most
+reliably seen when running make regress in parallel mode - it doesn't usually
+trigger when running individual programs interactively, although those do
+fail occasionally.
+
+So far this has only been observed on FreeBSD.
+
+
+The first failure is from a GQA run:
+
+Architecture set to: x86_64--freebsd11.2.
+(lldb) bt
+* thread #1, name = 'gqa'
+ * frame #0: 0x0000000817433f5c libthr.so.3`_umtx_op_err + 12
+ frame #1: 0x000000081743056a
libthr.so.3`join_common(pthread=0x000000081ae18800,
thread_return=0x0000000000000000, abstime=0x0000000000000000) at thr_join.c:125
+ frame #2: 0x0000000814b9b841
libbu.so.20`bu_parallel(func=(libged.so.20`plane_worker at gqa.c:1223),
ncpu=16, arg=0x00007fffffffe660) at parallel.c:722
+ frame #3: 0x0000000800af161c libged.so.20`ged_gqa(gedp=0x000000081ae8d000,
argc=11, argv=0x000000081ae1c300) at gqa.c:2449
+ frame #4: 0x00000000004010c3 gqa`main(argc=12, argv=0x00007fffffffe828) at
gqa.c:102
+ frame #5: 0x0000000000400c05 gqa`_start + 149
+(lldb) thread list
+Process 56371 stopped
+* thread #1: tid = 101853, 0x0000000817433f5c libthr.so.3`_umtx_op_err + 12,
name = 'gqa'
+ thread #2: tid = 100989, 0x000000081792e6da libc.so.7`__sys_nanosleep + 10,
name = 'gqa'
+(lldb) thread select 2
+* thread #2, name = 'gqa'
+ frame #0: 0x000000081792e6da libc.so.7`__sys_nanosleep + 10
+libc.so.7`__sys_nanosleep:
+-> 0x81792e6da <+10>: jb 0x8179a05a4 ; .cerror
+ 0x81792e6e0 <+16>: retq
+ 0x81792e6e1: int3
+ 0x81792e6e2: int3
+(lldb) bt
+* thread #2, name = 'gqa'
+ * frame #0: 0x000000081792e6da libc.so.7`__sys_nanosleep + 10
+ frame #1: 0x0000000817428a2c
libthr.so.3`__thr_nanosleep(time_to_sleep=<unavailable>,
time_remaining=<unavailable>) at thr_syscalls.c:287
+ frame #2: 0x00000008178b19eb libc.so.7`__sleep(seconds=60000) at sleep.c:62
+ frame #3: 0x0000000814bb218c libbu.so.20`bu_semaphore_release(i=20) at
semaphore.c:272
+ frame #4: 0x0000000800aebc31 libged.so.20`hit(ap=0x00007fffdd7f9dd8,
PartHeadp=0x00007fffdd7f97e8, segs=0x00007fffdd7f98e8) at gqa.c:1093
+ frame #5: 0x0000000804062b12
librt.so.20`rt_shootray(ap=0x00007fffdd7f9dd8) at shoot.c:1273
+ frame #6: 0x0000000800aec6cf libged.so.20`plane_worker(cpu=5,
ptr=0x00007fffffffe660) at gqa.c:1280
+ frame #7: 0x0000000814b9bdcb
libbu.so.20`parallel_interface_arg(utd=0x000000081afedc80) at parallel.c:428
+ frame #8: 0x0000000817426026
libthr.so.3`thread_start(curthread=0x000000081ae18800) at thr_create.c:290
+(lldb) up
+frame #1: 0x0000000817428a2c
libthr.so.3`__thr_nanosleep(time_to_sleep=<unavailable>,
time_remaining=<unavailable>) at thr_syscalls.c:287
+ 284
+ 285 curthread = _get_curthread();
+ 286 _thr_cancel_enter(curthread);
+-> 287 ret = __sys_nanosleep(time_to_sleep, time_remaining);
+ 288 _thr_cancel_leave(curthread, 1);
+ 289
+ 290 return (ret);
+(lldb) up
+frame #2: 0x00000008178b19eb libc.so.7`__sleep(seconds=60000) at sleep.c:62
+ 59
+ 60 time_to_sleep.tv_sec = seconds;
+ 61 time_to_sleep.tv_nsec = 0;
+-> 62 if (((int (*)(const struct timespec *, struct timespec
*))
+ 63 __libc_interposing[INTERPOS_nanosleep])(
+ 64 &time_to_sleep, &time_remaining) != -1)
+ 65 return (0);
+(lldb) up
+frame #3: 0x0000000814bb218c libbu.so.20`bu_semaphore_release(i=20) at
semaphore.c:272
+ 269 # if defined(HAVE_PTHREAD_H)
+ 270 if (pthread_mutex_unlock(&bu_semaphores[i].mu)) {
+ 271 fprintf(stderr, "bu_semaphore_acquire():
pthread_mutex_unlock() failed on [%d]\n", i);
+-> 272 sleep(60000);
+ 273 bu_bomb("fatal semaphore acquisition failure");
+ 274 }
+ 275 # endif
+
+
+The second is from an rt raytrace of a bot:
+
+bin/rt".
+Architecture set to: x86_64--freebsd11.2.
+(lldb) bt
+* thread #1, name = 'rt'
+ * frame #0: 0x0000000800861f5c libthr.so.3`_umtx_op_err + 12
+ frame #1: 0x000000080085e56a
libthr.so.3`join_common(pthread=0x000000081aa3a400,
thread_return=0x0000000000000000, abstime=0x0000000000000000) at thr_join.c:125
+ frame #2: 0x000000081455d841
libbu.so.20`bu_parallel(func=(librt.so.20`_db_walk_dispatcher at
db_tree.c:1955), ncpu=16, arg=0x00007fffffffd700) at parallel.c:722
+ frame #3: 0x00000008032b0f8e
librt.so.20`db_walk_tree(dbip=0x000000081a861000, argc=1,
argv=0x00007fffffffe888, ncpu=16, init_state=0x00007fffffffd988,
reg_start_func=(librt.so.20`_rt_gettree_region_start at tree.c:109),
reg_end_func=(librt.so.20`_rt_gettree_region_end at tree.c:146),
leaf_func=(librt.so.20`_rt_gettree_leaf at tree.c:444),
client_data=0x00007fffffffd978) at db_tree.c:2199
+ frame #4: 0x00000008036661d7
librt.so.20`rt_gettrees_muves(rtip=0x000000081a8a7000,
attrs=0x0000000000000000, argc=1, argv=0x00007fffffffe888, ncpus=16) at
tree.c:783
+ frame #5: 0x0000000803669021
librt.so.20`rt_gettrees_and_attrs(rtip=0x000000081a8a7000,
attrs=0x0000000000000000, argc=1, argv=0x00007fffffffe888, ncpus=16) at
tree.c:887
+ frame #6: 0x0000000803669142
librt.so.20`rt_gettrees(rtip=0x000000081a8a7000, argc=1,
argv=0x00007fffffffe888, ncpus=16) at tree.c:914
+ frame #7: 0x0000000000406a00 rt`def_tree(rtip=0x000000081a8a7000) at
do.c:575
+ frame #8: 0x000000000040bd52 rt`main(argc=6, argv=0x00007fffffffe860) at
main.c:495
+ frame #9: 0x00000000004062e5 rt`_start + 149
+(lldb) thread list
+Process 7726 stopped
+* thread #1: tid = 100805, 0x0000000800861f5c libthr.so.3`_umtx_op_err + 12,
name = 'rt'
+ thread #2: tid = 101797, 0x0000000816ec76da libc.so.7`__sys_nanosleep + 10,
name = 'rt'
+(lldb) thread select 2
+* thread #2, name = 'rt'
+ frame #0: 0x0000000816ec76da libc.so.7`__sys_nanosleep + 10
+libc.so.7`__sys_nanosleep:
+-> 0x816ec76da <+10>: jb 0x816f395a4 ; .cerror
+ 0x816ec76e0 <+16>: retq
+ 0x816ec76e1: int3
+ 0x816ec76e2: int3
+(lldb) bt
+* thread #2, name = 'rt'
+ * frame #0: 0x0000000816ec76da libc.so.7`__sys_nanosleep + 10
+ frame #1: 0x0000000800856a2c
libthr.so.3`__thr_nanosleep(time_to_sleep=<unavailable>,
time_remaining=<unavailable>) at thr_syscalls.c:287
+ frame #2: 0x0000000816e4a9eb libc.so.7`__sleep(seconds=60000) at sleep.c:62
+ frame #3: 0x000000081457418c libbu.so.20`bu_semaphore_release(i=12) at
semaphore.c:272
+ frame #4: 0x000000080366a193
librt.so.20`_rt_find_identical_solid(mat=0x0000000000000000,
dp=0x000000081a8a2e98, rtip=0x000000081a8a7000) at tree.c:425
+ frame #5: 0x00000008036681e3
librt.so.20`_rt_gettree_leaf(tsp=0x000000081a9fa008, pathp=0x000000081a9fa150,
ip=0x00007fffd69eecc0, client_data=0x00007fffffffd978) at tree.c:491
+ frame #6: 0x00000008032ae1b1
librt.so.20`db_recurse(tsp=0x000000081a9fa008, pathp=0x000000081a9fa150,
region_start_statepp=0x00007fffd69eef60, client_data=0x00007fffffffd978) at
db_tree.c:1170
+ frame #7: 0x00000008032b35c9
librt.so.20`_db_walk_subtree(tp=0x000000081a81f200,
region_start_statepp=0x00007fffd69eef60,
leaf_func=(librt.so.20`_rt_gettree_leaf at tree.c:444),
client_data=0x00007fffffffd978, resp=0x000000000062e520) at db_tree.c:1898
+ frame #8: 0x00000008032b172f librt.so.20`_db_walk_dispatcher(cpu=16,
arg=0x00007fffffffd700) at db_tree.c:1992
+ frame #9: 0x000000081455ddcb
libbu.so.20`parallel_interface_arg(utd=0x000000081a9f85e0) at parallel.c:428
+ frame #10: 0x0000000800854026
libthr.so.3`thread_start(curthread=0x000000081aa3a400) at thr_create.c:290
+(lldb) up
+frame #1: 0x0000000800856a2c
libthr.so.3`__thr_nanosleep(time_to_sleep=<unavailable>,
time_remaining=<unavailable>) at thr_syscalls.c:287
+ 284
+ 285 curthread = _get_curthread();
+ 286 _thr_cancel_enter(curthread);
+-> 287 ret = __sys_nanosleep(time_to_sleep, time_remaining);
+ 288 _thr_cancel_leave(curthread, 1);
+ 289
+ 290 return (ret);
+(lldb) up
+frame #2: 0x0000000816e4a9eb libc.so.7`__sleep(seconds=60000) at sleep.c:62
+ 59
+ 60 time_to_sleep.tv_sec = seconds;
+ 61 time_to_sleep.tv_nsec = 0;
+-> 62 if (((int (*)(const struct timespec *, struct timespec
*))
+ 63 __libc_interposing[INTERPOS_nanosleep])(
+ 64 &time_to_sleep, &time_remaining) != -1)
+ 65 return (0);
+(lldb) up
+frame #3: 0x000000081457418c libbu.so.20`bu_semaphore_release(i=12) at
semaphore.c:272
+ 269 # if defined(HAVE_PTHREAD_H)
+ 270 if (pthread_mutex_unlock(&bu_semaphores[i].mu)) {
+ 271 fprintf(stderr, "bu_semaphore_acquire():
pthread_mutex_unlock() failed on [%d]\n", i);
+-> 272 sleep(60000);
+ 273 bu_bomb("fatal semaphore acquisition failure");
+ 274 }
+ 275 # endif
+(lldb) up
+frame #4: 0x000000080366a193
librt.so.20`_rt_find_identical_solid(mat=0x0000000000000000,
dp=0x000000081a8a2e98, rtip=0x000000081a8a7000) at tree.c:425
+ 422 */
+ 423 bu_semaphore_acquire(RT_SEM_STATS);
+ 424 stp->st_bit = rtip->nsolids++;
+-> 425 bu_semaphore_release(RT_SEM_STATS);
+ 426
+ 427 /*
+ 428 * Fill in the last little bit of the structure in full
parallel
+(lldb)
+
Property changes on: brlcad/trunk/doc/bsd_semaphore_bug.txt
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
_______________________________________________
BRL-CAD Source Commits mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/brlcad-commits