OK, so, I reinstalled with debug binaries, having figured out how to override global make.conf for specific builds. This morning, three jobs were hung, and once again it seemed to be random clients hung on random files. There was nothing much of interest showing in the client trace files (at d200) except for heartbeat messages one after another.
On trying to kill the first stuck job, nothing happened except that it did not terminate. On trying to kill the second, the director crashed again. Upon restarting the Director and re-queueing the failed jobs, all ran successfully. Here's the traceback: minbar:root:~:9 # cat /var/lib/bacula/bacula.15819.traceback [New LWP 15832] [New LWP 15833] [New LWP 23804] [New LWP 23810] [New LWP 23811] [New LWP 30797] [New LWP 22128] [Thread debugging using libthread_db enabled] Using host libthread_db library "/lib64/libthread_db.so.1". 0x00007f74d89434c5 in nanosleep () from /lib64/libpthread.so.0 $1 = "17-Jun-2020 08:24:13\000\000\000\000\000\000\000\000\000" $2 = 0x5587048c6e40 <my_name> "minbar-dir" $3 = 0x55870660d0f8 "bacula-dir" $4 = 0x55870660d138 "/usr/sbin/bacula-dir" $5 = 0x0 $6 = "MySQL", '\000' <repeats 44 times> $7 = 0x7f74d89b44f4 "9.6.5 (11 June 2020)" $8 = 0x7f74d89b44d8 "x86_64-pc-linux-gnu" $9 = 0x7f74d89b44d1 "gentoo" $10 = 0x7f74d89b3ff3 "" $11 = "minbar", '\000' <repeats 43 times> $12 = 0x7f74d89b44ec "gentoo " Environment variable "TestName" not defined. #0 0x00007f74d89434c5 in nanosleep () from /lib64/libpthread.so.0 #1 0x00007f74d896c4e6 in bmicrosleep (sec=sec@entry=60, usec=usec@entry=0) at bsys.c:192 #2 0x000055870485c0d4 in wait_for_next_job (one_shot_job_to_run=<optimized out>) at scheduler.c:121 #3 0x000055870482e5f5 in main (argc=<optimized out>, argv=<optimized out>) at dird.c:387 Thread 8 (Thread 0x7f74c4ff9700 (LWP 22128)): #0 0x00007f74d8942dfc in read () from /lib64/libpthread.so.0 #1 0x00007f74d8996ed3 in BSOCKCORE::socketRead (this=0x7f74d0000d78, len=4, buf=0x7f74c4ff8df4, fd=<optimized out>) at ../lib/bsockcore.h:202 #2 BSOCKCORE::read_nbytes (nbytes=<optimized out>, ptr=0x7f74c4ff8df4 "t\177", this=<optimized out>) at bsockcore.c:1144 #3 BSOCKCORE::read_nbytes (this=0x7f74d0000d78, ptr=<optimized out>, nbytes=4) at bsockcore.c:1130 #4 0x00007f74d89703bd in BSOCK::recv (this=this@entry=0x7f74d0000d78) at bsock.c:441 #5 0x00005587048846d6 in handle_UA_client_request (arg=0x7f74d0000d78) at ua_server.c:144 #6 0x00007f74d89a55b5 in workq_server (arg=0x5587048c79c0 <ua_workq>) at workq.c:372 #7 0x00007f74d8938ea7 in start_thread () from /lib64/libpthread.so.0 #8 0x00007f74d85f7c6f in clone () from /lib64/libc.so.6 Thread 7 (Thread 0x7f74c77fe700 (LWP 30797)): #0 0x00007f74d89434c5 in nanosleep () from /lib64/libpthread.so.0 #1 0x00007f74d896c4e6 in bmicrosleep (sec=sec@entry=2, usec=usec@entry=0) at bsys.c:192 #2 0x000055870484fd92 in jobq_server (arg=0x5587048c76a0 <job_queue>) at jobq.c:616 #3 0x00007f74d8938ea7 in start_thread () from /lib64/libpthread.so.0 #4 0x00007f74d85f7c6f in clone () from /lib64/libc.so.6 Thread 6 (Thread 0x7f74c5ffb700 (LWP 23811)): #0 0x00007f74d8942dfc in read () from /lib64/libpthread.so.0 #1 0x00007f74d8996ed3 in BSOCKCORE::socketRead (this=0x7f74b00045c8, len=4, buf=0x7f74c5ffa9b4, fd=<optimized out>) at ../lib/bsockcore.h:202 #2 BSOCKCORE::read_nbytes (nbytes=<optimized out>, ptr=0x7f74c5ffa9b4 "\372\377\377\377", this=<optimized out>) at bsockcore.c:1144 #3 BSOCKCORE::read_nbytes (this=0x7f74b00045c8, ptr=<optimized out>, nbytes=4) at bsockcore.c:1130 #4 0x00007f74d89703bd in BSOCK::recv (this=this@entry=0x7f74b00045c8) at bsock.c:441 #5 0x00005587048466e7 in bget_dirmsg (bs=bs@entry=0x7f74b00045c8) at getmsg.c:150 #6 0x0000558704834d78 in wait_for_job_termination (jcr=jcr@entry=0x55870667a8c8, timeout=timeout@entry=0) at backup.c:685 #7 0x0000558704837009 in do_backup (jcr=jcr@entry=0x55870667a8c8) at backup.c:633 #8 0x0000558704849318 in job_thread (arg=0x55870667a8c8) at job.c:453 #9 0x000055870484f7fb in jobq_server (arg=0x5587048c76a0 <job_queue>) at jobq.c:468 #10 0x00007f74d8938ea7 in start_thread () from /lib64/libpthread.so.0 #11 0x00007f74d85f7c6f in clone () from /lib64/libc.so.6 Thread 5 (Thread 0x7f74c67fc700 (LWP 23810)): #0 0x00007f74d89439ae in waitpid () from /lib64/libpthread.so.0 #1 0x00007f74d899a74c in signal_handler (sig=6) at signal.c:233 #2 <signal handler called> #3 0x00007f74d8537621 in raise () from /lib64/libc.so.6 #4 0x00007f74d852155b in abort () from /lib64/libc.so.6 #5 0x00007f74d7e88c01 in ?? () from /usr/lib64/libmysqlclient.so.21 #6 0x00007f74d7e8b5c7 in ?? () from /usr/lib64/libmysqlclient.so.21 #7 0x00007f74d7e398d0 in ?? () from /usr/lib64/libmysqlclient.so.21 #8 0x00007f74d7e39e1d in ?? () from /usr/lib64/libmysqlclient.so.21 #9 0x00007f74d7e35146 in mysql_send_query () from /usr/lib64/libmysqlclient.so.21 #10 0x00007f74d7e35385 in mysql_real_query () from /usr/lib64/libmysqlclient.so.21 #11 0x00007f74d89e0138 in BDB_MYSQL::sql_query (this=0x558706673108, query=0x7f74c801b320 "UPDATE Job SET JobStatus='f',EndTime='2020-06-17 08:24:13',ClientId=31,JobBytes=15644859,ReadBytes=0,JobFiles=19,JobErrors=1,VolSessionId=33,VolSessionTime=1592158122,PoolId=5,FileSetId=31,JobTDate=15"..., flags=0) at mysql.c:537 #12 0x00007f74d89f323c in BDB::UpdateDB (this=this@entry=0x558706673108, jcr=jcr@entry=0x558706672128, cmd=0x7f74c801b320 "UPDATE Job SET JobStatus='f',EndTime='2020-06-17 08:24:13',ClientId=31,JobBytes=15644859,ReadBytes=0,JobFiles=19,JobErrors=1,VolSessionId=33,VolSessionTime=1592158122,PoolId=5,FileSetId=31,JobTDate=15"..., can_be_empty=can_be_empty@entry=false, file=file@entry=0x7f74d8a040d7 "bdb.h", line=line@entry=140) at sql.c:474 #13 0x00007f74d8a0292c in BDB::bdb_update_job_end_record (this=0x558706673108, jcr=jcr@entry=0x558706672128, jr=jr@entry=0x558706672618) at sql_update.c:190 #14 0x000055870484aff1 in update_job_end_record (jcr=0x558706672128) at job.c:1369 #15 0x0000558704835c1c in backup_cleanup (jcr=jcr@entry=0x558706672128, TermCode=TermCode@entry=69) at backup.c:898 #16 0x00005587048491aa in job_thread (arg=0x558706672128) at job.c:455 #17 0x000055870484f7fb in jobq_server (arg=0x5587048c76a0 <job_queue>) at jobq.c:468 #18 0x00007f74d8938ea7 in start_thread () from /lib64/libpthread.so.0 #19 0x00007f74d85f7c6f in clone () from /lib64/libc.so.6 Thread 4 (Thread 0x7f74d561c700 (LWP 23804)): #0 0x00007f74d8942dfc in read () from /lib64/libpthread.so.0 #1 0x00007f74d8996ed3 in BSOCKCORE::socketRead (this=0x7f74cc000fd8, len=4, buf=0x7f74d561b9b4, fd=<optimized out>) at ../lib/bsockcore.h:202 #2 BSOCKCORE::read_nbytes (nbytes=<optimized out>, ptr=0x7f74d561b9b4 "\372\377\377\377", this=<optimized out>) at bsockcore.c:1144 #3 BSOCKCORE::read_nbytes (this=0x7f74cc000fd8, ptr=<optimized out>, nbytes=4) at bsockcore.c:1130 #4 0x00007f74d89703bd in BSOCK::recv (this=this@entry=0x7f74cc000fd8) at bsock.c:441 #5 0x00005587048466e7 in bget_dirmsg (bs=bs@entry=0x7f74cc000fd8) at getmsg.c:150 #6 0x0000558704834d78 in wait_for_job_termination (jcr=jcr@entry=0x558706683928, timeout=timeout@entry=0) at backup.c:685 #7 0x0000558704837009 in do_backup (jcr=jcr@entry=0x558706683928) at backup.c:633 #8 0x0000558704849318 in job_thread (arg=0x558706683928) at job.c:453 #9 0x000055870484f7fb in jobq_server (arg=0x5587048c76a0 <job_queue>) at jobq.c:468 #10 0x00007f74d8938ea7 in start_thread () from /lib64/libpthread.so.0 #11 0x00007f74d85f7c6f in clone () from /lib64/libc.so.6 Thread 3 (Thread 0x7f74d664d700 (LWP 15833)): #0 0x00007f74d893f878 in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00007f74d89a4be9 in watchdog_thread (arg=<optimized out>) at watchdog.c:299 #2 0x00007f74d8938ea7 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f74d85f7c6f in clone () from /lib64/libc.so.6 Thread 2 (Thread 0x7f74d6e4e700 (LWP 15832)): #0 0x00007f74d85ef9d3 in select () from /lib64/libc.so.6 #1 0x00007f74d896fa38 in bnet_thread_server (addrs=addrs@entry=0x55870660c718, max_clients=20, client_wq=client_wq@entry=0x5587048c79c0 <ua_workq>, handle_client_request=handle_client_request@entry=0x558704884650 <handle_UA_client_request(void*)>) at bnet_server.c:166 #2 0x0000558704884296 in connect_thread (arg=0x55870660c718) at ua_server.c:85 #3 0x00007f74d8938ea7 in start_thread () from /lib64/libpthread.so.0 #4 0x00007f74d85f7c6f in clone () from /lib64/libc.so.6 Thread 1 (Thread 0x7f74d79110c0 (LWP 15819)): #0 0x00007f74d89434c5 in nanosleep () from /lib64/libpthread.so.0 #1 0x00007f74d896c4e6 in bmicrosleep (sec=sec@entry=60, usec=usec@entry=0) at bsys.c:192 #2 0x000055870485c0d4 in wait_for_next_job (one_shot_job_to_run=<optimized out>) at scheduler.c:121 #3 0x000055870482e5f5 in main (argc=<optimized out>, argv=<optimized out>) at dird.c:387 #0 0x00007f74d89434c5 in nanosleep () from /lib64/libpthread.so.0 No symbol table info available. #1 0x00007f74d896c4e6 in bmicrosleep (sec=sec@entry=60, usec=usec@entry=0) at bsys.c:192 192 bsys.c: No such file or directory. timeout = {tv_sec = 60, tv_nsec = 0} tv = {tv_sec = 1, tv_usec = 1578147840} tz = {tz_minuteswest = -661166920, tz_dsttime = 32628} stat = <optimized out> #2 0x000055870485c0d4 in wait_for_next_job (one_shot_job_to_run=<optimized out>) at scheduler.c:121 121 scheduler.c: No such file or directory. jcr = <optimized out> job = <optimized out> run = <optimized out> now = <optimized out> prev = <optimized out> first = false next_job = <optimized out> tjcr = <optimized out> #3 0x000055870482e5f5 in main (argc=<optimized out>, argv=<optimized out>) at dird.c:387 387 dird.c: No such file or directory. ch = <optimized out> jcr = <optimized out> no_signals = false uid = 0x7fff69664b24 "root" gid = 0x7fff69664b2c "bacula" mode = <optimized out> /usr/libexec/bacula/btraceback.gdb:24: Error in sourced command file: No frame at level 4. [Inferior 1 (process 15819) detached] Attempt to dump current JCRs. njcrs=6 threadid=0x7f74d79110c0 JobId=0 JobStatus=R jcr=0x55870664f4c8 name=*JobMonitor*.2020-06-16_11.24.20_01 use_count=1 killable=0 JobType=I JobLevel= sched_time=16-Jun-2020 11:24 start_time=16-Jun-2020 11:24 end_time=31-Dec-1969 19:00 wait_time=31-Dec-1969 19:00 db=(nil) db_batch=(nil) batch_started=0 wstore=0x558706615b78 rstore=(nil) wjcr=(nil) client=0x558706617798 reschedule_count=0 SD_msg_chan_started=0 threadid=0x7f74c67fc700 JobId=24993 JobStatus=f jcr=0x558706672128 name=Fisherprice_Backup.2020-06-17_04.30.00_11 use_count=1 killable=1 JobType=B JobLevel=I sched_time=17-Jun-2020 04:30 start_time=17-Jun-2020 04:30 end_time=17-Jun-2020 08:24 wait_time=31-Dec-1969 19:00 db=0x558706673108 db_batch=(nil) batch_started=0 wstore=0x558706615b78 rstore=(nil) wjcr=(nil) client=0x558706617c78 reschedule_count=0 SD_msg_chan_started=0 BDB=0x558706673108 db_name=bacula db_user=bacula connected=true cmd="UPDATE Job SET JobStatus='f',EndTime='2020-06-17 08:24:13',ClientId=31,JobBytes=15644859,ReadBytes=0,JobFiles=19,JobErrors=1,VolSessionId=33,VolSessionTime=1592158122,PoolId=5,FileSetId=31,JobTDate=1592396653,RealEndTime='2020-06-17 08:24:13',PriorJobId=0,HasBase=0,PurgedFiles=0 WHERE JobId=24993" changes=15 RWLOCK=0x558706673120 w_active=1 w_wait=0 threadid=0x7f74c5ffb700 JobId=24994 JobStatus=f jcr=0x55870667a8c8 name=Minbar_Backup.2020-06-17_04.30.00_12 use_count=1 killable=1 JobType=B JobLevel=I sched_time=17-Jun-2020 04:30 start_time=17-Jun-2020 04:30 end_time=31-Dec-1969 19:00 wait_time=31-Dec-1969 19:00 db=0x55870667b8a8 db_batch=(nil) batch_started=0 wstore=0x558706615b78 rstore=(nil) wjcr=(nil) client=0x558706617ee8 reschedule_count=0 SD_msg_chan_started=0 BDB=0x55870667b8a8 db_name=bacula db_user=bacula connected=true cmd="INSERT INTO File (FileIndex,JobId,PathId,FilenameId,LStat,MD5,DeltaSeq) VALUES (129,24994,4939,19751,'gC BgB9rk InJ B A A A I9a4 BAA BHw Be6PZK Be6PZK Be6PZN A A C','hm7Q72q+MloeGARz9VAJfw',0)" changes=161 RWLOCK=0x55870667b8c0 w_active=0 w_wait=0 threadid=0x7f74d561c700 JobId=24995 JobStatus=f jcr=0x558706683928 name=Narn_Backup.2020-06-17_04.30.00_13 use_count=1 killable=1 JobType=B JobLevel=I sched_time=17-Jun-2020 04:30 start_time=17-Jun-2020 04:30 end_time=31-Dec-1969 19:00 wait_time=31-Dec-1969 19:00 db=0x5587066854e8 db_batch=(nil) batch_started=0 wstore=0x558706615b78 rstore=(nil) wjcr=(nil) client=0x558706618158 reschedule_count=0 SD_msg_chan_started=0 BDB=0x5587066854e8 db_name=bacula db_user=bacula connected=true cmd="INSERT INTO File (FileIndex,JobId,PathId,FilenameId,LStat,MD5,DeltaSeq) VALUES (33,24995,2366,109,'c E EH9 R Po Bk A R CAA F Be6dSJ Bel6xw Bel6xw A A C','0',0)" changes=34 RWLOCK=0x558706685500 w_active=0 w_wait=0 threadid=0x7f74d79110c0 JobId=24996 JobStatus=p jcr=0x55870664d0c8 name=MySQL_Backup_New.2020-06-17_04.55.00_17 use_count=1 killable=0 JobType=B JobLevel=I sched_time=17-Jun-2020 04:55 start_time=17-Jun-2020 04:55 end_time=31-Dec-1969 19:00 wait_time=17-Jun-2020 04:55 db=0x55870664c688 db_batch=(nil) batch_started=0 wstore=0x558706615b78 rstore=(nil) wjcr=(nil) client=0x558706617ee8 reschedule_count=0 SD_msg_chan_started=0 BDB=0x55870664c688 db_name=bacula db_user=bacula connected=true cmd="SELECT PoolId,Name,NumVols,MaxVols,UseOnce,UseCatalog,AcceptAnyVolume,AutoPrune,Recycle,VolRetention,VolUseDuration,MaxVolJobs,MaxVolFiles,MaxVolBytes,PoolType,LabelType,LabelFormat,RecyclePoolId,ScratchPoolId,ActionOnPurge,CacheRetention FROM Pool WHERE Pool.Name='Incr-Disk'" changes=1 RWLOCK=0x55870664c6a0 w_active=0 w_wait=0 threadid=0x7f74c4ff9700 JobId=0 JobStatus=R jcr=0x7f74b40044f8 name=-Console-.2020-06-17_08.15.46_18 use_count=1 killable=0 JobType=U JobLevel= sched_time=17-Jun-2020 08:15 start_time=17-Jun-2020 08:15 end_time=31-Dec-1969 19:00 wait_time=31-Dec-1969 19:00 db=0x7f74b4001e28 db_batch=(nil) batch_started=0 wstore=0x558706615b78 rstore=(nil) wjcr=(nil) client=0x558706617ee8 reschedule_count=0 SD_msg_chan_started=0 BDB=0x7f74b4001e28 db_name=bacula db_user=bacula connected=true cmd="UPDATE Client SET AutoPrune=1,FileRetention=31104000,JobRetention=31104000,Uname='9.4.4 (28May19) x86_64-redhat-linux-gnu,redhat,' WHERE Name='fisherprice'" changes=81 RWLOCK=0x7f74b4001e40 w_active=0 w_wait=0 List plugins. Hook count=0 -- Phil Stracchino Babylon Communications ph...@caerllewys.net p...@co.ordinate.org Landline: +1.603.293.8485 Mobile: +1.603.998.6958 _______________________________________________ Bacula-devel mailing list Bacula-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/bacula-devel