On 2020-06-20 14:33, Phil Stracchino wrote:
> OK, two days with zero hung jobs.  I am proceeding with re-upgrading
> ONLY the Director (well, and that host's FD) to 9.6.5.


That got me three successful jobs, one failed, and two hung.  Here's the
failure:


21-Jun 04:30 minbar-dir JobId 25026: Fatal error: sql_create.c:968
Create db File
+record INSERT INTO File
(FileIndex,JobId,PathId,FilenameId,LStat,MD5,DeltaSeq)
+VALUES (2,25026,122083,109,'R0AAQAM E EHt H A A -B H IA F Be7wj5 BciQ9i
BciQ9i A A
+C','0',0) failed. ERR=Deadlock found when trying to get lock; try
restarting
+transaction21-Jun 04:30 minbar-dir JobId 25026: Fatal error:
catreq.c:513 Attribute
+create error: ERR=sql_create.c:968 Create db File record INSERT INTO File
+(FileIndex,JobId,PathId,FilenameId,LStat,MD5,DeltaSeq) VALUES
+(2,25026,122083,109,'R0AAQAM E EHt H A A -B H IA F Be7wj5 BciQ9i BciQ9i A A
+C','0',0) failed. ERR=Deadlock found when trying to get lock; try
restarting
+transaction21-Jun 04:30 asgard-fd JobId 25026: Error: bsock.c:383 Write
error
+sending 79 bytes to Storage daemon:asgard.caerllewys.net:9103:
ERR=Broken pipe


Once again it is somehow creating a local commit conflict on the
cluster.  I CAN configure Bacula to send all transactions to a single
node of the cluster instead of load-balancing them; however, Bacula
SHOULD be detecting reported deadlocks and retrying on its own.  There
are now several different MySQL synchronous¹-clustering technologies in
the market, principally Galera and Oracle's MySQL Cluster or whatever
they're calling it this week, they aren't difficult to support, and an
enterprise backup tool really should properly support them.


¹  Well, OK, Codership likes to use the phrase "virtually synchronous",
by which they mean that Galera replication tries to ensure that commits
are synchronous modulo network latency.  Oracle with its Paxos-based
algorithm disparages Galera while making vague promises about eventual
consistency.



Here's the state of the Director:

(gdb) attach 5160
Attaching to process 5160
[New LWP 5753]
[New LWP 5754]
[New LWP 17466]
[New LWP 17470]
[New LWP 24466]
[New LWP 26300]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
0x00007f3e6bc044c5 in nanosleep () from /lib64/libpthread.so.0
(gdb) thread apply all bt

Thread 7 (Thread 0x7f3e437fe700 (LWP 26300)):
#0  0x00007f3e6bc03dfc in read () from /lib64/libpthread.so.0
#1  0x00007f3e6bc57ed3 in BSOCKCORE::socketRead (this=0x7f3e64000d78,
len=4, buf=0x7f3e437fddf4, fd=<optimized out>) at ../lib/bsockcore.h:202
#2  BSOCKCORE::read_nbytes (nbytes=<optimized out>, ptr=0x7f3e437fddf4
">\177", this=<optimized out>) at bsockcore.c:1144
#3  BSOCKCORE::read_nbytes (this=0x7f3e64000d78, ptr=<optimized out>,
nbytes=4) at bsockcore.c:1130
#4  0x00007f3e6bc313bd in BSOCK::recv (this=this@entry=0x7f3e64000d78)
at bsock.c:441
#5  0x0000556eddfd86d6 in handle_UA_client_request (arg=0x7f3e64000d78)
at ua_server.c:144
#6  0x00007f3e6bc665b5 in workq_server (arg=0x556ede01b9c0 <ua_workq>)
at workq.c:372
#7  0x00007f3e6bbf9ea7 in start_thread () from /lib64/libpthread.so.0
#8  0x00007f3e6b8b8c6f in clone () from /lib64/libc.so.6

Thread 6 (Thread 0x7f3e597fa700 (LWP 24466)):
#0  0x00007f3e6bc044c5 in nanosleep () from /lib64/libpthread.so.0
#1  0x00007f3e6bc2d4e6 in bmicrosleep (sec=sec@entry=2,
usec=usec@entry=0) at bsys.c:192
#2  0x0000556eddfa3d92 in jobq_server (arg=0x556ede01b6a0 <job_queue>)
at jobq.c:616
#3  0x00007f3e6bbf9ea7 in start_thread () from /lib64/libpthread.so.0
#4  0x00007f3e6b8b8c6f in clone () from /lib64/libc.so.6

Thread 5 (Thread 0x7f3e59ffb700 (LWP 17470)):
#0  0x00007f3e6bc03dfc in read () from /lib64/libpthread.so.0
#1  0x00007f3e6bc57ed3 in BSOCKCORE::socketRead (this=0x7f3e48006778,
len=4, buf=0x7f3e59ffa9b4, fd=<optimized out>) at ../lib/bsockcore.h:202
#2  BSOCKCORE::read_nbytes (nbytes=<optimized out>, ptr=0x7f3e59ffa9b4
"\372\377\377\377", this=<optimized out>) at bsockcore.c:1144
#3  BSOCKCORE::read_nbytes (this=0x7f3e48006778, ptr=<optimized out>,
nbytes=4) at bsockcore.c:1130
#4  0x00007f3e6bc313bd in BSOCK::recv (this=this@entry=0x7f3e48006778)
at bsock.c:441
#5  0x0000556eddf9a6e7 in bget_dirmsg (bs=bs@entry=0x7f3e48006778) at
getmsg.c:150
#6  0x0000556eddf88d78 in wait_for_job_termination
(jcr=jcr@entry=0x556edf8d7a88, timeout=timeout@entry=0) at backup.c:685
#7  0x0000556eddf8b009 in do_backup (jcr=jcr@entry=0x556edf8d7a88) at
backup.c:633
#8  0x0000556eddf9d318 in job_thread (arg=0x556edf8d7a88) at job.c:453
#9  0x0000556eddfa37fb in jobq_server (arg=0x556ede01b6a0 <job_queue>)
at jobq.c:468
#10 0x00007f3e6bbf9ea7 in start_thread () from /lib64/libpthread.so.0
#11 0x00007f3e6b8b8c6f in clone () from /lib64/libc.so.6

Thread 4 (Thread 0x7f3e5affd700 (LWP 17466)):
#0  0x00007f3e6bc03dfc in read () from /lib64/libpthread.so.0
#1  0x00007f3e6bc57ed3 in BSOCKCORE::socketRead (this=0x7f3e50008e58,
len=4, buf=0x7f3e5affc9b4, fd=<optimized out>) at ../lib/bsockcore.h:202
#2  BSOCKCORE::read_nbytes (nbytes=<optimized out>, ptr=0x7f3e5affc9b4
"\372\377\377\377", this=<optimized out>) at bsockcore.c:1144
#3  BSOCKCORE::read_nbytes (this=0x7f3e50008e58, ptr=<optimized out>,
nbytes=4) at bsockcore.c:1130
#4  0x00007f3e6bc313bd in BSOCK::recv (this=this@entry=0x7f3e50008e58)
at bsock.c:441
#5  0x0000556eddf9a6e7 in bget_dirmsg (bs=bs@entry=0x7f3e50008e58) at
getmsg.c:150
#6  0x0000556eddf88d78 in wait_for_job_termination
(jcr=jcr@entry=0x556edf8cc618, timeout=timeout@entry=0) at backup.c:685
#7  0x0000556eddf8b009 in do_backup (jcr=jcr@entry=0x556edf8cc618) at
backup.c:633
#8  0x0000556eddf9d318 in job_thread (arg=0x556edf8cc618) at job.c:453
#9  0x0000556eddfa37fb in jobq_server (arg=0x556ede01b6a0 <job_queue>)
at jobq.c:468
#10 0x00007f3e6bbf9ea7 in start_thread () from /lib64/libpthread.so.0
#11 0x00007f3e6b8b8c6f in clone () from /lib64/libc.so.6

Thread 3 (Thread 0x7f3e6990e700 (LWP 5754)):
#0  0x00007f3e6bc00878 in pthread_cond_timedwait@@GLIBC_2.3.2 () from
/lib64/libpthread.so.0
#1  0x00007f3e6bc65be9 in watchdog_thread (arg=<optimized out>) at
watchdog.c:299
#2  0x00007f3e6bbf9ea7 in start_thread () from /lib64/libpthread.so.0
#3  0x00007f3e6b8b8c6f in clone () from /lib64/libc.so.6

Thread 2 (Thread 0x7f3e6a10f700 (LWP 5753)):
#0  0x00007f3e6b8b09d3 in select () from /lib64/libc.so.6
#1  0x00007f3e6bc30a38 in bnet_thread_server
(addrs=addrs@entry=0x556edf86e718, max_clients=20,
client_wq=client_wq@entry=0x556ede01b9c0 <ua_workq>,
handle_client_request=handle_client_request@entry=0x556eddfd8650
<handle_UA_client_request(void*)>) at bnet_server.c:166
--Type <RET> for more, q to quit, c to continue without paging--c
#2  0x0000556eddfd8296 in connect_thread (arg=0x556edf86e718) at
ua_server.c:85
#3  0x00007f3e6bbf9ea7 in start_thread () from /lib64/libpthread.so.0
#4  0x00007f3e6b8b8c6f in clone () from /lib64/libc.so.6

Thread 1 (Thread 0x7f3e6abd20c0 (LWP 5160)):
#0  0x00007f3e6bc044c5 in nanosleep () from /lib64/libpthread.so.0
#1  0x00007f3e6bc2d4e6 in bmicrosleep (sec=sec@entry=60,
usec=usec@entry=0) at bsys.c:192
#2  0x0000556eddfb00d4 in wait_for_next_job
(one_shot_job_to_run=<optimized out>) at scheduler.c:121
#3  0x0000556eddf825f5 in main (argc=<optimized out>, argv=<optimized
out>) at dird.c:387


And here's the state of one of the hung clients (the other is the Fedora
client that I don't have a debug build on):

(gdb) attach 2823
Attaching to process 2823
[New LWP 2830]
[New LWP 7909]
[New LWP 7910]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
0x00007f3f37d87123 in select () from /lib64/libc.so.6
(gdb) thread apply all bt

Thread 4 (Thread 0x7f3f36510700 (LWP 7910)):
#0  0x00007f3f37d87123 in select () from /lib64/libc.so.6
#1  0x00007f3f38136242 in fd_wait_data (fd=6, mode=<optimized out>,
mode@entry=WAIT_READ, sec=sec@entry=5, msec=msec@entry=0) at bsys.c:1203
#2  0x00007f3f3815fadb in BSOCKCORE::wait_data_intr
(this=this@entry=0x7f3f30000e18, sec=sec@entry=5, msec=msec@entry=0) at
bsockcore.c:875
#3  0x000055d4f367de98 in sd_heartbeat_thread (arg=0x7f3f2c000ed8) at
heartbeat.c:69
#4  0x00007f3f38100057 in start_thread () from /lib64/libpthread.so.0
#5  0x00007f3f37d8f6cf in clone () from /lib64/libc.so.6

Thread 3 (Thread 0x7f3f354c7700 (LWP 7909)):
#0  0x00007f3f3810a4ef in write () from /lib64/libpthread.so.0
#1  0x00007f3f3816021f in BSOCKCORE::socketWrite (this=0x7f3f2c005458,
len=35938, buf=0x7f3f2c0bb3bc, fd=<optimized out>) at ../lib/bsockcore.h:203
#2  BSOCKCORE::write_nbytes (nbytes=<optimized out>, ptr=0x7f3f2c0bb3bc
"@", this=<optimized out>) at bsockcore.c:1079
#3  BSOCKCORE::write_nbytes (this=this@entry=0x7f3f2c005458,
ptr=<optimized out>, nbytes=35938) at bsockcore.c:1064
#4  0x00007f3f3813a878 in BSOCK::write_nbytes (this=0x7f3f2c005458,
ptr=<optimized out>, nbytes=35938) at bsock.c:831
#5  0x00007f3f3813977a in BSOCK::send (aflags=0, this=0x7f3f2c005458) at
bsock.c:368
#6  BSOCK::send (this=this@entry=0x7f3f2c005458, aflags=aflags@entry=0)
at bsock.c:249
#7  0x000055d4f36730d1 in BSOCK::send (this=0x7f3f2c005458) at
../lib/bsock.h:75
#8  process_and_send_data (bctx=...) at backup.c:845
#9  0x000055d4f36752d0 in send_data (stream=<optimized out>, bctx=...)
at backup.c:655
#10 save_file (jcr=0x7f3f2c000ed8, ff_pkt=0x7f3f2c001558,
top_level=<optimized out>) at backup.c:502
#11 0x00007f3f381ae228 in find_one_file (jcr=<optimized out>,
ff_pkt=0x7f3f2c001558, handle_file=0x7f3f381acb40 <our_callback(JCR*,
FF_PKT*, bool)>, fname=0x7f3f2c0044c8 "/home/alaric/.moonchild
productions/pale moon/alaric/adblockplus/patterns-backup3.ini",
parent_device=2304, top_level=<optimized out>) at find_one.c:542
#12 0x00007f3f381aee33 in find_one_file (jcr=<optimized out>,
ff_pkt=0x7f3f2c001558, handle_file=<optimized out>, fname=0x7f3f2c0049e8
"/home/alaric/.moonchild productions/pale moon/alaric/adblockplus",
parent_device=<optimized out>, top_level=<optimized out>) at find_one.c:768
#13 0x00007f3f381aee33 in find_one_file (jcr=<optimized out>,
ff_pkt=0x7f3f2c001558, handle_file=<optimized out>, fname=0x7f3f2c0a9b68
"/home/alaric/.moonchild productions/pale moon/alaric",
parent_device=<optimized out>, top_level=<optimized out>) at find_one.c:768
#14 0x00007f3f381aee33 in find_one_file (jcr=<optimized out>,
ff_pkt=0x7f3f2c001558, handle_file=<optimized out>, fname=0x7f3f2c003f78
"/home/alaric/.moonchild productions/pale moon",
parent_device=<optimized out>, top_level=<optimized out>) at find_one.c:768
#15 0x00007f3f381aee33 in find_one_file (jcr=<optimized out>,
ff_pkt=0x7f3f2c001558, handle_file=<optimized out>, fname=0x7f3f2c0034b8
"/home/alaric/.moonchild productions", parent_device=<optimized out>,
top_level=<optimized out>) at find_one.c:768
#16 0x00007f3f381aee33 in find_one_file (jcr=<optimized out>,
ff_pkt=0x7f3f2c001558, handle_file=<optimized out>, fname=0x7f3f2c0077f8
"/home/alaric", parent_device=<optimized out>, top_level=<optimized
out>) at find_one.c:768
#17 0x00007f3f381aee33 in find_one_file (jcr=<optimized out>,
ff_pkt=0x7f3f2c001558, handle_file=<optimized out>, fname=0x7f3f2c007298
"/home", parent_device=<optimized out>, top_level=<optimized out>) at
find_one.c:768
#18 0x00007f3f381aee33 in find_one_file (jcr=jcr@entry=0x7f3f2c000ed8,
ff_pkt=ff_pkt@entry=0x7f3f2c001558,
handle_file=handle_file@entry=0x7f3f381acb40 <our_callback(JCR*,
FF_PKT*, bool)>, fname=fname@entry=0x7f3f2c002418 "/",
parent_device=parent_device@entry=18446744073709551615,
top_level=top_level@entry=true) at find_one.c:768
#19 0x00007f3f381abdf7 in find_files (jcr=jcr@entry=0x7f3f2c000ed8,
ff=0x7f3f2c001558, file_save=file_save@entry=0x55d4f3674500
<save_file(JCR*, FF_PKT*, bool)>, plugin_save=0x55d4f3679440
<plugin_save(JCR*, FF_PKT*, bool)>) at find.c:186
#20 0x000055d4f3672c06 in blast_data_to_storage_daemon
(jcr=jcr@entry=0x7f3f2c000ed8, addr=addr@entry=0x0) at backup.c:166
#21 0x000055d4f368343f in backup_cmd (jcr=0x7f3f2c000ed8) at job.c:2522
#22 0x000055d4f3684658 in handle_director_request (dir=0x55d4f52f8298)
at job.c:343
#23 handle_connection_request (caller=0x55d4f52f8298) at job.c:503
#24 0x00007f3f3816f0ac in workq_server (arg=0x55d4f36aaca0 <dir_workq>)
at workq.c:372
#25 0x00007f3f38100057 in start_thread () from /lib64/libpthread.so.0
#26 0x00007f3f37d8f6cf in clone () from /lib64/libc.so.6

Thread 2 (Thread 0x7f3f36d11700 (LWP 2830)):
#0  0x00007f3f38106d08 in pthread_cond_timedwait@@GLIBC_2.3.2 () from
/lib64/libpthread.so.0
#1  0x00007f3f3816e6d0 in watchdog_thread (arg=<optimized out>) at
watchdog.c:299
#2  0x00007f3f38100057 in start_thread () from /lib64/libpthread.so.0
#3  0x00007f3f37d8f6cf in clone () from /lib64/libc.so.6
--Type <RET> for more, q to quit, c to continue without paging--c

Thread 1 (Thread 0x7f3f377ba740 (LWP 2823)):
#0  0x00007f3f37d87123 in select () from /lib64/libc.so.6
#1  0x00007f3f38137c60 in bnet_thread_server (addrs=0x55d4f52f65b8,
max_clients=20, client_wq=0x55d4f36aaca0 <dir_workq>,
handle_client_request=0x55d4f3683c40 <handle_connection_request(void*)>)
at bnet_server.c:166
#2  0x000055d4f3671777 in main (argc=<optimized out>, argv=<optimized
out>) at filed.c:277



-- 
  Phil Stracchino
  Babylon Communications
  ph...@caerllewys.net
  p...@co.ordinate.org
  Landline: +1.603.293.8485
  Mobile:   +1.603.998.6958


_______________________________________________
Bacula-devel mailing list
Bacula-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/bacula-devel

Reply via email to