Hello again, Crashes keep on happening. Instead of reporting the same backtraces here are some that I think I have not reported yet. Different debugging attempts are marked with with '== <date> ==' header so that it is easy to tell which backtraces and value print outs are related to each other.
== 2024-02-09t03-11-37 == Program terminated with signal SIGSEGV, Segmentation fault. (gdb) bt -full #0 0x00005564b581c4f9 in infra_lookup_nottl (infra=0x5564b7646663, infra@entry=0x5564b7bd9230, addr=0x0, addr@entry=0x5564b8042518, addrlen=0, addrlen@entry=16, name=0x0, name@entry=0x5564b844b0d8 "rg", namelen=namelen@entry=11, wr=-1207894016, wr@entry=1) at services/cache/infra.c:383 k = {addr = {ss_family = 2, __ss_padding = "\0005\307\004\2125\000\000\000\000\000\000\000\000\003\000\000\000\000\000\000\000P5/\273dU\000\000\200\262J\270dU\000\0000\262J\270dU\000\0000\262J\270dU\000\000&\310\203\265dU\000\000\240\210\241\226\374\177\000\000\360\323J\270dU\000\000\030\351g\347\353\177\000\000\300\210\241\226\374\177\000\000\020\000\000\0000\000\000\000\220\211\241\226\374\177\000\000\320\210\241\226\374\177\000", __ss_align = 0}, addrlen = 16, zonename = 0x5564b844b0d8 "rg", namelen = 11, entry = {lock = {__data = {__readers = 0, __writers = 0, __wrphase_futex = 16, __writers_futex = 48, __pad3 = 2527168976, __pad4 = 32764, __cur_writer = -1767798512, __shared = 32764, __rwelision = 87 'W', __pad1 = "\331\t\350\353\177\000", __pad2 = 93891074126344, __flags = 3045382490}, __size = "\000\000\000\000\000\000\000\000\020\000\000\0000\000\000\000\320\211\241\226\374\177\000\000\020\211\241\226\374\177\000\000W\331\t\350\353\177\000\000\b\362\036\270dU\000\000Z\331\204\265\001\000\000", __align = 0}, overflow_next = 0xe3, lru_next = 0xe7f29b0d76027000, lru_prev = 0x3000000010, hash = 2527169040, key = 0x7ffc96a18950, data = 0x0}} #1 0x00005564b581ca2b in infra_rtt_update (infra=0x5564b7bd9230, addr=0x5564b8042518, addrlen=16, nm=0x5564b844b0d8 "rg", nmlen=11, qtype=1, roundtrip=-1, orig_rtt=12000, timenow=1707448279) at services/cache/infra.c:579 e = <optimized out> data = <optimized out> needtoinsert = <optimized out> expired = <optimized out> rto = <optimized out> oldprobedelay = <optimized out> #2 0x00005564b58b3dea in serviced_udp_callback (c=0x5564bb2aa9e0, arg=0x5564b80424c0, error=error@entry=-2, rep=rep@entry=0x0) at services/outside_network.c:3270 sq = 0x5564b80424c0 outnet = 0x5564baa5abe0 now = <optimized out> p = <optimized out> #3 0x00005564b58afcf6 in pending_udp_timer_cb (arg=0x5564b81b36a0) at services/outside_network.c:1592 p = 0x5564b81b36a0 outnet = 0x5564baa5abe0 __func__ = "pending_udp_timer_cb" #4 0x00007febe8094482 in ?? () No symbol table info available. #5 0x00005564ba902cf0 in ?? () No symbol table info available. ... #15 0x0000000000000000 in ?? () No symbol table info available. (gdb) print *addr@entry $1 = {ss_family = 2, __ss_padding = "\0005\307\004\2125", '\000' <repeats 15 times>, "S", '\000' <repeats 95 times>, __ss_align = 0} (gdb) print *addr Cannot access memory at address 0x0 (gdb) print *infra Cannot access memory at address 0x5564b7646663 (gdb) print k $2 = {addr = {ss_family = 2, __ss_padding = "\0005\307\004\2125\000\000\000\000\000\000\000\000\003\000\000\000\000\000\000\000P5/\273dU\000\000\200\262J\270dU\000\0000\262J\270dU\000\0000\262J\270dU\000\000&\310\203\265dU\000\000\240\210\241\226\374\177\000\000\360\323J\270dU\000\000\030\351g\347\353\177\000\000\300\210\241\226\374\177\000\000\020\000\000\0000\000\000\000\220\211\241\226\374\177\000\000\320\210\241\226\374\177\000", __ss_align = 0}, addrlen = 16, zonename = 0x5564b844b0d8 "rg", namelen = 11, entry = {lock = {__data = {__readers = 0, __writers = 0, __wrphase_futex = 16, __writers_futex = 48, __pad3 = 2527168976, __pad4 = 32764, __cur_writer = -1767798512, __shared = 32764, __rwelision = 87 'W', __pad1 = "\331\t\350\353\177\000", __pad2 = 93891074126344, __flags = 3045382490}, __size = "\000\000\000\000\000\000\000\000\020\000\000\0000\000\000\000\320\211\241\226\374\177\000\000\020\211\241\226\374\177\000\000W\331\t\350\353\177\000\000\b\362\036\270dU\000\000Z\331\204\265\001\000\000", __align = 0}, overflow_next = 0xe3, lru_next = 0xe7f29b0d76027000, lru_prev = 0x3000000010, hash = 2527169040, key = 0x7ffc96a18950, data = 0x0}} (gdb) print wr $3 = -1207894016 (gdb) print *wr Cannot access memory at address 0xffffffffb8010000 It looks like 'addr' became null while infra_lookup_nottl() was running. == 2024-02-08t02-10-49 == Program terminated with signal SIGSEGV, Segmentation fault. (gdb) bt -full #0 rbtree_find_less_equal (rbtree=0x56182f18b550, key=0x7ffc99f47c90, result=0x7ffc99f47c88) at util/rbtree.c:527 r = <optimized out> node = 0x0 __func__ = "rbtree_find_less_equal" #1 0x000056182a24b3f1 in rbtree_search (key=0x7ffc99f47c90, rbtree=<optimized out>) at util/rbtree.c:285 node = 0x0 node = <optimized out> #2 mesh_area_find (mesh=<optimized out>, cinfo=<optimized out>, qinfo=<optimized out>, qflags=<optimized out>, prime=<optimized out>, valrec=<optimized out>) at services/mesh.c:1592 key = {node = {parent = 0x56182bb0dca8, left = 0x0, right = 0x0, key = 0x7ffc99f47c90, color = 0 '\000'}, run_node = {parent = 0x0, left = 0x56182e79c42c, right = 0x23, key = 0x100000003, color = 0 '\000'}, s = {qinfo = {qname = 0x7ffc99f48050 "\001e\froot-servers\003net", qname_len = 20, qtype = 1, qclass = 1, local_alias = 0x0}, query_flags = 256, is_priming = 0, is_valrec = 0, reply = 0x0, return_msg = 0x0, return_rcode = 0, reply_origin = 0x7ffc99f482f0, blacklist = 0x56182bae22d0, region = 0x4, errinf = 0x0, curmod = 3, ext_state = { module_state_initial, module_state_initial, module_state_initial, 707042248, 22040, 790017248, 22040, module_restart_next, module_state_initial, 2582937080, 32764, 1883532271, 32511, 4294967295, module_state_initial, 779732000}, minfo = {0x0, 0x7ffc99f47ff0, 0x7ffc99f481c0, 0x45, 0x43, 0x7ffc99f480f0, 0x7ffc99f480f0, 0x7eff70449afd, 0x56182e79c3f0, 0x56182f16b4e0, 0x60a03274372, 0x0, 0xffffffff00000000, 0xa0289dd641a06f00, 0x0, 0x7ffc99f47ff0}, env = 0x56182a35ccc8 <dnstap.message.field_descriptors.lto_priv+936>, mesh_info = 0x84, prefetch_leeway = 140722891423984, serve_expired_data = 0x7ffc99f480f0, edns_opts_front_in = 0x7ffc99f480f0, edns_opts_back_out = 0x7eff70448ab3, edns_opts_back_in = 0xe, edns_opts_front_out = 0x81, no_cache_lookup = 0, no_cache_store = 0, need_refetch = -1712030080, was_ratelimited = 32764, qstarttime = 0, is_cachedb_answer = 0, client_info = 0x0, respip_action_info = 0x7eff70446e64, rpz_passthru = 0, tcp_required = 0, is_drop = 1883532271}, reply_list = 0x0, cb_list = 0x7ffc99f47ef0, super_set = { root = 0x0, count = 139635549566676, cmp = 0x56002bd196d0}, sub_set = {root = 0xa0289dd641a06f00, count = 53, cmp = 0x56182a25abb5 <ipstrtoaddr+213>}, num_activated = 140722891423872, prev = 0x7eff704499bd, next = 0x0, list_select = (unknown: 0x99f47ef0), unique = 0x0, replies_sent = 92 '\\'} result = <optimized out> #3 0x000056182a2df54f in mesh_new_callback.constprop.0 (mesh=0x56182f18b520, qinfo=qinfo@entry=0x7ffc99f47fb0, edns=edns@entry=0x7ffc99f48250, buf=buf@entry=0x56182f19bdd0, cb=cb@entry=0x56182a2881f0 <auth_xfer_probe_lookup_callback>, cb_arg=cb_arg@entry=0x56182bd26350, rpz_passthru=0, qid=0, qflags=256) at services/mesh.c:559 s = 0x0 unique = 0 timeout = 0 was_detached = 0 was_noreply = 0 added = 0 #4 0x000056182a287683 in xfr_probe_lookup_host (env=0x56182c1b3518, xfr=0x56182bd26350) at services/authzone.c:6630 and so on (gdb) print *node Cannot access memory at address 0x0 Red-black tree node became null while it was examined. == 2024-02-08t04-40-48 == Program terminated with signal SIGSEGV, Segmentation fault. (gdb) bt #0 0x000055da1457abeb in mesh_state_compare (ap=0x55da15570c00, bp=0x0) at services/mesh.c:144 #1 0x000055da1459195a in rbtree_find_less_equal (rbtree=rbtree@entry=0x55da183c5058, key=0x55da15570c00, result=result@entry=0x7ffea92b1638) at util/rbtree.c:527 #2 0x000055da145919e7 in rbtree_search (key=<optimized out>, rbtree=0x55da183c5058) at util/rbtree.c:285 #3 rbtree_delete (rbtree=0x55da183c5058, key=<optimized out>) at util/rbtree.c:333 #4 0x000055da1462439c in mesh_state_delete.part.0.isra.0 (qstate=<optimized out>) at services/mesh.c:984 #5 0x000055da145808c2 in mesh_state_delete (qstate=0x55da15570c50) at services/mesh.c:952 #6 mesh_continue (ev=<synthetic pointer>, s=<optimized out>, mstate=0x55da15570c00, mesh=0x55da183c5040) at services/mesh.c:1843 #7 mesh_run (mesh=0x55da183c5040, mstate=0x55da15570c00, ev=<optimized out>, e=0x0) at services/mesh.c:1874 #8 0x000055da14549f13 in mesh_report_reply (what=0, reply=0x7ffea92b1a40, e=0x55da155749a0, mesh=<optimized out>) at services/mesh.c:816 #9 worker_handle_service_reply (c=0x55da1837fa30, arg=0x55da155749a0, error=0, reply_info=0x7ffea92b1a40) at daemon/worker.c:268 #10 0x000055da145f7069 in serviced_callbacks (sq=sq@entry=0x55da183f6a60, error=error@entry=0, c=c@entry=0x55da1837fa30, rep=rep@entry=0x7ffea92b1a40) at services/outside_network.c:3044 #11 0x000055da145f7e30 in serviced_udp_callback (c=0x55da1837fa30, arg=0x55da183f6a60, error=error@entry=0, rep=rep@entry=0x7ffea92b1a40) at services/outside_network.c:3384 #12 0x000055da145f3b7c in outnet_udp_cb (c=<optimized out>, arg=0x55da17b2c6d0, error=<optimized out>, reply_info=0x7ffea92b1a40) at services/outside_network.c:1537 #13 0x000055da145eba58 in comm_point_udp_callback (fd=236, event=<optimized out>, arg=<optimized out>) at util/netevent.c:1145 #14 0x00007f6cf021d585 in ?? () #15 0x000055da179d4790 in ?? () #16 0x000055da179d4920 in ?? () #17 0x000000000013ce8c in ?? () #18 0x0000000025b244d4 in ?? () #19 0x000000000013cef5 in ?? () #20 0x33eddbe1ed004d00 in ?? () #21 0x000000000013ce7d in ?? () #22 0x000055da179d44d0 in ?? () #23 0x000055da179d4628 in ?? () #24 0x33eddbe1ed004d00 in ?? () #25 0x0000000000000000 in ?? () (gdb) print *a $1 = {node = {parent = 0x55da155b0d00, left = 0x55da14696680 <rbtree_null_node>, right = 0x55da14696680 <rbtree_null_node>, key = 0x55da15570c00, color = 1 '\001'}, run_node = {parent = 0x55da14696680 <rbtree_null_node>, left = 0x55da14696680 <rbtree_null_node>, right = 0x55da14696680 <rbtree_null_node>, key = 0x55da15570c00, color = 0 '\000'}, s = {qinfo = {qname = 0x55da15570e70 "\froot-servers\003net", qname_len = 18, qtype = 43, qclass = 1, local_alias = 0x0}, query_flags = 272, is_priming = 0, is_valrec = 1, reply = 0x0, return_msg = 0x55da155749c0, return_rcode = 0, reply_origin = 0x55da155d52c8, blacklist = 0x0, region = 0x55da15570bc0, errinf = 0x0, curmod = 0, ext_state = {module_finished, module_finished, module_state_initial <repeats 14 times>}, minfo = {0x0, 0x55da15570eb8, 0x0 <repeats 14 times>}, env = 0x55da153ecf48, mesh_info = 0x55da15570c00, prefetch_leeway = 0, serve_expired_data = 0x0, edns_opts_front_in = 0x0, edns_opts_back_out = 0x0, edns_opts_back_in = 0x0, edns_opts_front_out = 0x0, no_cache_lookup = 0, no_cache_store = 0, need_refetch = 0, was_ratelimited = 0, qstarttime = 1707367230, is_cachedb_answer = 0, client_info = 0x0, respip_action_info = 0x0, rpz_passthru = 0, tcp_required = 0, is_drop = 0}, reply_list = 0x0, cb_list = 0x0, super_set = {root = 0x55da15570e88, count = 1, cmp = 0x55da1457adf0 <mesh_state_ref_compare>}, sub_set = {root = 0x55da14696680 <rbtree_null_node>, count = 0, cmp = 0x55da1457adf0 <mesh_state_ref_compare>}, num_activated = 4, prev = 0x0, next = 0x0, list_select = mesh_no_list, unique = 0x0, replies_sent = 1 '\001'} (gdb) print *b Cannot access memory at address 0x0 (gdb) frame 1 #1 0x000055da1459195a in rbtree_find_less_equal (rbtree=rbtree@entry=0x55da183c5058, key=0x55da15570c00, result=result@entry=0x7ffea92b1638) at util/rbtree.c:527 527 util/rbtree.c: No such file or directory. (gdb) print *node $2 = {parent = 0x0, left = 0x0, right = 0x0, key = 0x0, color = 0 '\000'} Got a bit further with red-black tree handling, but ultimately failed due null key. == 2024-02-09t00-07-58 == Program terminated with signal SIGSEGV, Segmentation fault. (gdb) bt -full #0 0x000055ec98cde3ef in new_entry (infra=infra@entry=0x55ec9a467230, addr=addr@entry=0x55ec9a950958, addrlen=0, addrlen@entry=16, name=0x0, name@entry=0x55ec9ad6ee88 "consul\005cfops\003net", namelen=namelen@entry=11, tm=tm@entry=1707437259) at services/cache/infra.c:440 data = 0x550000000000 key = 0x0 #1 0x000055ec98cdec79 in infra_rtt_update (infra=0x55ec9a467230, addr=0x55ec9a950958, addrlen=16, nm=0x55ec9ad6ee88 "consul\005cfops\003net", nmlen=11, qtype=1, roundtrip=-1, orig_rtt=12000, timenow=1707437259) at services/cache/infra.c:586 e = <optimized out> data = <optimized out> needtoinsert = 0 expired = 0 rto = 1 oldprobedelay = 0 #2 0x000055ec98d75dea in serviced_udp_callback (c=0x55ec9db37580, arg=0x55ec9a950900, error=error@entry=-2, rep=rep@entry=0x0) at services/outside_network.c:3270 sq = 0x55ec9a950900 outnet = 0x55ec9d2e6840 now = <optimized out> p = <optimized out> #3 0x000055ec98d71cf6 in pending_udp_timer_cb (arg=0x55ec9a7eb900) at services/outside_network.c:1592 p = 0x55ec9a7eb900 outnet = 0x55ec9d2e6840 __func__ = "pending_udp_timer_cb" #4 0x00007f2462df7482 in ?? () No symbol table info available. ... #15 0x0000000000000000 in ?? () No symbol table info available. Trying to 'key->addrlen = addrlen' in to null key, which is really strange point to fail because three earlier lines of code were key->value assignments and they worked. == 2024-02-04t08-56-00 == Program terminated with signal SIGABRT, Aborted. (gdb) bt -full #0 0x00007f01a06a9e2c in __pthread_mutex_cond_lock_full (mutex=0x130fd1) at ../nptl/pthread_mutex_lock.c:210 assume_other_futex_waiters = <optimized out> oldval = <optimized out> id = -1507605342 __PRETTY_FUNCTION__ = "__pthread_mutex_cond_lock_full" #1 0x0000555873f8f97e in iter_handle (qstate=<optimized out>, iq=0x555874edec28, ie=0x55587489c710, id=1) at iterator/iterator.c:4031 cont = 1 #2 0x00007f01a07bb138 in ?? () from /lib/x86_64-linux-gnu/libc.so.6 No symbol table info available. #3 0x00007fff00000018 in ?? () No symbol table info available. #4 0x00007fff288d2de0 in ?? () No symbol table info available. #5 0x00007fff288d2d70 in ?? () No symbol table info available. #6 0xa623c4a208356200 in ?? () No symbol table info available. #7 0x0000003000000010 in ?? () No symbol table info available. #8 0x00007fff288d2e50 in ?? () No symbol table info available. #9 0x00007f01a07bb138 in ?? () from /lib/x86_64-linux-gnu/libc.so.6 No symbol table info available. #10 0x000007f01a07f242 in ?? () No symbol table info available. #11 0x000055587400d0b0 in reuse_cmp (key1=0x7f01a069e24e <__libc_message+366>, key2=0x7f01a07bb138) at services/outside_network.c:175 r = <optimized out> #12 0x00007f01a07f2440 in main_arena () from /lib/x86_64-linux-gnu/libc.so.6 No symbol table info available. #13 0x0000000000000000 in ?? () No symbol table info available. Was the pthread destroyed before unlocking was attempted? == 2024-02-03t16-54-26 == Program terminated with signal SIGSEGV, Segmentation fault. (gdb) bt -full #0 0x00007ff192d717db in __memcpy_avx_unaligned_erms () at ../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:557 No locals. #1 0x0000564423d53032 in regional_alloc_init (size=5, init=0x5644253bfe58, r=0x7ff192df23f0 <main_arena+1936>) at util/regional.c:189 s = <optimized out> s = <optimized out> #2 serviced_create (region=0x7ff192df23f0 <main_arena+1936>, alloc=0x564424d56ca0, pad_queries_block_size=0, opt_list=<optimized out>, qtype=1, zonelen=5, zone=0x5644253bfe58 "\003net", addrlen=16, addr=0x7ffe944f2db0, tls_auth_name=0x0, ssl_upstream=0, tcp_upstream=0, nocaps=0, want_dnssec=1, dnssec=32784, buff=<optimized out>, outnet=0x56442792f2f0) at services/outside_network.c:2628 sq = 0x5644251363c0 t = {tv_sec = 16, tv_usec = 94850680552456} sq = <optimized out> t = <optimized out> #3 outnet_serviced_query (callback=<optimized out>, was_ratelimited=0x7ffe944f2da8, env=0x5644251efc08, buff=<optimized out>, callback_arg=0x5644253c3600, qstate=0x5644253bfa40, zonelen=5, zone=0x5644253bfe58 "\003net", addrlen=16, addr=0x7ffe944f2db0, tls_auth_name=0x0, ssl_upstream=0, tcp_upstream=0, check_ratelimit=1, nocaps=0, want_dnssec=1, dnssec=32784, flags=<optimized out>, qinfo=<optimized out>, outnet=<optimized out>) at services/outside_network.c:3466 sq = <optimized out> region = 0x7ff192df23f0 <main_arena+1936> timenow = 1706979247 cb = <optimized out> client_string_addr = <optimized out> backed_up_opt_list = <optimized out> per_upstream_opt_list = 0x0 sq = <optimized out> cb = <optimized out> client_string_addr = <optimized out> region = <optimized out> backed_up_opt_list = <optimized out> per_upstream_opt_list = <optimized out> timenow = <optimized out> #4 worker_send_query (qinfo=<optimized out>, flags=<optimized out>, dnssec=32784, want_dnssec=1, nocaps=0, check_ratelimit=1, addr=0x7ffe944f2db0, addrlen=16, zone=0x5644253bfe58 "\003net", zonelen=5, tcp_upstream=0, ssl_upstream=0, tls_auth_name=0x0, q=0x5644253bfa40, was_ratelimited=0x7ffe944f2da8) at daemon/worker.c:2395 etc etc... How is it even possible to get memory alignment issue? -- snip As mentioned earlier these are coming from systems that have connectivity problems with configuration to keep items in cache. Unfortunately I have not examined unbound code base to the point where I could say if failing requests can cause entries to be removed from cache. Please let me know if there is anything I can do to make debugging easier. -- Sami Kerola https://kerolasa.iki.fi/