Hi all,
I did a "scconf -cq reset" on one of the good nodes and booted the bad
node as suggested, with exactly the same result as before: the bad node
crashed when it tried to join the cluster. Here is the stack trace:
# mdb unix.9 vmcore.9
Loading modules: [ unix krtld genunix ip s1394 usba logindmux ptm cpc
sppp ipc r
andom nca ]
> $C
000002a10007bbf1
__1cToutbound_invo_tableGwakeup6FHIpnJrecstream_rnHID_node__v_+0xd0(7839
9cc8, 10
, 30006e0cc48, 30006e0cd18, 1c00, 0)
000002a10007bca1
__1cGrxdoorMhandle_reply6FpnJrecstream__v_+0xe4(30006e0cc48,
78000, 785df10c, 781a9118, 30006e0cc78, 30006e0cd18)
000002a10007bd61
__1cMtcp_endpointUtcp_receive_callback6MpnPtcpmod_header_t_pnEmsgb_nCosO
mem_allo
c_type__i_+0xe8(783ba, 2a10007c798, 78000, 1, 1, 30006e0cc48)
000002a10007be21
__1cQprocess_recv_msg6FpnFqueue_pnPtcpmod_header_t_pnEmsgb_b_i_+0x228(
300031d8a70, 2a10007c798, 30006ea3000, 300042a7380, 300031bc280,
30003184830)
000002a10007bee1 __1cKtcpmodrsrv6FpnFqueue__i_+0xac(300031d8a70,
30006ea3000, 0
, 300031bc280, 2a10007c798, 0)
000002a10007bfb1
__1cKtcpmodrput6FpnFqueue_pnEmsgb__i_+0x17c(300031d8a70,
30006ea3000, 38, 0, 0, 300031bc280)
000002a10007c071 putnext+0x21c(0, 30006ea3000, 38, e22, b00,
30006eaf788)
000002a10007c121 tcp_rput_data+0x35c8(5b4, 0, 18, 30003f2aa10,
30003f2aa24,
30006a56440)
000002a10007c381 putnext+0x21c(0, 30006ea3000, 20, 0, 8, 8)
000002a10007c431 ip_rput_local+0xcec(0, 0, 30006ea3000, 14, 0,
30000062030)
000002a10007c5c1 ip_rput+0x8a8(30006ea3000, 30000062030, 3000217aa60, 0,
30003f2aa10, 25637f3043)
000002a10007c691 putnext+0x21c(0, 30006ea3000, 78000, 784aa000,
30003143e68,
78400)
000002a10007c741 hbsndr_rput+0x28(3000217b9c0, 30006ea3000, 20,
782e9dd0,
78141c00, 782e9dd0)
000002a10007c7f1 putnext+0x21c(0, 30006ea3000, fc00, 30006e056c0, 8, 10)
000002a10007c8a1 ce_intr+0x7348(104dbb8, 0, 30006ea3000, 7816f448,
3000217bc50,
80)
000002a10007d261 pci_intr_wrapper+0x7c(30000d4bee0, 745, 1400000,
2a10007dd20,
e8a0, 12e7f18)
000002a10007d311 intr_thread+0x130(1446328, ffffffffffffffff, bab1,
bab0,
1418ea0, 10)
000002a101e7c051 page_ctr_sub+0x2c(70000459218, 70000459218, 0,
30000b0b440,
78396000, 0)
000002a101e7c101 page_get_mnode_freelist+0x1e4(142c418, 11, 0, 468, 0,
fffe)
000002a101e7c201 page_get_freelist+0x280(0, 300074a6000, 2a101e7cc50,
300074a6000, 2000, b)
000002a101e7c2b1 page_create_va+0x394(115e0, 1, 75, 3, 4c, 142abd0)
000002a101e7c3a1 segkmem_page_create+0x54(3000740e000, 182000, 0, 0, 4,
1)
000002a101e7c4a1 segkmem_xalloc+0x8c(30000004000, 0, 182000, 0, 0,
104f8a0)
000002a101e7c561 segkmem_alloc+0xd0(30000004000, 182000, 0, 0, 0, 78000)
000002a101e7c621 vmem_xalloc+0x4cc(ffffffffffffe000, 300000062d0, 0, 0,
0,
30000006020)
000002a101e7c751 vmem_alloc+0x1f4(30000006000, 180018, 0, 2a101e7d848,
2,
2a101e7d5f0)
000002a101e7c811 kmem_alloc+0xf0(180018, 0, 2a101e7d840, 8, 300031b4ef8,
2a101e7d848)
000002a101e7c8c1
__1cKshared_new6FLnCosOmem_alloc_type_n0ANmem_zero_type__pv_+0x24(180008
, 0, 0,
0, 30006e7dd50, 180018)
000002a101e7c971
__1cLrxdoor_node2t5B6MrnHID_node_II_v_+0x54(30006e73f48,
2a101e7d448, 10000, 20000, 20000, 78000)
000002a101e7ca21
__1cRrxdoor_hash_tableScreate_rxdoor_node6MrnHID_node__pnLrxdoor_node__+
0x44(
30006e73f48, 2a101e7d448, 3, 1, 4, 1)
000002a101e7cad1
__1cOrxdoor_managerNlookup_rxdoor6MrnHID_node_IppnNrxdoor_bucket__pnGrxd
oor__+0x
108(30000064008, 2a101e7d448, 10, 2a101e7d510, 30000064060, 30000064868)
000002a101e7cb81
__1cWrxdoor_from_server_kitNlookup_rxdoor6MpbrnHID_node_rnNMarshalStream
_ppnNrxd
oor_bucket__pnGrxdoor__+0x68(783bc2a8, 2a101e7d51f, 30006e7dd50, 783bb,
2a101e7d510, 78000)
000002a101e7cc51
__1cKrxdoor_kitJunmarshal6MrnHID_node_rnNMarshalStream_pnFCORBALEnvironm
ent__pnF
Xdoor__+0x24(783bc2a8, 30006e7dd50, 30006e7dcf0, 2a101e7d848, 78361c4c,
783bc838
)
000002a101e7cd21
__1cNXdoor_managerMtranslate_in6FrnHID_node_IrnNMarshalStream_pnFCORBALE
nvironme
nt__v_+0xa8(30006e7dd50, 1, 30006e7dcf0, 2a101e7d848, 2, 2a101e7d5f0)
000002a101e7ce31
__1cGrxdoorVhandle_request_common6FrnHID_node_rnHservice_pnSrxdoor_invo_
header_C
_v_+0x2b4(30006e7dd50, 2a101e7d7f0, 2a101e7d840, 8, 300031b4ef8,
2a101e7d848)
000002a101e7cf41
__1cGrxdoorNhandle_twoway6FpnJrecstream__v_+0xf0(30006e7dc80,
78000, 30006e7dcb0, 0, 30006e7dd50, 2a101e7d7f0)
000002a101e7d091
__1cTthreadpool_worker_tVdeferred_task_handler6M_v_+0x114(
781a9118, 30001c0ac78, 30006e7dc80, 30000a1e8d8, 1, 30000a1e8d8)
000002a101e7d141
__1cKthreadpoolOthread_handler6FpnTthreadpool_worker_t__v_+0x1c
(30001c0ac78, 1, 300021ef7b8, 1, 783bc000, 783bc)
000002a101e7d1f1 cllwpwrapper+0x10c(2a101e7db80, 78366a84, 0, 0,
783d0000, 783d0
)
000002a101e7d2d1 thread_start+4(2a101e7db80, 18, 0, 0, 0, 0)
> $<msgbuf
0x30000f379e3: NOTICE: ce0: xcvr addr:0x01 - link up 100 Mbps full
duplex
0x30000f36d5f: NOTICE: bge0: link up 1000Mbps Full-Duplex (initialised)
0x30000f36c20: ssd6 at scsi_vhci0: name
g600c0ff0000a5209a3674c4603000000, bus
address g600c0ff0000a5209a3674c4603000000
0x30000f36ae3: ssd6 is /scsi_vhci/ssd at g600c0ff0000a5209a3674c4603000000
0x30001ca9be1: /scsi_vhci/ssd at g600c0ff0000a5209a3674c4603000000 (ssd6)
multipat
h status: optimal, path /pci at 1c,600000/SUNW,qlc at 2/fp at 0,0 (fp0) to
target
address
: 207000c0ff0a5121,5 is online. Load balancing: round-robin
0x300000729a0: ssd7 at scsi_vhci0: name
g600c0ff0000a5209a3674c4601000000, bus
address g600c0ff0000a5209a3674c4601000000
0x30000f369a3: ssd7 is /scsi_vhci/ssd at g600c0ff0000a5209a3674c4601000000
0x30001ca98e1: /scsi_vhci/ssd at g600c0ff0000a5209a3674c4601000000 (ssd7)
multipat
h status: optimal, path /pci at 1c,600000/SUNW,qlc at 2/fp at 0,0 (fp0) to
target
address
: 207000c0ff0a5121,4 is online. Load balancing: round-robin
0x30000361de0: ssd8 at scsi_vhci0: name
g600c0ff0000a5209a3674c4602000000, bus
address g600c0ff0000a5209a3674c4602000000
0x30000f36863: ssd8 is /scsi_vhci/ssd at g600c0ff0000a5209a3674c4602000000
0x30001ca95e1: /scsi_vhci/ssd at g600c0ff0000a5209a3674c4602000000 (ssd8)
multipat
h status: optimal, path /pci at 1c,600000/SUNW,qlc at 2/fp at 0,0 (fp0) to
target
address
: 207000c0ff0a5121,3 is online. Load balancing: round-robin
0x300003612a3: /scsi_vhci/ssd at g600c0ff0000a5209a3674c4603000000 (ssd6)
online
0x30000f36363: dump on /dev/md/dsk/d1 size 2000 MB
0x30000f36222: pseudo-device: devinfo0
0x30000f360e3: devinfo0 is /pseudo/devinfo at 0
0x30003047f20: /pci at 1d,700000/scsi at 4,1/st at 5,0 (st12):
<HP DDS-4 DAT (Sun)>
0x30003047de0: st12 at glm1: target 5 lun 0
0x30003047ca3: st12 is /pci at 1d,700000/scsi at 4,1/st at 5,0
0x30003047b60: /pci at 1d,700000/scsi at 4,1/st at 6,0 (st13):
<Vendor 'TANDBERG' Product 'TS400 '>
0x30003047a20: st13 at glm1: target 6 lun 0
0x300030478e3: st13 is /pci at 1d,700000/scsi at 4,1/st at 6,0
0x300030477a0: isadma0 at ebus0: offset 0,0
0x30003047660: ecpp0 at ebus0: offset 0,378
0x30003047523: ecpp0 is /pci at 1e,600000/isa at 7/dma at 0,0/parallel at 0,378
0x300030473e2: pseudo-device: fssnap0
0x300030472a3: fssnap0 is /pseudo/fssnap at 0
0x300030468a2: pseudo-device: ramdisk1024
0x30003046763: ramdisk1024 is /pseudo/ramdisk at 1024
0x30003046622: pseudo-device: winlock0
0x300030464e3: winlock0 is /pseudo/winlock at 0
0x300030463a2: pseudo-device: lockstat0
0x30003046263: lockstat0 is /pseudo/lockstat at 0
0x30003046122: pseudo-device: vol0
0x30002bf9ea3: vol0 is /pseudo/vol at 0
0x30002bf9d62: pseudo-device: llc10
0x30002bf9c23: llc10 is /pseudo/llc1 at 0
0x30002bf9ae2: pseudo-device: pm0
0x30002bf99a3: pm0 is /pseudo/pm at 0
0x30002bf9862: pseudo-device: tod0
0x30002bf9723: tod0 is /pseudo/tod at 0
0x30002bf95e2: pseudo-device: lofi0
0x30002bf94a3: lofi0 is /pseudo/lofi at 0
0x30002bf9222: pseudo-device: rsm0
0x30002bf90e3: rsm0 is /pseudo/rsm at 0
0x30002bf8fa2: pseudo-device: trapstat0
0x30002bf8e63: trapstat0 is /pseudo/trapstat at 0
0x30002bf8d22: pseudo-device: rmcadm0
0x30002bf8be3: rmcadm0 is /pseudo/rmcadm at 0
0x30002bf8aa3: tomtppm0 at root: SAFARI 0x1c 0x0
0x30002bf8963: tomtppm0 is /ppm at 1c,0
0x30002bf8823: tomtppm1 at root: SAFARI 0x1e 0x0
0x30002bf85a3: tomtppm1 is /ppm at 1e,0
0x30002bf8463: WARNING: adm1031_0:0 iblock_cookie error:-1
0x30003cbb622: pseudo-device: fcsm0
0x30003cbb3a3: fcsm0 is /pseudo/fcsm at 0
0x30003cbb262: pseudo-device: clprivnet0
0x30003cbb123: clprivnet0 is /pseudo/clprivnet at 0
0x30003cbad62: WARNING: Duplicate instance 0 of node "did" ignored.
0x30003cbaae6: NOTICE: CMM: Node dietrich (nodeid = 1) with votecount =
1 added
.
0x30003cba9a6: NOTICE: CMM: Node dalle (nodeid = 2) with votecount = 1
added.
0x30003cba866: NOTICE: CMM: Node bacall (nodeid = 3) with votecount = 0
added.
0x30003cba726: NOTICE: CMM: Quorum device 1 (/dev/did/rdsk/d1s2) added;
votecou
nt = 1, bitmask of nodes with configured paths = 0x3.
0x30003cba366: NOTICE: clcomm: Adapter ce2 constructed
0x300042a3f26: NOTICE: clcomm: Adapter ce1 constructed
0x300042a3de6: NOTICE: CMM: Node bacall: attempting to join cluster.
0x300042a37a6: NOTICE: CMM: Node dietrich (nodeid: 1, incarnation #:
1197559020
) has become reachable.
0x300042a3666: NOTICE: clcomm: Path bacall:ce1 - dietrich:bge1 online
0x300042a3526: NOTICE: clcomm: Path bacall:ce2 - dietrich:bge2 online
0x300042a33e6: NOTICE: CMM: Node dalle (nodeid: 2, incarnation #:
1197557971) h
as become reachable.
0x300042a32a6: NOTICE: clcomm: Path bacall:ce2 - dalle:bge2 online
0x300042a3166: NOTICE: CMM: Cluster has reached quorum.
0x300042a3026: NOTICE: CMM: Node dietrich (nodeid = 1) is up; new
incarnation n
umber = 1197559020.
0x300042a2ee6: NOTICE: CMM: Node dalle (nodeid = 2) is up; new
incarnation numb
er = 1197557971.
0x300042a2da6: NOTICE: CMM: Node bacall (nodeid = 3) is up; new
incarnation num
ber = 1197622286.
0x300042a2c66: NOTICE: CMM: Cluster members: dietrich dalle bacall.
0x300042a2b26: NOTICE: clcomm: Path bacall:ce1 - dalle:bge1 online
0x300042a29e8: Notifying cluster that this node is panicking
0x300042a28a0:
panic[cpu0]/thread=2a10007dd20:
0x300042a2760: BAD TRAP: type=31 rp=2a10007c350 addr=8 mmu_fsr=0
occurred in mo
dule "cl_orb" due to a NULL pointer dereference
0x300042a2620:
0x300042a24e0: sched:
0x300042a23a0: trap type = 0x31
0x300042a2260: addr=0x8
0x300042a2120: pid=0, pc=0x78355790, sp=0x2a10007bbf1,
tstate=0x4400001603, con
text=0x0
0x30006ef1ea0: g1-g7: 1, 78341af0, 78341000, 78341, 2, 0, 2a10007dd20
0x30006ef1d60:
0x30006ef1c23: 000002a10007c070 unix:die+80 (31, 2a10007c350, 8, 0, 20,
0)
0x30006ef1ae3: %l0-3: 0000000000000000 0000000001413878
000002a10007c350 0000
02a10007c240
%l4-7: 0000000000000031 0000000078141e18 0000000000000000
0000030006ea3000
0x30006ef19a3: 000002a10007c150 unix:trap+8e4 (2a10007c350, 0, 10000,
10200, 0,
0)
0x30006ef1863: %l0-3: 0000000000000001 0000000000000000
00000000014527d8 0000
000000000031
%l4-7: 0000000000000005 0000000000000001 0000000000000000
0000000000000000
0x30006ef1723: 000002a10007c2a0 unix:ktl0+48 (476148d3, 476148d3, 8,
783b9fe0,
783b9000, 78000)
0x30006ef15e3: %l0-3: 0000000000000004 0000000000001400
0000004400001603 0000
00000102db30
%l4-7: 0000000000000035 0000000000000010 0000000000000006
000002a10007c350
0x30006ef14a3: 000002a10007c3f0 78000 (78399cc8, 10, 30006e0cc48,
30006e0cd18,
1c00, 0)
0x30006ef1363: %l0-3: 0000000000000000 0000000000000000
00000000783d0010 0000
0000783d0000
%l4-7: 00000000000783d0 0000000000078000 0000000000000000
0000000000001fff
0x30006ef1223: 000002a10007c4a0
cl_orb:__1cGrxdoorMhandle_reply6FpnJrecstream__
v_+e4 (30006e0cc48, 78000, 785df10c, 781a9118, 30006e0cc78, 30006e0cd18)
0x30006ef10e3: %l0-3: 0000000000078000 0000000000000200
000000007836054c 0000
000000000014
%l4-7: 0000000000000014 000000000001c000 0000000078396000
0000000000078396
0x30001ca92e3: 000002a10007c560
cl_dlpitrans:__1cMtcp_endpointUtcp_receive_call
back6MpnPtcpmod_header_t_pnEmsgb_nCosOmem_alloc_type__i_+e8 (783ba,
2a10007c798,
78000, 1, 1, 30006e0cc48)
0x30000a8b263: %l0-3: 00000300042a7380 0000000000000000
0000030006ea3000 0000
030001098288
%l4-7: 0000030003f2aa10 0000030000062030 0000000000000001
00000000783ba000
0x30001ca8fe3: 000002a10007c620
cltcpint:__1cQprocess_recv_msg6FpnFqueue_pnPtcp
mod_header_t_pnEmsgb_b_i_+228 (300031d8a70, 2a10007c798, 30006ea3000,
300042a738
0, 300031bc280, 30003184830)
0x30000a8ad63: %l0-3: 0000030002a4d5c8 000003000302ec00
0000000000000000 0000
03000302ebb8
%l4-7: 0000000000000048 00000000f0100000 fffffffff0100000
0000030006ea3000
0x30006ef0fa3: 000002a10007c6e0
cltcpint:__1cKtcpmodrsrv6FpnFqueue__i_+ac (3000
31d8a70, 30006ea3000, 0, 300031bc280, 2a10007c798, 0)
0x30006ef0e63: %l0-3: 0000000000000002 0000030000070288
0000000000000000 0000
04000ef81a18
%l4-7: 0000030000363ea8 0008000000000000 000002a101e7cc50
000000000000000b
0x30006ef0d23: 000002a10007c7b0
cltcpint:__1cKtcpmodrput6FpnFqueue_pnEmsgb__i_+
17c (300031d8a70, 30006ea3000, 38, 0, 0, 300031bc280)
0x30006ef0be3: %l0-3: 00000000013ffd2c 0000000000000038
0000030003f2aa70 0000
030003f2aa38
%l4-7: 0000000000000000 0000000078141e18 000003000217bc50
0000030006ea3000
0x30006ef0aa3: 000002a10007c870 unix:putnext+21c (0, 30006ea3000, 38,
e22, b00,
30006eaf788)
0x30006ef0963: %l0-3: 00000000785e6590 00000300031d8c50
00000300031d8a70 0000
000000000000
%l4-7: 0000000000000000 00000000785d6770 0000030006ead4e0
0000030006ea3000
0x30006ef0823: 000002a10007c920 tcp:tcp_rput_data+35c8 (5b4, 0, 18,
30003f2aa10
, 30003f2aa24, 30006a56440)
0x30006ef06e3: %l0-3: 0000030006eaf788 0000000000002018
0000030006ea3000 0000
000000000038
%l4-7: 00000000b2c64423 0000000002740b0e 0000000000000000
0000030006ead4e0
0x30006ef05a3: 000002a10007cb80 unix:putnext+21c (0, 30006ea3000, 20,
0, 8, 8)
0x30006ef0463: %l0-3: 0000000001349d40 0000030006ead6c0
0000030006ead4e0 0000
000000000000
%l4-7: 0000000000000000 0000000078018390 0000030006ead770
0000030006ea3000
0x30006ef0323: 000002a10007cc30 ip:ip_rput_local+cec (0, 0,
30006ea3000, 14, 0,
30000062030)
0x30006ef01e3: %l0-3: 0000030006e0c970 0000000000000000
0000030006e0c9d8 0000
030006ead770
%l4-7: 00000000014c8400 00000300003ec6b8 0000030003f2aa10
0000030003f2aa10
0x30006ef00a3: 000002a10007cdc0 ip:ip_rput+8a8 (30006ea3000,
30000062030, 30002
17aa60, 0, 30003f2aa10, 25637f3043)
0x30006eedee3: %l0-3: 0000000000000000 0000030001098288
0000030006ea3000 0000
030001098288
%l4-7: 0000030003f2aa10 0000030000062030 000003000217aa60
0000000000000000
0x30006eedda3: 000002a10007ce90 unix:putnext+21c (0, 30006ea3000,
78000, 784aa0
00, 30003143e68, 78400)
0x30006eedc63: %l0-3: 0000000001212e70 0000030000a69ef8
000003000217aa60 0000
000000000000
%l4-7: 0000000000000000 00000000014c8af8 000003000217b9c0
0000030006ea3000
0x30006eedb23: 000002a10007cf40 clhbsndr:hbsndr_rput+28 (3000217b9c0,
30006ea30
00, 20, 782e9dd0, 78141c00, 782e9dd0)
0x30006eed9e3: %l0-3: 0000000000000002 0000030000070288
0000000000000000 0000
04000ef81a18
%l4-7: 0000030000363ea8 0008000000000000 000002a101e7cc50
000000000000000b
0x30006eed8a3: 000002a10007cff0 unix:putnext+21c (0, 30006ea3000, fc00,
30006e0
56c0, 8, 10)
0x30006eed763: %l0-3: 00000000013ffd2c 000003000217bba0
000003000217b9c0 0000
000000000000
%l4-7: 0000000000000000 0000000078141e18 000003000217bc50
0000030006ea3000
0x30006eed623: 000002a10007d0a0 ce:ce_intr+7348 (104dbb8, 0,
30006ea3000, 7816f
448, 3000217bc50, 80)
0x30006eed4e3: %l0-3: 0000030003f2aa10 0000000000000000
0000000000000001 0000
03000023d408
%l4-7: 0000030003f2aa02 0000000000000000 000000000108d2d4
0000000000000001
0x30006eed3a3: 000002a10007da60 pcisch:pci_intr_wrapper+7c
(30000d4bee0, 745, 1
400000, 2a10007dd20, e8a0, 12e7f18)
0x30006eed263: %l0-3: 000000007823aeec 00000300009c4810
0000000000000000 0000
000000000000
%l4-7: 0000030000363ea8 000002a101f44d58 0000000078028400
0000000078028400
0x30006eed120:
0x30006eecfe3: syncing file systems...
0x30006eecea3: 2
0x30006eecd63: 1
0x30006eecc23: done
0x30006eecae3: dumping to /dev/md/dsk/d1, offset 419495936, content:
kernel
I do hope this all means something to someone! We are getting very close
to pulling the plug on the whole project!
Thanks in advance.
David
-----Original Message-----
From: [email protected]
[mailto:[email protected]] On Behalf Of
Tirthankar
Sent: 13 December 2007 18:12
To: [email protected]
Subject: Re: [ha-clusters-discuss] Establishing quorums?
Seems like you are hitting a null pointer dereference in the orb
invocation code path
I would recommend you do a "scconf -cq reset" as Zoram mentioned.
i.e.
1. Boot up the 2 nodes that do not give a problem.
2. Run the command "scconf -cq reset".
3. Then boot up the 3rd node that is giving a problem.
See if the node joins without panicking.
If it does not, then please send the stack trace ($C) output and the
$<msgbuf output.
--
This message posted from opensolaris.org
_______________________________________________
ha-clusters-discuss mailing list
[email protected]
http://mail.opensolaris.org/mailman/listinfo/ha-clusters-discuss
CONFIDENTIALITY NOTICE The information contained in this
e-mail is intended only for the confidential use of the above
named recipient. If you are not the intended recipient or person
responsible for delivering it to the intended recipient, you have
received this communication in error and must not distribute or
copy it. Please accept the sender's apologies, notify the sender
immediately by return e-mail and delete this communication.
Thank you.