Gleb Natapov wrote:
On Sun, Jul 08, 2007 at 12:41:58PM -0400, Tim Prins wrote:
On Sunday 08 July 2007 08:32:27 am Gleb Natapov wrote:
On Fri, Jul 06, 2007 at 06:36:13PM -0400, Tim Prins wrote:
While looking into another problem I ran into an issue which made ob1
segfault on me. Using gm, and running the test test_dan1 in the onesided
test suite, if I limit the gm freelist by too much, I get a segfault.
That is,
mpirun -np 2 -mca btl gm,self -mca btl_gm_free_list_max 1024 test_dan1
works fine, but
mpirun -np 2 -mca btl gm,self -mca btl_gm_free_list_max 512 test_dan1
I cannot, unfortunately, reproduce this with openib BTL.
segfaults. Here is the relevant output from gdb:
Program received signal SIGSEGV, Segmentation fault.
[Switching to Thread 1077541088 (LWP 15600)]
0x404d81c1 in mca_pml_ob1_send_fin (proc=0x9bd9490, bml_btl=0xd323580,
hdr_des=0x9e54e78, order=255 '\377', status=1) at pml_ob1.c:267
267 MCA_PML_OB1_DES_ALLOC(bml_btl, fin, order,
sizeof(mca_pml_ob1_fin_hdr_t));
can you send me what's inside bml_btl?
It turns out that the order of arguments to mca_pml_ob1_send_fin was wrong. I
fixed this in r15304. But now we hang instead of segfault, and have both
processes just looping through opal_progress. I really don't know what to look
for. Any hints?
Can you look in gdb at mca_pml_ob1.rdma_pending?
Yeah, rank 0 has nothing on the list, and rank 1 has 48 things.
Here is the first item on the list:
$7 = {
super = {
super = {
super = {
obj_magic_id = 16046253926196952813,
obj_class = 0x404f5980,
obj_reference_count = 1,
cls_init_file_name = 0x404f30f9 "pml_ob1_sendreq.c",
cls_init_lineno = 1134
},
opal_list_next = 0x8f5d680,
opal_list_prev = 0x404f57c8,
opal_list_item_refcount = 1,
opal_list_item_belong_to = 0x404f57b0
},
registration = 0x0,
ptr = 0x0
},
rdma_bml = 0x8729098,
rdma_hdr = {
hdr_common = {
hdr_type = 8 '\b',
hdr_flags = 4 '\004'
},
hdr_match = {
hdr_common = {
hdr_type = 8 '\b',
hdr_flags = 4 '\004'
},
hdr_ctx = 5,
hdr_src = 1,
hdr_tag = 142418176,
hdr_seq = 0,
hdr_padding = "\000"
},
hdr_rndv = {
hdr_match = {
hdr_common = {
hdr_type = 8 '\b',
hdr_flags = 4 '\004'
},
hdr_ctx = 5,
hdr_src = 1,
hdr_tag = 142418176,
hdr_seq = 0,
hdr_padding = "\000"
},
hdr_msg_length = 236982400,
hdr_src_req = {
lval = 0,
ival = 0,
pval = 0x0,
sval = {
uval = 0,
lval = 0
}
}
},
hdr_rget = {
hdr_rndv = {
hdr_match = {
hdr_common = {
hdr_type = 8 '\b',
hdr_flags = 4 '\004'
},
hdr_ctx = 5,
hdr_src = 1,
hdr_tag = 142418176,
hdr_seq = 0,
hdr_padding = "\000"
},
hdr_msg_length = 236982400,
hdr_src_req = {
lval = 0,
ival = 0,
pval = 0x0,
sval = {
uval = 0,
lval = 0
}
}
},
hdr_seg_cnt = 1106481152,
hdr_padding = "\000\000\000",
hdr_des = {
lval = 32768,
ival = 32768,
pval = 0x8000,
sval = {
uval = 32768,
lval = 0
}
},
hdr_segs = {{
seg_addr = {
lval = 0,
ival = 0,
pval = 0x0,
sval = {
uval = 0,
lval = 0
}
},
seg_len = 0,
seg_padding = "\000\000\000",
seg_key = {
key32 = {0, 0},
key64 = 0,
key8 = "\000\000\000\000\000\000\000"
}
}}
},
hdr_frag = {
hdr_common = {
hdr_type = 8 '\b',
hdr_flags = 4 '\004'
},
hdr_padding = "\005\000\001\000\000",
hdr_frag_offset = 142418176,
hdr_src_req = {
lval = 236982400,
ival = 236982400,
pval = 0xe201080,
sval = {
uval = 236982400,
lval = 0
}
},
hdr_dst_req = {
lval = 0,
ival = 0,
pval = 0x0,
sval = {
uval = 0,
lval = 0
}
}
},
hdr_ack = {
hdr_common = {
hdr_type = 8 '\b',
hdr_flags = 4 '\004'
},
hdr_padding = "\005\000\001\000\000",
hdr_src_req = {
lval = 142418176,
ival = 142418176,
pval = 0x87d2100,
sval = {
uval = 142418176,
lval = 0
}
},
hdr_dst_req = {
lval = 236982400,
ival = 236982400,
pval = 0xe201080,
sval = {
uval = 236982400,
lval = 0
}
},
hdr_send_offset = 0
},
hdr_rdma = {
hdr_common = {
hdr_type = 8 '\b',
hdr_flags = 4 '\004'
},
hdr_padding = "\005",
hdr_seg_cnt = 1,
hdr_req = {
lval = 142418176,
ival = 142418176,
pval = 0x87d2100,
sval = {
uval = 142418176,
lval = 0
}
},
hdr_des = {
lval = 236982400,
ival = 236982400,
pval = 0xe201080,
sval = {
uval = 236982400,
lval = 0
}
},
hdr_rdma_offset = 0,
hdr_segs = {{
seg_addr = {
lval = 1106481152,
ival = 1106481152,
pval = 0x41f39000,
sval = {
uval = 1106481152,
lval = 0
}
},
seg_len = 32768,
seg_padding = "\000\000\000",
seg_key = {
key32 = {0, 0},
key64 = 0,
key8 = "\000\000\000\000\000\000\000"
}
}}
},
hdr_fin = {
hdr_common = {
hdr_type = 8 '\b',
hdr_flags = 4 '\004'
},
hdr_padding = "\005\000\001\000\000",
hdr_des = {
lval = 142418176,
ival = 142418176,
pval = 0x87d2100,
sval = {
uval = 142418176,
lval = 0
}
},
hdr_fail = 236982400
}
},
rdma_state = MCA_PML_OB1_RDMA_PUT,
rdma_length = 32768,
rdma_segs = {{
seg_addr = {
lval = 1106481152,
ival = 1106481152,
pval = 0x41f39000,
sval = {
uval = 1106481152,
lval = 0
}
},
seg_len = 32768,
seg_padding = "\000\000\000",
seg_key = {
key32 = {0, 0},
key64 = 0,
key8 = "\000\000\000\000\000\000\000"
}
}, {
seg_addr = {
lval = 0,
ival = 0,
pval = 0x0,
sval = {
uval = 0,
lval = 0
}
},
seg_len = 0,
seg_padding = "\000\000\000",
seg_key = {
key32 = {0, 0},
key64 = 0,
key8 = "\000\000\000\000\000\000\000"
}
} <repeats 15 times>},
rdma_req = 0x87d2100,
rdma_ep = 0x8516f08,
convertor = {
super = {
obj_magic_id = 0,
obj_class = 0x0,
obj_reference_count = 0,
cls_init_file_name = 0x0,
cls_init_lineno = 0
},
remoteArch = 4291428864,
flags = 1855942,
local_size = 32768,
remote_size = 32768,
pDesc = 0x8054620,
use_desc = 0x80546b4,
count = 32768,
pBaseBuf = 0x41f39000 "",
pStack = 0x8f5c3ec,
stack_size = 5,
fAdvance = 0,
master = 0x84ad398,
stack_pos = 4294967295,
bConverted = 0,
partial_length = 0,
checksum = 0,
csum_ui1 = 0,
csum_ui2 = 0,
static_stack = {{
index = 0,
type = 0,
count = 0,
disp = 0
}, {
index = 0,
type = 0,
count = 0,
disp = 0
}, {
index = 0,
type = 0,
count = 0,
disp = 0
}, {
index = 0,
type = 0,
count = 0,
disp = 0
}, {
index = 0,
type = 0,
count = 0,
disp = 0
}}
},
reg = 0x8515e80,
retries = 1
}
Thanks,
Tim