Gleb Natapov wrote:
On Sun, Jul 08, 2007 at 12:41:58PM -0400, Tim Prins wrote:
On Sunday 08 July 2007 08:32:27 am Gleb Natapov wrote:
On Fri, Jul 06, 2007 at 06:36:13PM -0400, Tim Prins wrote:
While looking into another problem I ran into an issue which made ob1
segfault on me. Using gm, and running the test test_dan1 in the onesided
test suite, if I limit the gm freelist by too much, I get a segfault.
That is,

mpirun -np 2 -mca btl gm,self -mca btl_gm_free_list_max 1024 test_dan1

works fine, but

mpirun -np 2 -mca btl gm,self -mca btl_gm_free_list_max 512 test_dan1
I cannot, unfortunately, reproduce this with openib BTL.

segfaults. Here is the relevant output from gdb:

Program received signal SIGSEGV, Segmentation fault.
[Switching to Thread 1077541088 (LWP 15600)]
0x404d81c1 in mca_pml_ob1_send_fin (proc=0x9bd9490, bml_btl=0xd323580,
    hdr_des=0x9e54e78, order=255 '\377', status=1) at pml_ob1.c:267
267         MCA_PML_OB1_DES_ALLOC(bml_btl, fin, order,
sizeof(mca_pml_ob1_fin_hdr_t));
can you send me what's inside bml_btl?
It turns out that the order of arguments to mca_pml_ob1_send_fin was wrong. I fixed this in r15304. But now we hang instead of segfault, and have both processes just looping through opal_progress. I really don't know what to look for. Any hints?

Can you look in gdb at mca_pml_ob1.rdma_pending?
Yeah, rank 0 has nothing on the list, and rank 1 has 48 things.

Here is the first item on the list:
$7 = {
  super = {
    super = {
      super = {
        obj_magic_id = 16046253926196952813,
        obj_class = 0x404f5980,
        obj_reference_count = 1,
        cls_init_file_name = 0x404f30f9 "pml_ob1_sendreq.c",
        cls_init_lineno = 1134
      },
      opal_list_next = 0x8f5d680,
      opal_list_prev = 0x404f57c8,
      opal_list_item_refcount = 1,
      opal_list_item_belong_to = 0x404f57b0
    },
    registration = 0x0,
    ptr = 0x0
  },
  rdma_bml = 0x8729098,
  rdma_hdr = {
    hdr_common = {
      hdr_type = 8 '\b',
      hdr_flags = 4 '\004'
    },
    hdr_match = {
      hdr_common = {
        hdr_type = 8 '\b',
        hdr_flags = 4 '\004'
      },
      hdr_ctx = 5,
      hdr_src = 1,
      hdr_tag = 142418176,
      hdr_seq = 0,
      hdr_padding = "\000"
    },
    hdr_rndv = {
      hdr_match = {
        hdr_common = {
          hdr_type = 8 '\b',
          hdr_flags = 4 '\004'
        },
        hdr_ctx = 5,
        hdr_src = 1,
        hdr_tag = 142418176,
        hdr_seq = 0,
        hdr_padding = "\000"
      },
      hdr_msg_length = 236982400,
      hdr_src_req = {
        lval = 0,
        ival = 0,
        pval = 0x0,
        sval = {
          uval = 0,
          lval = 0
        }
      }
    },
    hdr_rget = {
      hdr_rndv = {
        hdr_match = {
          hdr_common = {
            hdr_type = 8 '\b',
            hdr_flags = 4 '\004'
          },
          hdr_ctx = 5,
          hdr_src = 1,
          hdr_tag = 142418176,
          hdr_seq = 0,
          hdr_padding = "\000"
        },
        hdr_msg_length = 236982400,
        hdr_src_req = {
          lval = 0,
          ival = 0,
          pval = 0x0,
          sval = {
            uval = 0,
            lval = 0
          }
        }
      },
      hdr_seg_cnt = 1106481152,
      hdr_padding = "\000\000\000",
      hdr_des = {
        lval = 32768,
        ival = 32768,
        pval = 0x8000,
        sval = {
          uval = 32768,
          lval = 0
        }
      },
      hdr_segs = {{
          seg_addr = {
            lval = 0,
            ival = 0,
            pval = 0x0,
            sval = {
              uval = 0,
              lval = 0
            }
          },
          seg_len = 0,
          seg_padding = "\000\000\000",
          seg_key = {
            key32 = {0, 0},
            key64 = 0,
            key8 = "\000\000\000\000\000\000\000"
          }
        }}
    },
    hdr_frag = {
      hdr_common = {
        hdr_type = 8 '\b',
        hdr_flags = 4 '\004'
      },
      hdr_padding = "\005\000\001\000\000",
      hdr_frag_offset = 142418176,
      hdr_src_req = {
        lval = 236982400,
        ival = 236982400,
        pval = 0xe201080,
        sval = {
          uval = 236982400,
          lval = 0
        }
      },
      hdr_dst_req = {
        lval = 0,
        ival = 0,
        pval = 0x0,
        sval = {
          uval = 0,
          lval = 0
        }
      }
    },
    hdr_ack = {
      hdr_common = {
        hdr_type = 8 '\b',
        hdr_flags = 4 '\004'
      },
      hdr_padding = "\005\000\001\000\000",
      hdr_src_req = {
        lval = 142418176,
        ival = 142418176,
        pval = 0x87d2100,
        sval = {
          uval = 142418176,
          lval = 0
        }
      },
      hdr_dst_req = {
        lval = 236982400,
        ival = 236982400,
        pval = 0xe201080,
        sval = {
          uval = 236982400,
          lval = 0
        }
      },
      hdr_send_offset = 0
    },
    hdr_rdma = {
      hdr_common = {
        hdr_type = 8 '\b',
        hdr_flags = 4 '\004'
      },
      hdr_padding = "\005",
      hdr_seg_cnt = 1,
      hdr_req = {
        lval = 142418176,
        ival = 142418176,
        pval = 0x87d2100,
        sval = {
          uval = 142418176,
          lval = 0
        }
      },
      hdr_des = {
        lval = 236982400,
        ival = 236982400,
        pval = 0xe201080,
        sval = {
          uval = 236982400,
          lval = 0
        }
      },
      hdr_rdma_offset = 0,
      hdr_segs = {{
          seg_addr = {
            lval = 1106481152,
            ival = 1106481152,
            pval = 0x41f39000,
            sval = {
              uval = 1106481152,
              lval = 0
            }
          },
          seg_len = 32768,
          seg_padding = "\000\000\000",
          seg_key = {
            key32 = {0, 0},
            key64 = 0,
            key8 = "\000\000\000\000\000\000\000"
          }
        }}
    },
    hdr_fin = {
      hdr_common = {
        hdr_type = 8 '\b',
        hdr_flags = 4 '\004'
      },
      hdr_padding = "\005\000\001\000\000",
      hdr_des = {
        lval = 142418176,
        ival = 142418176,
        pval = 0x87d2100,
        sval = {
          uval = 142418176,
          lval = 0
        }
      },
      hdr_fail = 236982400
    }
  },
  rdma_state = MCA_PML_OB1_RDMA_PUT,
  rdma_length = 32768,
  rdma_segs = {{
      seg_addr = {
        lval = 1106481152,
        ival = 1106481152,
        pval = 0x41f39000,
        sval = {
          uval = 1106481152,
          lval = 0
        }
      },
      seg_len = 32768,
      seg_padding = "\000\000\000",
      seg_key = {
        key32 = {0, 0},
        key64 = 0,
        key8 = "\000\000\000\000\000\000\000"
      }
    }, {
      seg_addr = {
        lval = 0,
        ival = 0,
        pval = 0x0,
        sval = {
          uval = 0,
          lval = 0
        }
      },
      seg_len = 0,
      seg_padding = "\000\000\000",
      seg_key = {
        key32 = {0, 0},
        key64 = 0,
        key8 = "\000\000\000\000\000\000\000"
      }
    } <repeats 15 times>},
  rdma_req = 0x87d2100,
  rdma_ep = 0x8516f08,
  convertor = {
    super = {
      obj_magic_id = 0,
      obj_class = 0x0,
      obj_reference_count = 0,
      cls_init_file_name = 0x0,
      cls_init_lineno = 0
    },
    remoteArch = 4291428864,
    flags = 1855942,
    local_size = 32768,
    remote_size = 32768,
    pDesc = 0x8054620,
    use_desc = 0x80546b4,
    count = 32768,
    pBaseBuf = 0x41f39000 "",
    pStack = 0x8f5c3ec,
    stack_size = 5,
    fAdvance = 0,
    master = 0x84ad398,
    stack_pos = 4294967295,
    bConverted = 0,
    partial_length = 0,
    checksum = 0,
    csum_ui1 = 0,
    csum_ui2 = 0,
    static_stack = {{
        index = 0,
        type = 0,
        count = 0,
        disp = 0
      }, {
        index = 0,
        type = 0,
        count = 0,
        disp = 0
      }, {
        index = 0,
        type = 0,
        count = 0,
        disp = 0
      }, {
        index = 0,
        type = 0,
        count = 0,
        disp = 0
      }, {
        index = 0,
        type = 0,
        count = 0,
        disp = 0
      }}
  },
  reg = 0x8515e80,
  retries = 1
}


Thanks,

Tim

Reply via email to