On Fri, 16 Dec 2011 05:03:42 -0800 Hal Rosenstock <[email protected]> wrote:
> On 12/15/2011 7:33 PM, Ira Weiny wrote: > > On Thu, 15 Dec 2011 14:20:28 -0800 > > Hal Rosenstock <[email protected]> wrote: > > > >> On 12/15/2011 12:49 PM, Ira Weiny wrote: > >>> On Thu, 15 Dec 2011 06:15:17 -0800 > >>> Hal Rosenstock <[email protected]> wrote: > >>> > >>>> On 12/14/2011 10:18 PM, Ira Weiny wrote: > >>>>> [snip] > >> > >> Since the callback is made for both DR and LR SMPs, the logging at the > >> vendor layer isn't needed for those. It's still needed for GMPs though > >> (like PerfMgr). > > > > The PerfMgr prints the address info in it's error call back. > > I see other info there but not LID and TID being logged there. Ah, forgive me; that is the NodeGuid, not the PortGuid. I was thinking PortGuid when I read the code, which would have been just as good as LID (perhaps better?). I will fix the typo in V3 and update that message. Ira > > -- Hal > > > The SA however does not. :-( > > > > So I will add it there. > > > >> > >>> Why would you care about the other classes which > >>> timeout? Wouldn't they have the same issue of a response which is "fake"? > >> > >> No; the issue is only with DR path. Isn't LID fine ? > > > > Yep it would be. > > > >> > >>> If we want to remove the logging at this layer I think we should consider > >>> this. 
> >>> > >>> diff --git a/libvendor/osm_vendor_ibumad.c > >>> b/libvendor/osm_vendor_ibumad.c > >>> index b2872c8..b352cef 100644 > >>> --- a/libvendor/osm_vendor_ibumad.c > >>> +++ b/libvendor/osm_vendor_ibumad.c > >>> @@ -327,29 +327,6 @@ static void *umad_receiver(void *p_ptr) > >>> /* if status != 0 then we are handling recv timeout on > >>> send */ > >>> if (umad_status(p_madw->vend_wrap.umad)) { > >>> > >>> - if (mad->mgmt_class != IB_MCLASS_SUBN_DIR) { > >>> - /* LID routed */ > >>> - OSM_LOG(p_vend->p_log, OSM_LOG_ERROR, > >>> "ERR 5410: " > >>> - "Send completed with error -- > >>> dropping\n" > >>> - "\t\t\tClass 0x%x, Method 0x%X, > >>> Attr 0x%X, " > >>> - "TID 0x%" PRIx64 ", LID %u\n", > >>> - mad->mgmt_class, mad->method, > >>> - cl_ntoh16(mad->attr_id), > >>> - cl_ntoh64(mad->trans_id), > >>> - cl_ntoh16(ib_mad_addr->lid)); > >>> - } else { > >>> - ib_smp_t *smp; > >>> - > >>> - /* Direct routed SMP */ > >>> - smp = (ib_smp_t *) mad; > >>> - OSM_LOG(p_vend->p_log, OSM_LOG_ERROR, > >>> "ERR 5411: " > >>> - "DR SMP Send completed with error > >>> -- dropping\n" > >>> - "\t\t\tMethod 0x%X, Attr 0x%X, > >>> TID 0x%" PRIx64 > >>> - ", Hop Ptr: 0x%X\n", > >>> - mad->method, > >>> cl_ntoh16(mad->attr_id), > >>> - cl_ntoh64(mad->trans_id), > >>> smp->hop_ptr); > >>> - } > >>> - > >>> if (!(p_req_madw = get_madw(p_vend, > >>> &mad->trans_id))) { > >>> OSM_LOG(p_vend->p_log, OSM_LOG_ERROR, > >>> "ERR 5412: " > >>> > >>> > >>> But I felt that was a bit draconian, and it was not my initial intent. > >> > >> Yes that's overkill. 
I think it is more like the below: > >> > >> /* if status != 0 and GMP then we are handling recv > >> timeout on send */ > >> if (umad_status(p_madw->vend_wrap.umad)) { > >> > >> if ((mad->mgmt_class != IB_MCLASS_SUBN_DIR) && > >> (mad->mgmt_class != IB_MCLASS_SUBN_LID)) { > >> /* LID routed */ > >> OSM_LOG(p_vend->p_log, OSM_LOG_ERROR, > >> "ERR 5410: " > >> "Send completed with error -- > >> dropping\n" > >> "\t\t\tClass 0x%x, Method 0x%X, > >> Attr 0x%X, " > >> "TID 0x%" PRIx64 ", LID %u\n", > >> mad->mgmt_class, mad->method, > >> cl_ntoh16(mad->attr_id), > >> cl_ntoh64(mad->trans_id), > >> cl_ntoh16(ib_mad_addr->lid)); > >> } > >> > >> removing the else clause totally. > > > > New patch which logs this in the SA so we can make the above: > > > > OSM_LOG(p_vend->p_log, OSM_LOG_VERBOSE, "ERR 5410: " > > "Recieve Timeout on Send -- dropping " > > "TID 0x%" PRIx64 "\n", > > cl_ntoh64(mad->trans_id)); > > > > Just for reference of where the call back is coming from if needed, > > Ira > > > >> > >> -- Hal > >> > >>> Ira > >>> > >>>> > >>>> -- Hal > >>>> > >>>>> } > >>>>> > >>>>> if (!(p_req_madw = get_madw(p_vend, > >>>>> &mad->trans_id))) { > >>>>> diff --git a/opensm/osm_helper.c b/opensm/osm_helper.c > >>>>> index f9f3d9d..b968679 100644 > >>>>> --- a/opensm/osm_helper.c > >>>>> +++ b/opensm/osm_helper.c > >>>>> @@ -2059,8 +2059,9 @@ void osm_dump_smp_dr_path(IN osm_log_t * p_log, > >>>>> IN const ib_smp_t * p_smp, > >>>>> char buf[BUF_SIZE]; > >>>>> unsigned n; > >>>>> > >>>>> - n = sprintf(buf, "Received SMP on a %u hop path: " > >>>>> - "Initial path = ", p_smp->hop_count); > >>>>> + n = sprintf(buf, " DR SMP (TID 0x%" PRIx64 ") on a %u > >>>>> hop path: " > >>>>> + "Initial path = ", > >>>>> + cl_ntoh64(p_smp->trans_id), > >>>>> p_smp->hop_count); > >>>>> n += sprint_uint8_arr(buf + n, sizeof(buf) - n, > >>>>> p_smp->initial_path, > >>>>> p_smp->hop_count + 1); > >>>>> diff --git a/opensm/osm_sm_mad_ctrl.c b/opensm/osm_sm_mad_ctrl.c > >>>>> index 
ee92c66..a3b444a 100644 > >>>>> --- a/opensm/osm_sm_mad_ctrl.c > >>>>> +++ b/opensm/osm_sm_mad_ctrl.c > >>>>> @@ -704,6 +704,7 @@ Exit: > >>>>> */ > >>>>> static void (IN void *context, IN osm_madw_t * p_madw) > >>>>> { > >>>>> + char lidstr[8]; > >>>>> osm_sm_mad_ctrl_t *p_ctrl = context; > >>>>> ib_api_status_t status; > >>>>> ib_smp_t *p_smp; > >>>>> @@ -713,13 +714,24 @@ static void sm_mad_ctrl_send_err_cb(IN void > >>>>> *context, IN osm_madw_t * p_madw) > >>>>> CL_ASSERT(p_madw); > >>>>> > >>>>> p_smp = osm_madw_get_smp_ptr(p_madw); > >>>>> + > >>>>> + if (p_smp->mgmt_class == IB_MCLASS_SUBN_DIR) > >>>>> + lidstr[0] = '\0'; > >>>>> + else > >>>>> + snprintf(lidstr, 8, " DLID %u", > >>>>> + cl_ntoh16(p_madw->mad_addr.dest_lid)); > >>>>> + > >>>>> OSM_LOG(p_ctrl->p_log, OSM_LOG_ERROR, "ERR 3113: " > >>>>> "MAD completed in error (%s): " > >>>>> - "%s(%s), attr_mod 0x%x, TID 0x%" PRIx64 "\n", > >>>>> + "%s(%s), attr_mod 0x%x, TID 0x%" PRIx64 " %s\n", > >>>>> ib_get_err_str(p_madw->status), > >>>>> ib_get_sm_method_str(p_smp->method), > >>>>> ib_get_sm_attr_str(p_smp->attr_id), > >>>>> cl_ntoh32(p_smp->attr_mod), > >>>>> - cl_ntoh64(p_smp->trans_id)); > >>>>> + cl_ntoh64(p_smp->trans_id), > >>>>> + lidstr); > >>>>> + > >>>>> + if (p_smp->mgmt_class == IB_MCLASS_SUBN_DIR) > >>>>> + osm_dump_smp_dr_path(p_ctrl->p_log, p_smp, > >>>>> OSM_LOG_ERROR); > >>>>> > >>>>> /* > >>>>> If this was a SubnSet MAD, then this error might indicate a > >>>>> problem > >>>> > >>> > >>> > >> > > > > > -- Ira Weiny Math Programmer/Computer Scientist Lawrence Livermore National Lab 925-423-8008 [email protected] -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to [email protected] More majordomo info at http://vger.kernel.org/majordomo-info.html
