On Fri, 16 Dec 2011 05:03:42 -0800
Hal Rosenstock <[email protected]> wrote:

> On 12/15/2011 7:33 PM, Ira Weiny wrote:
> > On Thu, 15 Dec 2011 14:20:28 -0800
> > Hal Rosenstock <[email protected]> wrote:
> > 
> >> On 12/15/2011 12:49 PM, Ira Weiny wrote:
> >>> On Thu, 15 Dec 2011 06:15:17 -0800
> >>> Hal Rosenstock <[email protected]> wrote:
> >>>
> >>>> On 12/14/2011 10:18 PM, Ira Weiny wrote:
> >>>>>

[snip]

> >>
> >> Since the callback is made for both DR and LR SMPs, the logging at the
> >> vendor layer isn't needed for those. It's still needed for GMPs though
> >> (like PerfMgr).
> > 
> > The PerfMgr prints the address info in it's error call back.  
> 
> I see other info there but not LID and TID being logged there.

Ah forgive me that is the NodeGuid not the PortGuid.  I was thinking PortGuid 
when I read the code which would have been just as good as LID (perhaps better?)

I will fix the typo in V3 and update that message.

Ira

> 
> -- Hal
> 
> > The SA however does not.  :-(
> > 
> > So I will add it there.
> > 
> >>
> >>> Why would you care about the other classes which
> >>> timeout?  Wouldn't they have the same issue of a response which is "fake"?
> >>
> >> No; the issue is only with DR path. Isn't LID fine ?
> > 
> > Yep it would be.
> > 
> >>
> >>> If we want to remove the logging at this layer I think we should consider
> >>> this.
> >>>
> >>> diff --git a/libvendor/osm_vendor_ibumad.c 
> >>> b/libvendor/osm_vendor_ibumad.c           
> >>> index b2872c8..b352cef 100644
> >>> --- a/libvendor/osm_vendor_ibumad.c
> >>> +++ b/libvendor/osm_vendor_ibumad.c
> >>> @@ -327,29 +327,6 @@ static void *umad_receiver(void *p_ptr)
> >>>                 /* if status != 0 then we are handling recv timeout on 
> >>> send */
> >>>                 if (umad_status(p_madw->vend_wrap.umad)) {
> >>>
> >>> -                       if (mad->mgmt_class != IB_MCLASS_SUBN_DIR) {
> >>> -                               /* LID routed */
> >>> -                               OSM_LOG(p_vend->p_log, OSM_LOG_ERROR, 
> >>> "ERR 5410: "
> >>> -                                       "Send completed with error -- 
> >>> dropping\n"
> >>> -                                       "\t\t\tClass 0x%x, Method 0x%X, 
> >>> Attr 0x%X, "
> >>> -                                       "TID 0x%" PRIx64 ", LID %u\n",
> >>> -                                       mad->mgmt_class, mad->method,
> >>> -                                       cl_ntoh16(mad->attr_id),
> >>> -                                       cl_ntoh64(mad->trans_id),
> >>> -                                       cl_ntoh16(ib_mad_addr->lid));
> >>> -                       } else {
> >>> -                               ib_smp_t *smp;
> >>> -
> >>> -                               /* Direct routed SMP */
> >>> -                               smp = (ib_smp_t *) mad;
> >>> -                               OSM_LOG(p_vend->p_log, OSM_LOG_ERROR, 
> >>> "ERR 5411: "
> >>> -                                       "DR SMP Send completed with error 
> >>> -- dropping\n"
> >>> -                                       "\t\t\tMethod 0x%X, Attr 0x%X, 
> >>> TID 0x%" PRIx64
> >>> -                                       ", Hop Ptr: 0x%X\n",
> >>> -                                       mad->method, 
> >>> cl_ntoh16(mad->attr_id),
> >>> -                                       cl_ntoh64(mad->trans_id), 
> >>> smp->hop_ptr);
> >>> -                       }
> >>> -
> >>>                         if (!(p_req_madw = get_madw(p_vend, 
> >>> &mad->trans_id))) {
> >>>                                 OSM_LOG(p_vend->p_log, OSM_LOG_ERROR,
> >>>                                         "ERR 5412: "
> >>>
> >>>
> >>> But I felt that was a bit draconian, and it was not my initial intent.
> >>
> >> Yes that's overkill. I think it is more like the below:
> >>
> >>                 /* if status != 0 and GMP then we are handling recv
> >> timeout on send */
> >>                 if (umad_status(p_madw->vend_wrap.umad)) {
> >>
> >>                         if ((mad->mgmt_class != IB_MCLASS_SUBN_DIR) &&
> >>                        (mad->mgmt_class != IB_MCLASS_SUBN_LID)) {
> >>                                 /* LID routed */
> >>                                 OSM_LOG(p_vend->p_log, OSM_LOG_ERROR,
> >> "ERR 5410: "
> >>                                         "Send completed with error --
> >> dropping\n"
> >>                                         "\t\t\tClass 0x%x, Method 0x%X,
> >> Attr 0x%X, "
> >>                                         "TID 0x%" PRIx64 ", LID %u\n",
> >>                                         mad->mgmt_class, mad->method,
> >>                                         cl_ntoh16(mad->attr_id),
> >>                                         cl_ntoh64(mad->trans_id),
> >>                                         cl_ntoh16(ib_mad_addr->lid));
> >>                         }
> >>
> >> removing the else clause totally.
> > 
> > New patch which logs this in the SA so we can make the above:
> > 
> >                        OSM_LOG(p_vend->p_log, OSM_LOG_VERBOSE, "ERR 5410: "
> >                                "Recieve Timeout on Send -- dropping "
> >                                "TID 0x%" PRIx64 "\n", 
> > cl_ntoh64(mad->trans_id));
> > 
> > Just for reference of where the call back is coming from if needed,
> > Ira
> > 
> >>
> >> -- Hal
> >>
> >>> Ira
> >>>
> >>>>
> >>>> -- Hal
> >>>>
> >>>>>                         }
> >>>>>  
> >>>>>                         if (!(p_req_madw = get_madw(p_vend, 
> >>>>> &mad->trans_id))) {
> >>>>> diff --git a/opensm/osm_helper.c b/opensm/osm_helper.c
> >>>>> index f9f3d9d..b968679 100644
> >>>>> --- a/opensm/osm_helper.c
> >>>>> +++ b/opensm/osm_helper.c
> >>>>> @@ -2059,8 +2059,9 @@ void osm_dump_smp_dr_path(IN osm_log_t * p_log, 
> >>>>> IN const ib_smp_t * p_smp,
> >>>>>                 char buf[BUF_SIZE];
> >>>>>                 unsigned n;
> >>>>>  
> >>>>> -               n = sprintf(buf, "Received SMP on a %u hop path: "
> >>>>> -                           "Initial path = ", p_smp->hop_count);
> >>>>> +               n = sprintf(buf, "   DR SMP (TID 0x%" PRIx64 ") on a %u 
> >>>>> hop path: "
> >>>>> +                           "Initial path = ",
> >>>>> +                           cl_ntoh64(p_smp->trans_id), 
> >>>>> p_smp->hop_count);
> >>>>>                 n += sprint_uint8_arr(buf + n, sizeof(buf) - n,
> >>>>>                                       p_smp->initial_path,
> >>>>>                                       p_smp->hop_count + 1);
> >>>>> diff --git a/opensm/osm_sm_mad_ctrl.c b/opensm/osm_sm_mad_ctrl.c
> >>>>> index ee92c66..a3b444a 100644
> >>>>> --- a/opensm/osm_sm_mad_ctrl.c
> >>>>> +++ b/opensm/osm_sm_mad_ctrl.c
> >>>>> @@ -704,6 +704,7 @@ Exit:
> >>>>>   */
> >>>>>  static void (IN void *context, IN osm_madw_t * p_madw)
> >>>>>  {
> >>>>> +       char lidstr[8];
> >>>>>         osm_sm_mad_ctrl_t *p_ctrl = context;
> >>>>>         ib_api_status_t status;
> >>>>>         ib_smp_t *p_smp;
> >>>>> @@ -713,13 +714,24 @@ static void sm_mad_ctrl_send_err_cb(IN void 
> >>>>> *context, IN osm_madw_t * p_madw)
> >>>>>         CL_ASSERT(p_madw);
> >>>>>  
> >>>>>         p_smp = osm_madw_get_smp_ptr(p_madw);
> >>>>> +
> >>>>> +       if (p_smp->mgmt_class == IB_MCLASS_SUBN_DIR)
> >>>>> +               lidstr[0] = '\0';
> >>>>> +       else
> >>>>> +               snprintf(lidstr, 8, " DLID %u",
> >>>>> +                       cl_ntoh16(p_madw->mad_addr.dest_lid));
> >>>>> +
> >>>>>         OSM_LOG(p_ctrl->p_log, OSM_LOG_ERROR, "ERR 3113: "
> >>>>>                 "MAD completed in error (%s): "
> >>>>> -               "%s(%s), attr_mod 0x%x, TID 0x%" PRIx64 "\n",
> >>>>> +               "%s(%s), attr_mod 0x%x, TID 0x%" PRIx64 " %s\n",
> >>>>>                 ib_get_err_str(p_madw->status),
> >>>>>                 ib_get_sm_method_str(p_smp->method),
> >>>>>                 ib_get_sm_attr_str(p_smp->attr_id), 
> >>>>> cl_ntoh32(p_smp->attr_mod),
> >>>>> -               cl_ntoh64(p_smp->trans_id));
> >>>>> +               cl_ntoh64(p_smp->trans_id),
> >>>>> +               lidstr);
> >>>>> +
> >>>>> +       if (p_smp->mgmt_class == IB_MCLASS_SUBN_DIR)
> >>>>> +               osm_dump_smp_dr_path(p_ctrl->p_log, p_smp, 
> >>>>> OSM_LOG_ERROR);
> >>>>>  
> >>>>>         /*
> >>>>>            If this was a SubnSet MAD, then this error might indicate a 
> >>>>> problem
> >>>>
> >>>
> >>>
> >>
> > 
> > 
> 


-- 
Ira Weiny
Math Programmer/Computer Scientist
Lawrence Livermore National Lab
925-423-8008
[email protected]
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to