My previous post on changing ipoib parameters programmatically can be found here: http://lists.openfabrics.org/pipermail/ofw/2007-October/001705.html
We might add this script as an example of how to configure a cluster. Thanks Tzachi > -----Original Message----- > From: Smith, Stan [mailto:[EMAIL PROTECTED] > Sent: Thursday, September 04, 2008 10:55 PM > To: Tzachi Dar; Alex Naslednikov; [email protected] > Subject: RE: [ofw] [PATCH 3/3] Mlx4 (checksum): improving the > mechanism of csum offload > > Please see inline comments. > > Thanks. > > Tzachi Dar wrote: > > See bellow. > > > > Thanks > > Tzachi > > > >> -----Original Message----- > >> From: Smith, Stan [mailto:[EMAIL PROTECTED] > >> Sent: Thursday, September 04, 2008 10:16 PM > >> To: Alex Naslednikov; Tzachi Dar; [email protected] > >> Subject: RE: [ofw] [PATCH 3/3] Mlx4 (checksum): improving the > >> mechanism of csum offload > >> > >> Hello Alex, > >> > >> Some questions: > >> > >> 1) Am I correct in understanding that ipoib offload is turned 'on' > >> by default? > >> > > The mode that is turned on by default, is "do checksum in HW if HW > > supports it, or by SW if HW doesn't support that". Probably > the best > > mode for everyone. > > > >> 2) For debug/testing purposes, how does one turn ipoib > offload 'off' > >> or 'on'? > > > > I have once sent such a script, I'll try to find it or send > a new one. > > > > Should this script be part of IPoIB offload documentation? > > > > >> > >> 3) In the patch for ipoib_port.c, when making the call > >> query_ca(p_port->ib_mgr.h_ca, NULL, &attr_size ); > >> I do not see attr_size initialized, hence it could be a very > >> large integer (stack garbage). The garbage value could pass the > >> 1st call to get the attr size, then the following > >> cl_malloc(attr_size) call could fail > >> due to the very large integer. > >> Do you agree? > > > > In the function mlnx_query_ca there is the following code: > > // resource sufficience check > > if (NULL == p_ca_attr || *p_byte_count < required_size) { > > *p_byte_count = required_size; > > status = IB_INSUFFICIENT_MEMORY; > > if ( p_ca_attr != NULL) { > > HCA_PRINT (TRACE_LEVEL_ERROR,HCA_DBG_SHIM, > > ("Failed *p_byte_count (%d) < > > required_size (%d)\n", *p_byte_count, required_size )); > > } > > goto err_insuff_mem; > > } > > Since p_ca_attr== NULL there shouldn't be a problem, the value is > > ignored. > > OK, situation covered. > > Thanks. > > > > > > >> > >> Please notify the list when the patches are applied and > committed to > >> svn. > >> > >> Thanks, > >> > >> Stan. > >> > >> > >> Alex Naslednikov wrote: > >>> This is cumulative patch that fixes all open issues shown the last > >>> week: SGE wrong calculation LSO messages in log files > Fixes in ipoib > >>> csum offload mechanism Adding 3 states for CSUM offload : > disabled, > >>> enabled if supported by HW and bypass (see inline) > >>> > >>> Signed-off by: xalex (Alexander Naslednikov) > >>> Index: > >> D:/Windows/MLNX_WINOF_CSUM_ARBEL/ulp/ipoib/kernel/ipoib_port.c > >>> > =================================================================== > >>> --- D:/Windows/MLNX_WINOF_CSUM_ARBEL/ulp/ipoib/kernel/ipoib_port.c > >>> (revision 3106) +++ > >>> D:/Windows/MLNX_WINOF_CSUM_ARBEL/ulp/ipoib/kernel/ipoib_port.c > >>> (revision 3107) @@ -825,6 +825,8 @@ uint64_t > >>> vaddr; net32_t rkey; > >>> ib_qp_attr_t qp_attr; > >>> + ib_ca_attr_t * p_ca_attr; > >>> + uint32_t attr_size; > >>> > >>> IPOIB_ENTER( IPOIB_DBG_INIT ); > >>> > >>> @@ -889,7 +891,7 @@ > >>> > >> p_port->p_adapter->p_ifc->get_err_str( status > >>> )) ); > >>> return status; > >>> } > >>> - > >>> + > >>> /* Allocate the QP. */ > >>> cl_memclr( &qp_create, sizeof(qp_create) ); > >>> qp_create.qp_type = IB_QPT_UNRELIABLE_DGRM; @@ -897,8 > >>> +899,47 @@ qp_create.rq_sge = 2; /* To support buffers > >>> spanning pages. */ qp_create.h_rq_cq = > >>> p_port->ib_mgr.h_recv_cq; qp_create.sq_depth = > >>> p_port->p_adapter->params.sq_depth; > >>> - //TODO: Figure out the right number of SGE entries for > >>> sends. > >>> - qp_create.sq_sge = MAX_SEND_SGE; > >>> + > >>> + //Figure out the right number of SGE entries for sends. > >>> + /* Get the size of the CA attribute structure. */ > >>> + status = p_port->p_adapter->p_ifc->query_ca( > >>> p_port->ib_mgr.h_ca, NULL, &attr_size ); > >>> + if( status != IB_INSUFFICIENT_MEMORY ) > >>> + { > >>> + IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, > >>> IPOIB_DBG_ERROR, + ("ib_query_ca failed with > >>> status %s.\n", p_port->p_adapter->p_ifc->get_err_str(status)) ); > >>> + return status; > >>> + } > >>> + > >>> + /* Allocate enough space to store the attribute structure. > >>> */ + p_ca_attr = cl_malloc( attr_size ); > >>> + if( !p_ca_attr ) > >>> + { > >>> + IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, > >>> IPOIB_DBG_ERROR, + ("cl_malloc failed to > >>> allocate p_ca_attr!\n") ); + return > >>> IB_INSUFFICIENT_RESOURCES; + } + > >>> + /* Query the CA attributes. */ > >>> + status = > >>> p_port->p_adapter->p_ifc->query_ca(p_port->ib_mgr.h_ca, p_ca_attr, > >>> &attr_size ); + if( status != IB_SUCCESS ) > >>> + { > >>> + cl_free( p_ca_attr ); > >>> + > >>> + IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, > >>> IPOIB_DBG_ERROR, + ("ib_query_ca failed with > >>> status %s.\n", p_port->p_adapter->p_ifc->get_err_str(status)) ); > >>> + return status; > >>> + } > >>> +#define UD_QP_USED_SGE 3 > >>> + qp_create.sq_sge = MAX_SEND_SGE < p_ca_attr->max_sges ? > >>> MAX_SEND_SGE : (p_ca_attr->max_sges - UD_QP_USED_SGE); > >>> + if (!p_ca_attr->ipoib_csum) { > >>> + //checksum is not supported by device > >>> + //user must specify BYPASS to explicitly cancel > >>> checksum calculation + if > >>> (p_port->p_adapter->params.send_chksum_offload == CSUM_ENABLED) + > >> p_port->p_adapter->params.send_chksum_offload > >>> = CSUM_DISABLED; > >>> + if (p_port->p_adapter->params.recv_chksum_offload > >>> == CSUM_ENABLED) + > >> p_port->p_adapter->params.recv_chksum_offload > >>> = CSUM_DISABLED; > >>> + } > >>> + > >>> qp_create.h_sq_cq = p_port->ib_mgr.h_send_cq; > >>> qp_create.sq_signaled = TRUE; > >>> status = p_port->p_adapter->p_ifc->create_qp( @@ -1937,7 > >>> +1978,7 @@ > >>> > >>> } > >>> /* Successful completion. Get the receive > >>> information. */ - p_desc->ndis_csum.Value = (ULONG) > >>> p_wc->csum_ok; + p_desc->ndis_csum.Value = (ULONG) > >>> p_wc->recv.ud.csum_ok; cl_perf_start( > GetRecvEndpts > >>> ); __recv_get_endpts( p_port, p_desc, > p_wc, &p_src, > >>> &p_dst ); cl_perf_stop( &p_port->p_adapter->perf, > >>> GetRecvEndpts ); @@ -2580,10 +2621,22 @@ > >>> ("__buf_mgr_get_ndis_pkt failed\n") ); return > >>> IB_INSUFFICIENT_RESOURCES; } > >>> - > >>> + if (p_port->p_adapter->params.recv_chksum_offload != > >>> CSUM_BYPASS) { /* Get the checksums directly from packet > >>> information. */ > >>> - NDIS_PER_PACKET_INFO_FROM_PACKET( *pp_packet, > >>> TcpIpChecksumPacketInfo ) = > >>> - (PVOID) (uintn_t) (p_desc->ndis_csum.Value); > >>> + /* In this case, no one of cheksum's cat get > false value */ > >>> + /* If hardware checksum failed or wasn't calculated, NDIS > >>> will recalculate it again */ + > >>> NDIS_PER_PACKET_INFO_FROM_PACKET( *pp_packet, > >>> TcpIpChecksumPacketInfo ) = + (PVOID) > >>> (uintn_t) (p_desc->ndis_csum.Value); + } else { + > >>> NDIS_TCP_IP_CHECKSUM_PACKET_INFO chksum; + /* > >>> Flag the checksums as having been calculated. */ + > chksum.Value = 0; > >>> + > >> chksum.Receive.NdisPacketTcpChecksumSucceeded = TRUE; > >>> + > >> chksum.Receive.NdisPacketUdpChecksumSucceeded = TRUE; > >>> + > chksum.Receive.NdisPacketIpChecksumSucceeded = TRUE; > >>> + NDIS_PER_PACKET_INFO_FROM_PACKET( *pp_packet, > >>> TcpIpChecksumPacketInfo ) = + > >>> (void*)(uintn_t)chksum.Value; + } > >>> ipoib_inc_recv_stat( p_port->p_adapter, type, > p_desc->len ); > >>> > >>> IPOIB_EXIT( IPOIB_DBG_RECV ); @@ -3833,9 +3886,9 @@ > >>> p_desc->wr.dgrm.ud.hlen = lso_header_size; > >>> // Tell NDIS how much we will send. > >>> PktExt->NdisPacketInfo[TcpLargeSendPacketInfo] = > >>> UlongToPtr(PacketLength); - p_desc->wr.send_opt |= > >>> (IB_SEND_OPT_TX_IP_CSUM | IB_SEND_OPT_TX_TCP_UDP_CSUM); + > >>> p_desc->wr.send_opt |= (IB_SEND_OPT_TX_IP_CSUM | > >>> IB_SEND_OPT_TX_TCP_UDP_CSUM) | > IB_SEND_OPT_SIGNALED; > >>> __send_gen(p_port, p_desc, IndexOfData); - p_desc->wr.wr_type = ( > >>> WR_LSO | IB_SEND_OPT_SIGNALED); + > >>> p_desc->wr.wr_type = WR_LSO; } else { > >>> > >>> /* Setup the first local data segment > (used for the > >>> IPoIB header). */ Index: > >>> D:/Windows/MLNX_WINOF_CSUM_ARBEL/ulp/ipoib/kernel/ipoib_adapter.h > >>> > =================================================================== > >>> --- > >> D:/Windows/MLNX_WINOF_CSUM_ARBEL/ulp/ipoib/kernel/ipoib_adapter.h > >>> (revision 3106) +++ > >>> D:/Windows/MLNX_WINOF_CSUM_ARBEL/ulp/ipoib/kernel/ipoib_adapter.h > >>> (revision 3107) @@ -60,15 +60,15 @@ > >>> /* > >>> * Macros > >>> */ > >>> +typedef enum {CSUM_DISABLED = 0, CSUM_ENABLED, CSUM_BYPASS} > >>> tCsumTypeEn; > >>> > >>> - > >>> typedef struct _ipoib_params > >>> { > >>> int32_t rq_depth; > >>> int32_t rq_low_watermark; > >>> int32_t sq_depth; > >>> - boolean_t send_chksum_offload; > >>> - boolean_t recv_chksum_offload; > >>> + int32_t send_chksum_offload; //is > actually of type > >>> tCsumTypeEn + int32_t recv_chksum_offload; //is > >>> actually of type tCsumTypeEn uint32_t sa_timeout; > >>> uint32_t sa_retry_cnt; > >>> uint32_t recv_pool_ratio; > >>> @@ -91,12 +91,11 @@ > >>> * Number of send WQEs to allocate. > >>> * > >>> * send_chksum_offload > >>> -* Flag to indicate whether to offload send > checksums. > >>> This will make it > >>> -* so that IPoIB packets should never be > forwarded out > >>> of the IB subnet > >>> -* without recalculating the checksum. -* > >>> * recv_chksum_offload > >>> -* Flag to indicate whether to offload recv > checksums. > >>> +* Flags to indicate whether to offload send/recv > >>> checksums. +* 0 - No hardware cheksum > >>> +* 1 - Try to offload if the device support it > >>> +* 2 - Always report success (checksum bypass) * > >>> * wsdp_enabled > >>> * Flag to indicate whether WSDP is enabled for an > >>> adapter adapter. Index: > >>> D:/Windows/MLNX_WINOF_CSUM_ARBEL/ulp/ipoib/kernel/ipoib_driver.c > >>> > =================================================================== > >>> --- > >>> D:/Windows/MLNX_WINOF_CSUM_ARBEL/ulp/ipoib/kernel/ipoib_driver.c > >>> (revision 3106) +++ > >>> > >>> D:/Windows/MLNX_WINOF_CSUM_ARBEL/ulp/ipoib/kernel/ipoib_driver.c > >>> (revision 3107) @@ -150,8 +150,8 @@ > {NDIS_STRING_CONST("RqDepth"), > >>> 1, > >>> IPOIB_OFFSET(rq_depth), IPOIB_SIZE(rq_depth), 512, > >>> 128, 1024}, {NDIS_STRING_CONST("RqLowWatermark"), 0, > >>> IPOIB_OFFSET(rq_low_watermark), IPOIB_SIZE(rq_low_watermark), > >>> 4, 2, 8}, {NDIS_STRING_CONST("SqDepth"), 1, > >>> IPOIB_OFFSET(sq_depth), IPOIB_SIZE(sq_depth), 512, > >>> 128, 1024}, - {NDIS_STRING_CONST("SendChksum"), 1, > >>> IPOIB_OFFSET(send_chksum_offload), > >>> IPOIB_SIZE(send_chksum_offload),0, 0, 1}, > >>> - {NDIS_STRING_CONST("RecvChksum"), 1, > >>> IPOIB_OFFSET(recv_chksum_offload), > >>> IPOIB_SIZE(recv_chksum_offload),0, 0, 1}, > >>> + {NDIS_STRING_CONST("SendChksum"), 1, > >>> IPOIB_OFFSET(send_chksum_offload), > >>> > >> > IPOIB_SIZE(send_chksum_offload),CSUM_ENABLED,CSUM_DISABLED,CSUM_BYPAS > >> S > >>> }, > >>> + {NDIS_STRING_CONST("RecvChksum"), 1, > >>> IPOIB_OFFSET(recv_chksum_offload), > >>> > >> IPOIB_SIZE(recv_chksum_offload),CSUM_ENABLED,CSUM_DISABLED,CSU > >> M_BYPASS}, > >>> {NDIS_STRING_CONST("SaTimeout"), 1, > >>> IPOIB_OFFSET(sa_timeout), IPOIB_SIZE(sa_timeout), > >>> 1000, 250, UINT_MAX}, > >>> {NDIS_STRING_CONST("SaRetries"), 1, > >>> IPOIB_OFFSET(sa_retry_cnt), IPOIB_SIZE(sa_retry_cnt), 10, > >>> 1, UINT_MAX}, {NDIS_STRING_CONST("RecvRatio"), > >>> 1, IPOIB_OFFSET(recv_pool_ratio), > >>> IPOIB_SIZE(recv_pool_ratio), 1, 1, 10}, @@ -1450,28 > +1450,20 @@ > >>> p_offload_task->TaskBufferLength = > >>> sizeof(NDIS_TASK_TCP_IP_CHECKSUM); > >>> p_offload_chksum = > >>> > >>> (NDIS_TASK_TCP_IP_CHECKSUM*)p_offload_task->TaskBuffer; - + > >>> p_offload_chksum->V4Transmit.IpOptionsSupported = > >>> - p_adapter->params.send_chksum_offload; > >>> p_offload_chksum->V4Transmit.TcpOptionsSupported = > >>> - p_adapter->params.send_chksum_offload; > >>> p_offload_chksum->V4Transmit.TcpChecksum = > >>> - p_adapter->params.send_chksum_offload; > >>> p_offload_chksum->V4Transmit.UdpChecksum = > >>> - p_adapter->params.send_chksum_offload; > >>> p_offload_chksum->V4Transmit.IpChecksum = > >>> - p_adapter->params.send_chksum_offload; > >>> + !!(p_adapter->params.send_chksum_offload); > >>> > >>> p_offload_chksum->V4Receive.IpOptionsSupported = > >>> - p_adapter->params.recv_chksum_offload; > >>> p_offload_chksum->V4Receive.TcpOptionsSupported = > >>> - p_adapter->params.recv_chksum_offload; > >>> p_offload_chksum->V4Receive.TcpChecksum = > >>> - p_adapter->params.recv_chksum_offload; > >>> p_offload_chksum->V4Receive.UdpChecksum = > >>> - p_adapter->params.recv_chksum_offload; > >>> p_offload_chksum->V4Receive.IpChecksum = > >>> - p_adapter->params.recv_chksum_offload; > >>> + !!(p_adapter->params.recv_chksum_offload); > >>> > >>> p_offload_chksum->V6Transmit.IpOptionsSupported = FALSE; > >>> p_offload_chksum->V6Transmit.TcpOptionsSupported = FALSE; > >>> Index: > >> D:/Windows/MLNX_WINOF_CSUM_ARBEL/ulp/ipoib/kernel/netipoib.inx > >>> > =================================================================== > >>> --- D:/Windows/MLNX_WINOF_CSUM_ARBEL/ulp/ipoib/kernel/netipoib.inx > >>> (revision 3106) +++ > >>> D:/Windows/MLNX_WINOF_CSUM_ARBEL/ulp/ipoib/kernel/netipoib.inx > >>> (revision 3107) @@ -95,18 +95,28 @@ > >>> > >>> HKR, Ndi\Params\SendChksum, ParamDesc, 0, "Send > >>> Checksum Offload" HKR, Ndi\Params\SendChksum, Type, > >>> 0, "enum" -HKR, Ndi\Params\SendChksum, Default, > >>> 0, "0" +HKR, Ndi\Params\SendChksum, Default, 0, > >>> "1" HKR, Ndi\Params\SendChksum, Optional, 0, "0" > >>> HKR, Ndi\Params\SendChksum\enum, "0", 0, "Disabled" > >>> -HKR, Ndi\Params\SendChksum\enum, "1", 0, "Enabled" > >>> +HKR, Ndi\Params\SendChksum\enum, "1", 0, "Enabled (if > >>> supported by HW)" +HKR, Ndi\Params\SendChksum\enum, "2", > >>> 0, "Bypass" > >>> > >>> HKR, Ndi\Params\RecvChksum, ParamDesc, 0, "Recv > >>> Checksum Offload" HKR, Ndi\Params\RecvChksum, Type, > >>> 0, "enum" -HKR, Ndi\Params\RecvChksum, Default, > >>> 0, "0" +HKR, Ndi\Params\RecvChksum, Default, 0, > >>> "1" HKR, Ndi\Params\RecvChksum, Optional, 0, "0" > >>> HKR, Ndi\Params\RecvChksum\enum, "0", 0, "Disabled" > >>> -HKR, Ndi\Params\RecvChksum\enum, "1", 0, "Enabled" > >>> +HKR, Ndi\Params\RecvChksum\enum, "1", 0, "Enabled (if > >>> supported by HW)" +HKR, Ndi\Params\RecvChksum\enum, "2", > >>> 0, "Bypass" > >>> > >>> +HKR, Ndi\Params\lso, ParamDesc, 0, "Large Send > >>> Offload" +HKR, Ndi\Params\lso, Type, 0, "enum" > >>> +HKR, Ndi\Params\lso, Default, 0, "0" > >>> +HKR, Ndi\Params\lso, Optional, 0, "0" > >>> +HKR, Ndi\Params\lso\enum, "0", 0, "Disabled" > >>> +HKR, Ndi\Params\lso\enum, "1", 0, "Enabled" + > >>> + > >>> HKR, Ndi\Params\SaTimeout, ParamDesc, 0, "SA > >>> Query Timeout (ms)" HKR, Ndi\Params\SaTimeout, Type, > >>> 0, "dword" HKR, Ndi\Params\SaTimeout, Default, > >>> 0, "1000" Index: > >>> D:/Windows/MLNX_WINOF_CSUM_ARBEL/hw/mlx4/kernel/bus/ib/cq.c > >>> > =================================================================== > >>> --- D:/Windows/MLNX_WINOF_CSUM_ARBEL/hw/mlx4/kernel/bus/ib/cq.c > >>> (revision 3106) +++ > >>> > >>> D:/Windows/MLNX_WINOF_CSUM_ARBEL/hw/mlx4/kernel/bus/ib/cq.c > >>> (revision 3107) @@ -498,7 +498,7 @@ > >>> wc->recv.ud.path_bits = (u8)(cqe->g_mlpath_rqpn & 0x7f); > >>> wc->recv.ud.recv_opt |= > >>> cqe->g_mlpath_rqpn & 0x080 ? IB_RECV_OPT_GRH_VALID : 0; > >>> wc->recv.ud.pkey_index = > (u16)(be32_to_cpu(cqe->immed_rss_invalid) > >>> & 0x7f); - wc->csum_ok = > >>> mlx4_ib_ipoib_csum_ok(cqe->ipoib_status,cqe->checksum); + > >>> wc->recv.ud.csum_ok = > >>> mlx4_ib_ipoib_csum_ok(cqe->ipoib_status,cqe->checksum); } > >>> if (!is_send && cqe->rlid == 0){ > >>> > >> MLX4_PRINT(TRACE_LEVEL_INFORMATION,MLX4_DBG_CQ,("found > >>> rlid == 0 \n ")); > >>> _______________________________________________ > >>> ofw mailing list > >>> [email protected] > >>> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw > > _______________________________________________ ofw mailing list [email protected] http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw
