Ack with comments. Comments are marked [AndersW] below: regards, Anders Widell
On 09/14/2016 06:16 AM, mahesh.va...@oracle.com wrote: > osaf/services/infrastructure/dtms/config/dtmd.conf | 12 ++++++++++ > osaf/services/infrastructure/dtms/dtm/dtm_node_sockets.c | 11 +++++++++ > osaf/services/infrastructure/dtms/dtm/dtm_read_config.c | 19 > +++++++++++++++- > osaf/services/infrastructure/dtms/include/dtm.h | 4 +++ > osaf/services/infrastructure/dtms/include/dtm_cb.h | 1 + > 5 files changed, 46 insertions(+), 1 deletions(-) > > > Since Linux kernel above and equal 3.18, TCP sockets have an option called > TCP_USER_TIMEOUT. [AndersW] The TCP_USER_TIMEOUT option was implemented (although still buggy) already in Linux 2.6.37. So change 3.18 to 2.6.37 in the text above. > > TCP_USER_TIMEOUT is a TCP level socket option that takes an unsigned int, > when grater than 0, to specify the maximum amount of time in ms that > transmitted > data may remain unacknowledged before TCP will forcefully close the > corresponding connection and return ETIMEDOUT to the application. If > 0 is given, TCP will continue to use the system default. > > Increasing the TCP_USER_TIMEOUT allows a TCP connection to survive extended > periods without end-to-end connectivity. > Decreasing the user timeouts allows applications to fail fast if so desired. > Otherwise it may take upto 20 minutes with the current system > defaults in a normal WAN environment. > > This option, TCP_USER_TIMEOUT will override keep-alive > to determine when to close a connection due to keep-alive failure. > > Defaulted to 1.5 sec to match other transport protocols (TIPC) supported > Opensaf. > > Try to tune & test the DTM_TCP_USER_TIMEOUT=1500 to higher and lower value in > dtmd.conf > > diff --git a/osaf/services/infrastructure/dtms/config/dtmd.conf > b/osaf/services/infrastructure/dtms/config/dtmd.conf > --- a/osaf/services/infrastructure/dtms/config/dtmd.conf > +++ b/osaf/services/infrastructure/dtms/config/dtmd.conf > @@ -73,6 +73,18 @@ DTM_TCP_KEEPALIVE_INTVL=1 > DTM_TCP_KEEPALIVE_PROBES=2 > > # > +# Since Linux kernel >= 3.18, TCP sockets have an option called > TCP_USER_TIMEOUT. [AndersW] Change 3.18 to 2.6.37. > +# Increasing the TCP_USER_TIMEOUT allows a TCP connection to survive extended > +# periods without end-to-end connectivity. > +# Decreasing the user timeouts allows applications to "fail fast" if so > desired. > +# Otherwise it may take upto 20 minutes with the current system > +# defaults in a normal WAN environment. > +# This option, TCP_USER_TIMEOUT will override keepalive > +# to determine when to close a connection due to keepalive failure. > +# kept to 1.5 sec to match other transport protocols supported Opensaf > +DTM_TCP_USER_TIMEOUT=1500 > + > +# > #Used to Set the dtm internode & intranode socket SO_SNDBUF/SO_RCVBUF > buffer in bytes. > #The kernel doubles this value (to allow space for bookkeeping over-head) > #when it is set using setsockopt(),and this doubled value is returned by > diff --git a/osaf/services/infrastructure/dtms/dtm/dtm_node_sockets.c > b/osaf/services/infrastructure/dtms/dtm/dtm_node_sockets.c > --- a/osaf/services/infrastructure/dtms/dtm/dtm_node_sockets.c > +++ b/osaf/services/infrastructure/dtms/dtm/dtm_node_sockets.c > @@ -74,6 +74,7 @@ static uint32_t set_keepalive(DTM_INTERN > > TRACE_ENTER(); > int comm_keepidle_time, comm_keepalive_intvl, comm_keepalive_probes; > + int comm_user_timeout; [AndersW] The TCP_USER_TIMEOUT option takes an unsigned int parameter, so the type of the variable comm_user_timeout ought to be "unsigned". Link to "tcp" man page: http://man7.org/linux/man-pages/man7/tcp.7.html > int so_keepalive; > int smode = 1; > > @@ -82,6 +83,7 @@ static uint32_t set_keepalive(DTM_INTERN > comm_keepidle_time = dtms_cb->comm_keepidle_time; > comm_keepalive_intvl = dtms_cb->comm_keepalive_intvl; > comm_keepalive_probes = dtms_cb->comm_keepalive_probes; > + comm_user_timeout = dtms_cb->comm_user_timeout; > > if (so_keepalive == true) { > socklen_t optlen; > @@ -117,6 +119,15 @@ static uint32_t set_keepalive(DTM_INTERN > TRACE_LEAVE2("rc :%d", NCSCC_RC_FAILURE); > return NCSCC_RC_FAILURE; > } > + > + /* Set TCP_USER_TIMEOUT timeout in milliseconds [ms] */ > + optlen = sizeof(comm_user_timeout); > + if (setsockopt(sock_desc, SOL_TCP, TCP_USER_TIMEOUT, > &comm_user_timeout, optlen) < 0) { [AndersW] According to the man pages, the "level" parameter for setsockopt() ought to be set to "IPPROTO_TCP" when setting TCP level options: Link to "tcp" man page: http://man7.org/linux/man-pages/man7/tcp.7.html Link to setsockopt man page: http://pubs.opengroup.org/onlinepubs/009695399/functions/setsockopt.html > + LOG_ER("DTM :setsockopt TCP_USER_TIMEOUT failed err > :%s ", strerror(errno)); > + TRACE_LEAVE2("rc :%d", NCSCC_RC_FAILURE); > + return NCSCC_RC_FAILURE; > + } > + > } > > if ((setsockopt(sock_desc, SOL_SOCKET, SO_REUSEADDR, (void *)&smode, > sizeof(smode)) == -1)) { > diff --git a/osaf/services/infrastructure/dtms/dtm/dtm_read_config.c > b/osaf/services/infrastructure/dtms/dtm/dtm_read_config.c > --- a/osaf/services/infrastructure/dtms/dtm/dtm_read_config.c > +++ b/osaf/services/infrastructure/dtms/dtm/dtm_read_config.c > @@ -39,6 +39,7 @@ extern uint32_t intranode_max_processes; > #define KEEPALIVE_PROBES 9 > #define DIS_TIME_OUT 5 > #define BCAST_FRE 250 > +#define USER_TIMEOUT 1500 // 1.5 sec to match other transport > > const char *IN6ADDR_LINK_LOCAL = "ff02::1"; /* IPv6, Scope:Link > multicast address */ > const char *IN6ADDR_LINK_GLOBAL = "ff0e::1"; /* IPv6, Scope:Global > multicast address */ > @@ -100,7 +101,9 @@ void dtm_print_config(DTM_INTERNODE_CB * > TRACE(" %d", config->comm_keepalive_intvl); > TRACE(" COMM_KEEPALIVE_PROBES: "); > TRACE(" %d", config->comm_keepalive_probes); > - TRACE(" DTM_INI_DIS_TIMEOUT_SECS: "); > + TRACE(" COMM_TCP_USER_TIMEOUT: "); > + TRACE(" %d", config->comm_user_timeout); > + TRACE(" DTM_INI_DIS_TIMEOUT_SECS: "); > TRACE(" %d", config->initial_dis_timeout); > TRACE(" DTM_BCAST_FRE_MSECS: "); > TRACE(" %" PRId64 "", config->bcast_msg_freq); > @@ -254,6 +257,7 @@ int dtm_read_config(DTM_INTERNODE_CB * c > config->comm_keepidle_time = KEEPIDLE_TIME; > config->comm_keepalive_intvl = KEEPALIVE_INTVL; > config->comm_keepalive_probes = KEEPALIVE_PROBES; > + config->comm_keepidle_time = USER_TIMEOUT; > config->i_addr_family = DTM_IP_ADDR_TYPE_IPV4; > config->bcast_msg_freq = BCAST_FRE; > config->initial_dis_timeout = DIS_TIME_OUT; > @@ -475,6 +479,19 @@ int dtm_read_config(DTM_INTERNODE_CB * c > tag_len = 0; > > } > + if (strncmp(line, "DTM_TCP_USER_TIMEOUT=", > strlen("DTM_TCP_USER_TIMEOUT=")) == 0) { > + tag_len = strlen("DTM_TCP_USER_TIMEOUT="); > + config->comm_user_timeout = > atoi(&line[tag_len]); > + if (config->comm_user_timeout < 1) { [AndersW] Shouldn't we allow the value zero here as well? I.e. it would be the same as not setting this configuration option? > + LOG_ER("DTM:comm_user_timeout must be a > positive integer"); > + fclose(dtm_conf_file); > + return -1; > + } > + > + tag = 0; > + tag_len = 0; > + > + } > if (strncmp(line, "DTM_SOCK_SND_RCV_BUF_SIZE=", > strlen("DTM_SOCK_SND_RCV_BUF_SIZE=")) == 0) { > tag_len = strlen("DTM_SOCK_SND_RCV_BUF_SIZE="); > uint32_t sndbuf_size = 0; /* Send buffer size */ > diff --git a/osaf/services/infrastructure/dtms/include/dtm.h > b/osaf/services/infrastructure/dtms/include/dtm.h > --- a/osaf/services/infrastructure/dtms/include/dtm.h > +++ b/osaf/services/infrastructure/dtms/include/dtm.h > @@ -32,6 +32,10 @@ > extern DTM_INTERNODE_CB *dtms_gl_cb; > extern uint8_t initial_discovery_phase; > > +#ifndef TCP_USER_TIMEOUT > +#define TCP_USER_TIMEOUT 18 > +#endif [AndersW] Move the three lines above from the header file to the file dtm_node_sockets.c > + > #define BMCAST_MSG_LEN ( sizeof(uint16_t) + sizeof(uint32_t) + > sizeof(uint32_t) + sizeof(uint16_t) > > #define m_NODE_DISCOVERY_TASKNAME "NODE_DISCOVERY" > diff --git a/osaf/services/infrastructure/dtms/include/dtm_cb.h > b/osaf/services/infrastructure/dtms/include/dtm_cb.h > --- a/osaf/services/infrastructure/dtms/include/dtm_cb.h > +++ b/osaf/services/infrastructure/dtms/include/dtm_cb.h > @@ -89,6 +89,7 @@ typedef struct dtm_internode_cb { > int comm_keepidle_time; > int comm_keepalive_intvl; > int comm_keepalive_probes; > + int comm_user_timeout; // tcp socket user timeout in milliseconds [ms] [AndersW] This line above adds a new cpplint warning, which should be fixed: osaf/services/infrastructure/dtms/include/dtm_cb.h:92: At least two spaces is best between code and comments [whitespace/comments] [2] > int32_t sock_sndbuf_size; /* The value of SO_SNDBUF */ > int32_t sock_rcvbuf_size; /* The value of SO_RCVBUF */ > SYSF_MBX mbx; ------------------------------------------------------------------------------ _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel