Ack with comments. Comments are marked [AndersW] below:

regards,
Anders Widell

On 09/14/2016 06:16 AM, mahesh.va...@oracle.com wrote:
>   osaf/services/infrastructure/dtms/config/dtmd.conf       |  12 ++++++++++
>   osaf/services/infrastructure/dtms/dtm/dtm_node_sockets.c |  11 +++++++++
>   osaf/services/infrastructure/dtms/dtm/dtm_read_config.c  |  19 
> +++++++++++++++-
>   osaf/services/infrastructure/dtms/include/dtm.h          |   4 +++
>   osaf/services/infrastructure/dtms/include/dtm_cb.h       |   1 +
>   5 files changed, 46 insertions(+), 1 deletions(-)
>
>
> Since Linux kernel above and equal 3.18, TCP sockets have an option called 
> TCP_USER_TIMEOUT.
[AndersW] The TCP_USER_TIMEOUT option was implemented (although still 
buggy) already in Linux 2.6.37. So change 3.18 to 2.6.37 in the text above.
>
> TCP_USER_TIMEOUT is a TCP level socket option that takes an unsigned int,
> when grater than 0, to specify the maximum amount of time in ms that 
> transmitted
> data may remain unacknowledged before TCP will forcefully close the
> corresponding connection and return ETIMEDOUT to the application. If
> 0 is given, TCP will continue to use the system default.
>
> Increasing the TCP_USER_TIMEOUT allows a TCP connection to survive extended
> periods without end-to-end connectivity.
> Decreasing the user timeouts allows applications to fail fast if so desired.
> Otherwise it may take upto 20 minutes with the current system
> defaults in a normal WAN environment.
>
> This option, TCP_USER_TIMEOUT will override keep-alive
> to determine when to close a connection due to keep-alive failure.
>
> Defaulted  to 1.5 sec to match other transport protocols (TIPC) supported 
> Opensaf.
>
> Try to tune & test the DTM_TCP_USER_TIMEOUT=1500 to higher and lower value in 
> dtmd.conf
>
> diff --git a/osaf/services/infrastructure/dtms/config/dtmd.conf 
> b/osaf/services/infrastructure/dtms/config/dtmd.conf
> --- a/osaf/services/infrastructure/dtms/config/dtmd.conf
> +++ b/osaf/services/infrastructure/dtms/config/dtmd.conf
> @@ -73,6 +73,18 @@ DTM_TCP_KEEPALIVE_INTVL=1
>   DTM_TCP_KEEPALIVE_PROBES=2
>   
>   #
> +# Since Linux kernel >= 3.18, TCP sockets have an option called 
> TCP_USER_TIMEOUT.
[AndersW] Change 3.18 to 2.6.37.
> +# Increasing the TCP_USER_TIMEOUT allows a TCP connection to survive extended
> +# periods without end-to-end connectivity.
> +# Decreasing the user timeouts allows applications to "fail fast" if so 
> desired.
> +# Otherwise it may take upto 20 minutes with the current system
> +#  defaults in a normal WAN environment.
> +# This option, TCP_USER_TIMEOUT will override keepalive
> +# to determine when to close a connection due to keepalive failure.
> +# kept to 1.5 sec to match other transport protocols supported Opensaf
> +DTM_TCP_USER_TIMEOUT=1500
> +
> +#
>   #Used to Set the dtm internode & intranode  socket SO_SNDBUF/SO_RCVBUF 
> buffer in bytes.
>   #The kernel doubles this value (to allow space for bookkeeping over-head)
>   #when it is set using setsockopt(),and this doubled value is returned  by
> diff --git a/osaf/services/infrastructure/dtms/dtm/dtm_node_sockets.c 
> b/osaf/services/infrastructure/dtms/dtm/dtm_node_sockets.c
> --- a/osaf/services/infrastructure/dtms/dtm/dtm_node_sockets.c
> +++ b/osaf/services/infrastructure/dtms/dtm/dtm_node_sockets.c
> @@ -74,6 +74,7 @@ static uint32_t set_keepalive(DTM_INTERN
>   
>       TRACE_ENTER();
>       int comm_keepidle_time, comm_keepalive_intvl, comm_keepalive_probes;
> +     int comm_user_timeout;
[AndersW] The TCP_USER_TIMEOUT option takes an unsigned int parameter, 
so the type of the variable comm_user_timeout ought to be "unsigned".

Link to "tcp" man page: http://man7.org/linux/man-pages/man7/tcp.7.html
>       int so_keepalive;
>       int smode = 1;
>   
> @@ -82,6 +83,7 @@ static uint32_t set_keepalive(DTM_INTERN
>       comm_keepidle_time = dtms_cb->comm_keepidle_time;
>       comm_keepalive_intvl = dtms_cb->comm_keepalive_intvl;
>       comm_keepalive_probes = dtms_cb->comm_keepalive_probes;
> +     comm_user_timeout = dtms_cb->comm_user_timeout;
>   
>       if (so_keepalive == true) {
>               socklen_t optlen;
> @@ -117,6 +119,15 @@ static uint32_t set_keepalive(DTM_INTERN
>                       TRACE_LEAVE2("rc :%d", NCSCC_RC_FAILURE);
>                       return NCSCC_RC_FAILURE;
>               }
> +
> +             /* Set TCP_USER_TIMEOUT  timeout in milliseconds [ms]  */
> +             optlen = sizeof(comm_user_timeout);
> +                if (setsockopt(sock_desc, SOL_TCP, TCP_USER_TIMEOUT, 
> &comm_user_timeout, optlen) < 0) {
[AndersW] According to the man pages, the "level" parameter for 
setsockopt() ought to be set to "IPPROTO_TCP" when setting TCP level 
options:

Link to "tcp" man page: http://man7.org/linux/man-pages/man7/tcp.7.html

Link to setsockopt man page: 
http://pubs.opengroup.org/onlinepubs/009695399/functions/setsockopt.html

> +                        LOG_ER("DTM :setsockopt TCP_USER_TIMEOUT failed err 
> :%s ", strerror(errno));
> +                        TRACE_LEAVE2("rc :%d", NCSCC_RC_FAILURE);
> +                        return NCSCC_RC_FAILURE;
> +                }
> +
>       }
>   
>       if ((setsockopt(sock_desc, SOL_SOCKET, SO_REUSEADDR, (void *)&smode, 
> sizeof(smode)) == -1)) {
> diff --git a/osaf/services/infrastructure/dtms/dtm/dtm_read_config.c 
> b/osaf/services/infrastructure/dtms/dtm/dtm_read_config.c
> --- a/osaf/services/infrastructure/dtms/dtm/dtm_read_config.c
> +++ b/osaf/services/infrastructure/dtms/dtm/dtm_read_config.c
> @@ -39,6 +39,7 @@ extern uint32_t intranode_max_processes;
>   #define KEEPALIVE_PROBES 9
>   #define DIS_TIME_OUT 5
>   #define BCAST_FRE 250
> +#define USER_TIMEOUT 1500 // 1.5 sec to match other transport
>   
>   const char *IN6ADDR_LINK_LOCAL = "ff02::1";       /* IPv6, Scope:Link 
> multicast address */
>   const char *IN6ADDR_LINK_GLOBAL = "ff0e::1";      /* IPv6, Scope:Global 
> multicast address */
> @@ -100,7 +101,9 @@ void dtm_print_config(DTM_INTERNODE_CB *
>       TRACE("  %d", config->comm_keepalive_intvl);
>       TRACE("  COMM_KEEPALIVE_PROBES: ");
>       TRACE("  %d", config->comm_keepalive_probes);
> -     TRACE("  DTM_INI_DIS_TIMEOUT_SECS: ");
> +     TRACE("  COMM_TCP_USER_TIMEOUT: ");
> +        TRACE("  %d", config->comm_user_timeout);
> +        TRACE("  DTM_INI_DIS_TIMEOUT_SECS: ");
>       TRACE("  %d", config->initial_dis_timeout);
>       TRACE("  DTM_BCAST_FRE_MSECS: ");
>       TRACE("  %" PRId64 "", config->bcast_msg_freq);
> @@ -254,6 +257,7 @@ int dtm_read_config(DTM_INTERNODE_CB * c
>       config->comm_keepidle_time = KEEPIDLE_TIME;
>       config->comm_keepalive_intvl = KEEPALIVE_INTVL;
>       config->comm_keepalive_probes = KEEPALIVE_PROBES;
> +     config->comm_keepidle_time = USER_TIMEOUT;
>       config->i_addr_family = DTM_IP_ADDR_TYPE_IPV4;
>       config->bcast_msg_freq = BCAST_FRE;
>       config->initial_dis_timeout = DIS_TIME_OUT;
> @@ -475,6 +479,19 @@ int dtm_read_config(DTM_INTERNODE_CB * c
>                               tag_len = 0;
>   
>                       }
> +                     if (strncmp(line, "DTM_TCP_USER_TIMEOUT=", 
> strlen("DTM_TCP_USER_TIMEOUT=")) == 0) {
> +                             tag_len = strlen("DTM_TCP_USER_TIMEOUT=");
> +                             config->comm_user_timeout = 
> atoi(&line[tag_len]);
> +                             if (config->comm_user_timeout < 1) {
[AndersW] Shouldn't we allow the value zero here as well? I.e. it would 
be the same as not setting this configuration option?
> +                                     LOG_ER("DTM:comm_user_timeout must be a 
> positive integer");
> +                                     fclose(dtm_conf_file);
> +                                     return -1;
> +                             }
> +
> +                             tag = 0;
> +                                tag_len = 0;
> +
> +                        }
>                       if (strncmp(line, "DTM_SOCK_SND_RCV_BUF_SIZE=", 
> strlen("DTM_SOCK_SND_RCV_BUF_SIZE=")) == 0) {
>                               tag_len = strlen("DTM_SOCK_SND_RCV_BUF_SIZE=");
>                               uint32_t sndbuf_size = 0; /* Send buffer size */
> diff --git a/osaf/services/infrastructure/dtms/include/dtm.h 
> b/osaf/services/infrastructure/dtms/include/dtm.h
> --- a/osaf/services/infrastructure/dtms/include/dtm.h
> +++ b/osaf/services/infrastructure/dtms/include/dtm.h
> @@ -32,6 +32,10 @@
>   extern DTM_INTERNODE_CB *dtms_gl_cb;
>   extern uint8_t initial_discovery_phase;
>   
> +#ifndef TCP_USER_TIMEOUT
> +#define TCP_USER_TIMEOUT 18
> +#endif
[AndersW] Move the three lines above from the header file to the file 
dtm_node_sockets.c

> +
>   #define BMCAST_MSG_LEN ( sizeof(uint16_t) + sizeof(uint32_t) + 
> sizeof(uint32_t) + sizeof(uint16_t)
>   
>   #define m_NODE_DISCOVERY_TASKNAME       "NODE_DISCOVERY"
> diff --git a/osaf/services/infrastructure/dtms/include/dtm_cb.h 
> b/osaf/services/infrastructure/dtms/include/dtm_cb.h
> --- a/osaf/services/infrastructure/dtms/include/dtm_cb.h
> +++ b/osaf/services/infrastructure/dtms/include/dtm_cb.h
> @@ -89,6 +89,7 @@ typedef struct dtm_internode_cb {
>     int comm_keepidle_time;
>     int comm_keepalive_intvl;
>     int comm_keepalive_probes;
> +  int comm_user_timeout; // tcp socket user timeout in milliseconds [ms]
[AndersW] This line above adds a new cpplint warning, which should be fixed:

osaf/services/infrastructure/dtms/include/dtm_cb.h:92:  At least two 
spaces is best between code and comments  [whitespace/comments] [2]

>     int32_t sock_sndbuf_size; /* The value of SO_SNDBUF */
>     int32_t sock_rcvbuf_size; /* The value of SO_RCVBUF */
>     SYSF_MBX mbx;


------------------------------------------------------------------------------
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to