Minor comments inline.

On 2017-10-05 17:22, Jonathan Rajotte wrote:
> Allow relayd to clean-up objects related to a dead connection
> for which the FIN packet was no emitted (Unexpected shutdown,
> ethernet blocking). Note that an idle peer is not considered dead given
> that it respond to the keep-alive query after the idle time is elapsed.
> 
> By RFC 1122-4.2.3.6 implementation must default to no less than two
> hours for the idle period. On linux the default value is indeed 2 hours.
> This could be problematic if relayd should be aggressive regarding
> dead-peers. Hence it is important to provide tuning knob regarding the
> tcp keep-alive mechanism.
> 
> The following environments variable can be used to enable and fine-tune
> it:
>     LTTNG_RELAYD_TCP_KEEP_ALIVE_ENABLE
>         Set to 1 to enable the use of tcp keep-alive allowing the detection
>         of dead peers.
> 
>     LTTNG_RELAYD_TCP_KEEP_ALIVE_TIME
>         See tcp(7) tcp_keepalive_time or tcp_keepalive_interval on
>       Solaris 11.
>         A value of -1 lets the operating system manage this parameter
>         (default).
> 
>     LTTNG_RELAYD_TCP_KEEP_ALIVE_PROBES
>         See tcp(7) tcp_keepalive_probes.
>         A value of -1 lets the operating system manage this
>         parameter (default).
>       No effect on Solaris.
> 
>     LTTNG_RELAYD_TCP_KEEP_ALIVE_INTVL`::
>         See tcp(7) tcp_keepalive_intvl.
>         A value of -1 lets the operating system manage
>         his parameter (default).
> 
> Signed-off-by: Jonathan Rajotte <[email protected]>
> ---
>  src/bin/lttng-relayd/Makefile.am      |   3 +-
>  src/bin/lttng-relayd/main.c           |  14 ++
>  src/bin/lttng-relayd/tcp_keep_alive.c | 243 
> ++++++++++++++++++++++++++++++++++
>  src/bin/lttng-relayd/tcp_keep_alive.h |  25 ++++
>  4 files changed, 284 insertions(+), 1 deletion(-)
>  create mode 100644 src/bin/lttng-relayd/tcp_keep_alive.c
>  create mode 100644 src/bin/lttng-relayd/tcp_keep_alive.h
> 
> diff --git a/src/bin/lttng-relayd/Makefile.am 
> b/src/bin/lttng-relayd/Makefile.am
> index c7dd37e1..5125c721 100644
> --- a/src/bin/lttng-relayd/Makefile.am
> +++ b/src/bin/lttng-relayd/Makefile.am
> @@ -21,7 +21,8 @@ lttng_relayd_SOURCES = main.c lttng-relayd.h utils.h 
> utils.c cmd.h \
>                         stream-fd.c stream-fd.h \
>                         connection.c connection.h \
>                         viewer-session.c viewer-session.h \
> -                       tracefile-array.c tracefile-array.h
> +                       tracefile-array.c tracefile-array.h \
> +                       tcp_keep_alive.c tcp_keep_alive.h
>  
>  # link on liblttngctl for check if relayd is already alive.
>  lttng_relayd_LDADD = -lurcu-common -lurcu \
> diff --git a/src/bin/lttng-relayd/main.c b/src/bin/lttng-relayd/main.c
> index 0eb8e282..b43743a5 100644
> --- a/src/bin/lttng-relayd/main.c
> +++ b/src/bin/lttng-relayd/main.c
> @@ -70,6 +70,7 @@
>  #include "stream.h"
>  #include "connection.h"
>  #include "tracefile-array.h"
> +#include "tcp_keep_alive.h"
>  
>  static const char *help_msg =
>  #ifdef LTTNG_EMBED_HELP
> @@ -899,6 +900,14 @@ restart:
>                                       lttcomm_destroy_sock(newsock);
>                                       goto error;
>                               }
> +
> +                             ret = tcp_keepalive_setsockopt(newsock->fd);
> +                             if (ret < 0) {
> +                                     PERROR("setsockopt tcp_keep_alive");
> +                                     lttcomm_destroy_sock(newsock);
> +                                     goto error;
> +                             }
> +
>                               new_conn = connection_create(newsock, type);
>                               if (!new_conn) {
>                                       lttcomm_destroy_sock(newsock);
> @@ -2755,6 +2764,11 @@ int main(int argc, char **argv)
>               goto exit_options;
>       }
>  
> +     if (tcp_keepalive_get_settings()) {
> +             retval = -1;
> +             goto exit_options;
> +     }
> +
>       if (set_signal_handler()) {
>               retval = -1;
>               goto exit_options;
> diff --git a/src/bin/lttng-relayd/tcp_keep_alive.c 
> b/src/bin/lttng-relayd/tcp_keep_alive.c
> new file mode 100644
> index 00000000..d61802d6
> --- /dev/null
> +++ b/src/bin/lttng-relayd/tcp_keep_alive.c
> @@ -0,0 +1,243 @@
> +/*
> + * Copyright (C) 2017 - Jonathan Rajotte 
> <[email protected]>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2 only,
> + * as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful, but 
> WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, write to the Free Software Foundation, Inc.,
> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> + */
> +
> +#include <sys/types.h>
> +#include <netinet/tcp.h>
> +#include <stdbool.h>
> +#include <sys/socket.h>
> +
> +#include <common/compat/getenv.h>
> +
> +#include "tcp_keep_alive.h"
> +
> +#define LTTNG_RELAYD_TCP_KEEP_ALIVE_ENABLE_ENV 
> "LTTNG_RELAYD_TCP_KEEP_ALIVE_ENABLE"
> +#define LTTNG_RELAYD_TCP_KEEP_ALIVE_TIME_ENV 
> "LTTNG_RELAYD_TCP_KEEP_ALIVE_TIME"
> +#define LTTNG_RELAYD_TCP_KEEP_ALIVE_PROBES_ENV 
> "LTTNG_RELAYD_TCP_KEEP_ALIVE_PROBES"
> +#define LTTNG_RELAYD_TCP_KEEP_ALIVE_INTVL_ENV 
> "LTTNG_RELAYD_TCP_KEEP_ALIVE_INTVL"
> +
> +#ifdef __sun

We use "__sun__" in the rest of the codebase.

> +#define COMPAT_TCP_KEEPIDLE TCP_KEEPALIVE_THRESHOLD
> +#define SOL_TCP IPPROTO_TCP /* Solaris does not support SOL_TCP */
> +#else
> +#define COMPAT_TCP_KEEPIDLE TCP_KEEPIDLE
> +#endif /* __sun */
> +
> +static bool tcp_keepalive_enabled = false;
> +static int tcp_keepalive_time = -1;
> +static int tcp_keepalive_intvl = -1;
> +static int tcp_keepalive_probes = -1;
> +
> +static bool tcp_keepalive_time_valid(int value)
> +{
> +#ifndef __sun
> +     return true;
> +#else
> +#ifdef TCP_KEEPALIVE_THRESHOLD
> +     /*
> +      * Minimum 10s , maximum 10 days. Defined by
> +      * 
> https://docs.oracle.com/cd/E23824_01/html/821-1475/tcp-7p.html#REFMAN7tcp-7p
> +      */
> +     if (value < 10 || value > 864000) {
> +             return false;
> +     }
> +     return true;
> +#else
> +     WARN("Solaris 10 does not support local override of the 
> TCP_KEEP_ALIVE_THRESHOLD. " LTTNG_RELAYD_TCP_KEEP_ALIVE_TIME_ENV);
> +     return false;
> +#endif /* TCP_KEEPALIVE_THRESHOLD */
> +#endif /* __sun */
> +}

I think it would be more readable this way :

#ifdef __sun__
static bool tcp_keepalive_time_valid(int value)
{ ... }
#else
static bool tcp_keepalive_time_valid(int value)
{
  return true;
}
#endif

> +
> +int tcp_keepalive_get_settings(void)
> +{
> +     int ret;
> +     const char *value;
> +
> +     value = lttng_secure_getenv(LTTNG_RELAYD_TCP_KEEP_ALIVE_ENABLE_ENV);
> +     if (value && !strcmp(value, "1")) {
> +             tcp_keepalive_enabled = true;
> +     } else {
> +             tcp_keepalive_enabled = false;
> +             ret = 0;
> +             goto disabled;
> +     }
> +
> +     /* Get value for tcp_keepalive_time in seconds*/
> +     value = lttng_secure_getenv(LTTNG_RELAYD_TCP_KEEP_ALIVE_TIME_ENV);
> +     if (value) {
> +             int tmp;
> +             errno = 0;
> +             tmp = (int) strtol(value, NULL, 0);
> +             if (errno != 0 || tmp < -1) {
> +                     PERROR("TCP_KEEP_ALIVE time parse");
> +                     ret = 1;
> +                     goto error;
> +             } else {
> +                     if (!tcp_keepalive_time_valid(tmp)) {
> +                             ERR("TCP_KEEP_ALIVE time invalid value");
> +                             ret = 1;
> +                             goto error;
> +                     }
> +#ifdef __sun
> +                     /*
> +                      * Under solaris this value is expressed in
> +                      * milliseconds. Fits in a int.
> +                      */
> +                     tmp = tmp * 1000;

Maybe use MSEC_PER_SEC from src/common/time.h

> +#endif /* ifdef __sun */
> +                     tcp_keepalive_time = tmp;
> +             }
> +     }
> +
> +
> +     /* Get value for tcp_keepalive_intvl in seconds */
> +     value = lttng_secure_getenv(LTTNG_RELAYD_TCP_KEEP_ALIVE_INTVL_ENV);
> +     if (value) {
> +#ifdef __sun
> +             WARN("Solaris does not support local override of 
> tcp_keepalive_intvl. " LTTNG_RELAYD_TCP_KEEP_ALIVE_INTVL_ENV);
> +             ret = 1;
> +             goto error;
> +#else
> +             int tmp;
> +             errno = 0;
> +             tmp = (int) strtol(value, NULL, 0);
> +             if (errno != 0 || tmp < -1) {
> +                     PERROR("TCP_KEEP_ALIVE interval parse");
> +                     ret = 1;
> +                     goto error;
> +             } else {
> +                     if (tmp > 0) {
> +                             tcp_keepalive_intvl = tmp;
> +                     }
> +             }
> +#endif /* __sun */
> +     }
> +
> +     /* Get value for tcp_keepalive_probes */
> +     value = lttng_secure_getenv(LTTNG_RELAYD_TCP_KEEP_ALIVE_PROBES_ENV);
> +     if (value) {
> +#ifdef __sun
> +             WARN("Solaris does not support local override of 
> tcp_keepalive_probes. " LTTNG_RELAYD_TCP_KEEP_ALIVE_PROBES_ENV);
> +             ret = 1;
> +             goto error;
> +#else
> +             int tmp;
> +             errno = 0;
> +             tmp = (int) strtol(value, NULL, 0);
> +             if (errno != 0 || tmp < -1) {
> +                     PERROR("TCP_KEEP_ALIVE probes parse");
> +                     ret = 1;
> +                     goto error;
> +             } else {
> +                     if (tmp > 0) {
> +                             tcp_keepalive_probes = tmp;
> +                     }
> +             }
> +#endif /* __sun */
> +     }
> +
> +     DBG("TCP_KEEP_ALIVE enabled");
> +     if (tcp_keepalive_time > -1) {
> +             DBG("Overwrite tcp_keepalive_time to %d", tcp_keepalive_time);
> +     }
> +     if (tcp_keepalive_intvl > -1) {
> +             DBG("Overwrite tcp_keepalive_intvl to %d", tcp_keepalive_intvl);
> +     }
> +     if (tcp_keepalive_probes > -1) {
> +             DBG("Overwrite tcp_keepalive_time to %d", tcp_keepalive_probes);
> +     }
> +     ret = 0;
> +
> +error:
> +disabled:
> +     return ret;
> +}
> +
> +/*
> + * Set the socket options regarding tcp_keepalive.
> + */
> +int tcp_keepalive_setsockopt(int fd)
> +{
> +     int ret;
> +     int val = 1;
> +     int optval;
> +     socklen_t optlen = sizeof(optval);
> +
> +     if (!tcp_keepalive_enabled) {
> +             ret = 0;
> +             goto end;
> +     }
> +
> +     ret = setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &val,
> +                     sizeof(val));
> +     if (ret < 0) {
> +             PERROR("setsockopt so_keepalive");
> +             goto end;
> +     }
> +
> +     /* Check the status for the keepalive option */
> +     if(getsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &optval, &optlen) < 0) {
> +             perror("getsockopt()");
> +     }
> +     WARN("SO_KEEPALIVE is %s", (optval ? "ON" : "OFF"));
> +
> +#if !defined(__sun) || (defined(__sun) && defined(TCP_KEEPALIVE_THRESHOLD))
> +     if (tcp_keepalive_time > -1) {
> +             ret = setsockopt(fd, SOL_TCP, COMPAT_TCP_KEEPIDLE, 
> &tcp_keepalive_time,
> +                             sizeof(tcp_keepalive_time));
> +             if (ret < 0) {
> +                     PERROR("setsockopt TCP_KEEPIDLE");
> +                     goto end;
> +             }
> +             /* Check the status for the keepalive option */
> +             if(getsockopt(fd, SOL_TCP, COMPAT_TCP_KEEPIDLE, &optval, 
> &optlen) < 0) {
> +                     perror("getsockopt()");
> +             }
> +             WARN("SO_ keep_idle value is %d", optval);
> +     }
> +#endif /* ! defined(__sun) || (defined(__sun) && 
> defined(TCP_KEEPALIVE_THRESHOLD)) */
> +
> +     /* Sun does not support either tcp_keepalive_probes or
> +      * tcp_keepalive_intvl. Inferring a value for
> +      * TCP_KEEPALIVE_ABORT_THRESHOLD doing
> +      * tcp_keepalive_probes * tcp_keepalive_intvl could yield a good
> +      * alternative but Solaris does not detail the algorithm used (constant
> +      * time retry like linux or somthing fancier). So simply ignore those
> +      * setting on solaris for now.
> +      */
> +#ifndef __sun
> +     if (tcp_keepalive_intvl > -1) {
> +             ret = setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, 
> &tcp_keepalive_intvl,
> +                             sizeof(tcp_keepalive_intvl));
> +             if (ret < 0) {
> +                     PERROR("setsockopt TCP_KEEPINTVL");
> +                     goto end;
> +             }
> +     }
> +
> +     if (tcp_keepalive_probes > -1) {
> +             ret = setsockopt(fd, SOL_TCP, TCP_KEEPCNT, 
> &tcp_keepalive_probes,
> +                             sizeof(tcp_keepalive_probes));
> +             if (ret < 0) {
> +                     PERROR("setsockopt TCP_KEEPCNT");
> +                     goto end;
> +             }
> +     }
> +#endif /* __sun */
> +end:
> +     return ret;
> +}
> diff --git a/src/bin/lttng-relayd/tcp_keep_alive.h 
> b/src/bin/lttng-relayd/tcp_keep_alive.h
> new file mode 100644
> index 00000000..cd8025a0
> --- /dev/null
> +++ b/src/bin/lttng-relayd/tcp_keep_alive.h
> @@ -0,0 +1,25 @@
> +#ifndef RELAYD_TCP_KEEP_ALIVE_H
> +#define RELAYD_TCP_KEEP_ALIVE_H
> +
> +/*
> + * Copyright (C) 2017 - Jonathan Rajotte 
> <[email protected]>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2 only,
> + * as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful, but 
> WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, write to the Free Software Foundation, Inc.,
> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> + */
> +
> +
> +int tcp_keepalive_get_settings(void);
> +int tcp_keepalive_setsockopt(int socket_fd);
> +
> +#endif /* RELAYD_UTILS_H */
> 

_______________________________________________
lttng-dev mailing list
[email protected]
https://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev

Reply via email to