Ralph,

The code pushed in g8e30579 is clearly not the right solution.

The problem starts in oob_tcp_listener.c line 742. A new
mca_oob_tcp_pending_connection_t object is allocated to store the incoming
connection. The accept a few lines below fails with an error code of 0x23,
which means "Resource temporarily unavailable" on OS X (i.e. EAGAIN). Thus,
the if at line 750 is skipped, and we reach line 763 (a "continue") with 1)
a connection not accepted, and 2) an allocated object not released. Voila!

Freeing the pending_connection object is not the right approach either, as
it will only remove the memory leak, while the process will still become a
CPU hog.

  Thanks,
    George.




On Thu, May 14, 2015 at 8:10 PM, <git...@crest.iu.edu> wrote:

> This is an automated email from the git hooks/post-receive script. It was
> generated because a ref change was pushed to the repository containing
> the project "open-mpi/ompi".
>
> The branch, master has been updated
>        via  8e30579e6efab580cf9cf1bec8f8df1376b7e9ef (commit)
>       from  1488e82efd1d09c30ba46dfa00b89e623623272f (commit)
>
> Those revisions listed above that are new to this repository have
> not appeared on any other notification email; so we list those
> revisions in full, below.
>
> - Log -----------------------------------------------------------------
>
> https://github.com/open-mpi/ompi/commit/8e30579e6efab580cf9cf1bec8f8df1376b7e9ef
>
> commit 8e30579e6efab580cf9cf1bec8f8df1376b7e9ef
> Author: Ralph Castain <r...@open-mpi.org>
> Date:   Thu May 14 18:09:13 2015 -0600
>
>     The Mac appears to have problems with the keepalive support - once
> keepalive starts, the memory footprint soars. So disable keepalive on the
> Mac
>
> diff --git a/config/opal_check_os_flavors.m4
> b/config/opal_check_os_flavors.m4
> index d1d124d..4939560 100644
> --- a/config/opal_check_os_flavors.m4
> +++ b/config/opal_check_os_flavors.m4
> @@ -57,6 +57,12 @@ AC_DEFUN([OPAL_CHECK_OS_FLAVORS],
>                         [$opal_have_solaris],
>                         [Whether or not we have solaris])
>
> +    AS_IF([test "$opal_found_apple" = "yes"],
> +          [opal_have_mac=1], [opal_have_mac=0])
> +    AC_DEFINE_UNQUOTED([OPAL_HAVE_MAC],
> +                       [$opal_have_mac],
> +                       [Whether or not we are on a Mac])
> +
>      # check for sockaddr_in (a good sign we have TCP)
>      AC_CHECK_HEADERS([netdb.h netinet/in.h netinet/tcp.h])
>      AC_CHECK_TYPES([struct sockaddr_in],
> diff --git a/orte/mca/oob/tcp/oob_tcp_common.c
> b/orte/mca/oob/tcp/oob_tcp_common.c
> index a768472..e3decf2 100644
> --- a/orte/mca/oob/tcp/oob_tcp_common.c
> +++ b/orte/mca/oob/tcp/oob_tcp_common.c
> @@ -72,7 +72,7 @@
>  /**
>   * Set socket buffering
>   */
> -
> +#if defined(SO_KEEPALIVE) && !OPAL_HAVE_MAC
>  static void set_keepalive(int sd)
>  {
>      int option;
> @@ -146,6 +146,7 @@ static void set_keepalive(int sd)
>      }
>  #endif  // TCP_KEEPCNT
>  }
> +#endif //SO_KEEPALIVE
>
>  void orte_oob_tcp_set_socket_options(int sd)
>  {
> @@ -181,7 +182,7 @@ void orte_oob_tcp_set_socket_options(int sd)
>                              opal_socket_errno);
>      }
>  #endif
> -#if defined(SO_KEEPALIVE)
> +#if defined(SO_KEEPALIVE) && !OPAL_HAVE_MAC
>      if (0 < mca_oob_tcp_component.keepalive_time) {
>          set_keepalive(sd);
>      }
> diff --git a/orte/mca/oob/tcp/oob_tcp_component.c
> b/orte/mca/oob/tcp/oob_tcp_component.c
> index dd1af2a..372ed4c 100644
> --- a/orte/mca/oob/tcp/oob_tcp_component.c
> +++ b/orte/mca/oob/tcp/oob_tcp_component.c
> @@ -404,7 +404,7 @@ static int tcp_component_register(void)
>
>  &mca_oob_tcp_component.disable_ipv6_family);
>  #endif
>
> -
> +#if !OPAL_HAVE_MAC
>      mca_oob_tcp_component.keepalive_time = 10;
>      (void)mca_base_component_var_register(component, "keepalive_time",
>                                            "Idle time in seconds before
> starting to send keepalives (num <= 0 ----> disable keepalive)",
> @@ -427,7 +427,8 @@ static int tcp_component_register(void)
>                                            OPAL_INFO_LVL_9,
>                                            MCA_BASE_VAR_SCOPE_READONLY,
>
>  &mca_oob_tcp_component.keepalive_probes);
> -
> +#endif
> +
>      mca_oob_tcp_component.retry_delay = 0;
>      (void)mca_base_component_var_register(component, "retry_delay",
>                                            "Time (in sec) to wait before
> trying to connect to peer again",
>
>
> -----------------------------------------------------------------------
>
> Summary of changes:
>  config/opal_check_os_flavors.m4      | 6 ++++++
>  orte/mca/oob/tcp/oob_tcp_common.c    | 5 +++--
>  orte/mca/oob/tcp/oob_tcp_component.c | 5 +++--
>  3 files changed, 12 insertions(+), 4 deletions(-)
>
>
> hooks/post-receive
> --
> open-mpi/ompi
> _______________________________________________
> ompi-commits mailing list
> ompi-comm...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/ompi-commits
>

Reply via email to