I reverted this changeset from the trunk because it incorrectly re-added the
local coprocessor data to the HNP's hash table. That is already done
in the ess/hnp module, and there is no value in repeating it every
time a daemon calls back.

As noted in the revert comment, if we want host daemons to retain their
coprocessor info in a hash table, then we need to do that somewhere else,
not where this was done. At this time, I don't see the daemons using that
info anywhere.



On Wed, Oct 23, 2013 at 8:56 AM, <svn-commit-mai...@open-mpi.org> wrote:

> Author: hjelmn (Nathan Hjelm)
> Date: 2013-10-23 11:56:23 EDT (Wed, 23 Oct 2013)
> New Revision: 29489
> URL: https://svn.open-mpi.org/trac/ompi/changeset/29489
>
> Log:
> Fix coprocessor detection by always adding the local daemon's co-processors
> to the hash table.
>
> Tested and working on a system with 2 Xeon Phi co-processors.
>
> cmr=v1.7.4:ticket=3847:reviewer=ompi-rm1.7
>
> Text files modified:
>    trunk/orte/mca/plm/base/plm_base_launch_support.c |    40
> +++++++++++++++++++++++++++++++++++++---
>    1 files changed, 37 insertions(+), 3 deletions(-)
>
> Modified: trunk/orte/mca/plm/base/plm_base_launch_support.c
>
> ==============================================================================
> --- trunk/orte/mca/plm/base/plm_base_launch_support.c   Wed Oct 23
> 11:52:05 2013        (r29488)
> +++ trunk/orte/mca/plm/base/plm_base_launch_support.c   2013-10-23
> 11:56:23 EDT (Wed, 23 Oct 2013)      (r29489)
> @@ -1,3 +1,4 @@
> +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
>  /*
>   * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
>   *                         University Research and Technology
> @@ -12,7 +13,8 @@
>   * Copyright (c) 2007-2011 Cisco Systems, Inc.  All rights reserved.
>   * Copyright (c) 2009      Institut National de Recherche en Informatique
>   *                         et Automatique. All rights reserved.
> - * Copyright (c) 2011-2012 Los Alamos National Security, LLC.
> + * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
> + *                         reserved.
>   * Copyright (c) 2013      Intel, Inc. All rights reserved.
>   * $COPYRIGHT$
>   *
> @@ -677,6 +679,38 @@
>          jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
>      }
>
> +#if OPAL_HAVE_HWLOC
> +    {
> +        char *coprocessors, **sns;
> +
> +        /* detect and add any of my coprocessors to the hash table */
> +        coprocessors =
> opal_hwloc_base_find_coprocessors(opal_hwloc_topology);
> +
> +        if (NULL != coprocessors) {
> +            /* init the hash table, if necessary */
> +            if (NULL == orte_coprocessors) {
> +                orte_coprocessors = OBJ_NEW(opal_hash_table_t);
> +                opal_hash_table_init(orte_coprocessors,
> orte_process_info.num_procs);
> +            }
> +            /* separate the serial numbers of the coprocessors
> +             * on this host
> +             */
> +            sns = opal_argv_split(coprocessors, ',');
> +            for (int idx = 0 ; NULL != sns[idx] ; ++idx) {
> +                uint32_t h;
> +
> +                /* compute the hash */
> +                OPAL_HASH_STR(sns[idx], h);
> +                /* mark that this coprocessor is hosted by this daemon */
> +                opal_hash_table_set_value_uint32(orte_coprocessors, h,
> (void*)&ORTE_PROC_MY_NAME->vpid);
> +            }
> +            opal_argv_free(sns);
> +            free(coprocessors);
> +            orte_coprocessors_detected = true;
> +        }
> +    }
> +#endif
> +
>      /* multiple daemons could be in this buffer, so unpack until we
> exhaust the data */
>      idx = 1;
>      while (OPAL_SUCCESS == (rc = opal_dss.unpack(buffer, &dname, &idx,
> ORTE_NAME))) {
> @@ -1271,7 +1305,7 @@
>              /* check for duplicate */
>              ignore = false;
>              for (j=0; j < *argc; j++) {
> -             if (0 == strcmp((*argv)[j], orted_cmd_line[i+1])) {
> +              if (0 == strcmp((*argv)[j], orted_cmd_line[i+1])) {
>                      ignore = true;
>                      break;
>                  }
> @@ -1589,7 +1623,7 @@
>          OBJ_DESTRUCT(&nodes);
>          /* mark that the daemons have reported so we can proceed */
>          daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
> -       daemons->updated = false;
> +        daemons->updated = false;
>          return ORTE_SUCCESS;
>      }
>
> _______________________________________________
> svn mailing list
> s...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/svn
>

Reply via email to