On Thu, 9 Jun 2011 06:05:47 -0700, "Andrej N. Gritsenko" <[email protected]> wrote:
>     Hello there!
> 
>     Currently there are no stackable plugins in slurmd, but sometimes it is
> crucial to do something in the system environment before slurmstepd is even
> forked. The task plugin seems the best one to make stackable, and the
> attached patch does so. It is still fully compatible with the old behavior,
> but adds a new one: the TaskPlugin variable in slurm.conf can be a
> comma-separated list of plugins, where each element can be either a
> 'task/plugin' or just a 'plugin' name, as for the JobSubmitPlugins variable.
> No documentation changes were made, and the TaskPluginParams variable was
> not touched; it still applies only to task/affinity.
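
For illustration, with such a patch applied the TaskPlugin line in slurm.conf
could look roughly like this ("site_env" is only a hypothetical site-local
plugin name used as an example, not part of the patch):

    TaskPlugin=affinity,task/site_env

Both the bare name and the "task/"-prefixed form would be accepted, and the
listed plugins would be invoked in the order given.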


There are stackable plugins in SLURM; see the spank(8) manpage.
In fact, replacing the "task" plugin is the main reason the spank
framework was initially developed.
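
For reference, a minimal spank plugin sketch (untested; the plugin name
"site_env" and the install path below are only examples) that runs a hook in
each task just before the user command is exec'd:

    #include <slurm/spank.h>

    /*
     * Minimal sketch of a stackable plugin using the spank framework.
     * slurmstepd loads it from plugstack.conf and calls the hooks below
     * at well-defined points of the job-step life cycle.
     */
    SPANK_PLUGIN(site_env, 1);

    /* Called in the task context just before the user command is exec'd. */
    int slurm_spank_task_init(spank_t sp, int ac, char **av)
    {
            /* Adjust the task environment here, e.g. via spank_setenv(). */
            slurm_info("site_env: preparing task environment");
            return ESPANK_SUCCESS;
    }

It would be enabled by a line in plugstack.conf such as

    required /usr/lib/slurm/site_env.so

and several such plugins can be stacked simply by listing more of them.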

mark


>     Hope to see it in the next SLURM.
>     Andriy.
> 
> diff -udpr slurm-2.2.6.orig/src/slurmd/common/task_plugin.c slurm-2.2.6/src/slurmd/common/task_plugin.c
> --- slurm-2.2.6.orig/src/slurmd/common/task_plugin.c  2011-05-27 21:25:06.000000000 +0300
> +++ slurm-2.2.6/src/slurmd/common/task_plugin.c       2011-06-08 14:52:13.000000000 +0300
> @@ -73,7 +73,8 @@ typedef struct slurmd_task_context {
>       slurmd_task_ops_t       ops;
>  } slurmd_task_context_t;
>  
> -static slurmd_task_context_t *g_task_context = NULL;
> +static slurmd_task_context_t **g_task_context = NULL;
> +static int                   g_task_context_num = -1;
>  static pthread_mutex_t               g_task_context_lock = PTHREAD_MUTEX_INITIALIZER;
>  
>  
> @@ -195,34 +196,57 @@ _slurmd_task_context_destroy(slurmd_task
>   */
>  extern int slurmd_task_init(void)
>  {
> -     int retval = SLURM_SUCCESS;
> +     int retval = SLURM_SUCCESS, i;
>       char *task_plugin_type = NULL;
> +     char *last = NULL, *task_plugin_list, *task_plugin = NULL;
>  
>       slurm_mutex_lock( &g_task_context_lock );
>  
> -     if ( g_task_context )
> +     if ( g_task_context_num >= 0 )
>               goto done;
>  
>       task_plugin_type = slurm_get_task_plugin();
> -     g_task_context = _slurmd_task_context_create( task_plugin_type );
> -     if ( g_task_context == NULL ) {
> -             error( "cannot create task context for %s",
> -                      task_plugin_type );
> -             retval = SLURM_ERROR;
> +     g_task_context_num = 0; /* mark it before anything else */
> +     if (task_plugin_type == NULL || task_plugin_type[0] == '\0')
>               goto done;
> -     }
>  
> -     if ( _slurmd_task_get_ops( g_task_context ) == NULL ) {
> -             error( "cannot resolve task plugin operations" );
> -             _slurmd_task_context_destroy( g_task_context );
> -             g_task_context = NULL;
> -             retval = SLURM_ERROR;
> +     task_plugin_list = task_plugin_type;
> +     while ((task_plugin = strtok_r(task_plugin_list, ",", &last))) {
> +             i = g_task_context_num++;
> +             xrealloc(g_task_context,
> +                      (sizeof(slurmd_task_context_t *) * g_task_context_num));
> +             if (strncmp(task_plugin, "task/", 5) == 0)
> +                     task_plugin += 5; /* backward compatibility */
> +             task_plugin = xstrdup_printf("task/%s", task_plugin);
> +             g_task_context[i] = _slurmd_task_context_create( task_plugin );
> +             if ( g_task_context[i] == NULL ) {
> +                     error( "cannot create task context for %s",
> +                              task_plugin );
> +                     goto error;
> +             }
> +
> +             if ( _slurmd_task_get_ops( g_task_context[i] ) == NULL ) {
> +                     error( "cannot resolve task plugin operations for %s",
> +                            task_plugin );
> +                     goto error;
> +             }
> +             xfree(task_plugin);
> +             task_plugin_list = NULL; /* for next iteration */
>       }
>  
>   done:
>       slurm_mutex_unlock( &g_task_context_lock );
>       xfree(task_plugin_type);
>       return retval;
> +
> +error:
> +     xfree(task_plugin);
> +     retval = SLURM_ERROR;
> +     for (i = 0; i < g_task_context_num; i++)
> +             if (g_task_context[i])
> +                     _slurmd_task_context_destroy(g_task_context[i]);
> +     xfree(g_task_context);
> +     goto done;
>  }
>  
>  /*
> @@ -232,13 +256,21 @@ extern int slurmd_task_init(void)
>   */
>  extern int slurmd_task_fini(void)
>  {
> -     int rc;
> +     int i, rc = SLURM_SUCCESS;
>  
> +     slurm_mutex_lock( &g_task_context_lock );
>       if (!g_task_context)
> -             return SLURM_SUCCESS;
> +             goto done;
>  
> -     rc = _slurmd_task_context_destroy(g_task_context);
> -     g_task_context = NULL;
> +     for (i = 0; i < g_task_context_num; i++)
> +             if (_slurmd_task_context_destroy(g_task_context[i]) != SLURM_SUCCESS)
> +                     rc = SLURM_ERROR;
> +
> +     xfree(g_task_context);
> +     g_task_context_num = -1;
> +
> +done:
> +     slurm_mutex_unlock( &g_task_context_lock );
>       return rc;
>  }
>  
> @@ -249,10 +281,17 @@ extern int slurmd_task_fini(void)
>   */
>  extern int slurmd_batch_request(uint32_t job_id, batch_job_launch_msg_t *req)
>  {
> +     int i, rc = SLURM_SUCCESS;
> +
>       if (slurmd_task_init())
>               return SLURM_ERROR;
>  
> -     return (*(g_task_context->ops.slurmd_batch_request))(job_id, req);
> +     slurm_mutex_lock( &g_task_context_lock );
> +     for (i = 0; ((i < g_task_context_num) && (rc == SLURM_SUCCESS)); i++)
> +             rc = (*(g_task_context[i]->ops.slurmd_batch_request))(job_id, req);
> +     slurm_mutex_unlock( &g_task_context_lock );
> +
> +     return (rc);
>  }
>  
>  /*
> @@ -264,10 +303,18 @@ extern int slurmd_launch_request(uint32_
>                                launch_tasks_request_msg_t *req,
>                                uint32_t node_id)
>  {
> +     int i, rc = SLURM_SUCCESS;
> +
>       if (slurmd_task_init())
>               return SLURM_ERROR;
>  
> -     return (*(g_task_context->ops.slurmd_launch_request))(job_id, req, node_id);
> +     slurm_mutex_lock( &g_task_context_lock );
> +     for (i = 0; ((i < g_task_context_num) && (rc == SLURM_SUCCESS)); i++)
> +             rc = (*(g_task_context[i]->ops.slurmd_launch_request))(job_id,
> +                                                             req, node_id);
> +     slurm_mutex_unlock( &g_task_context_lock );
> +
> +     return (rc);
>  }
>  
>  /*
> @@ -279,10 +326,18 @@ extern int slurmd_reserve_resources(uint
>                                   launch_tasks_request_msg_t *req,
>                                   uint32_t node_id )
>  {
> +     int i, rc = SLURM_SUCCESS;
> +
>       if (slurmd_task_init())
>               return SLURM_ERROR;
>  
> -     return (*(g_task_context->ops.slurmd_reserve_resources))(job_id, req, node_id);
> +     slurm_mutex_lock( &g_task_context_lock );
> +     for (i = 0; ((i < g_task_context_num) && (rc == SLURM_SUCCESS)); i++)
> +             rc = (*(g_task_context[i]->ops.slurmd_reserve_resources))(job_id,
> +                                                             req, node_id);
> +     slurm_mutex_unlock( &g_task_context_lock );
> +
> +     return (rc);
>  }
>  
>  /*
> @@ -292,10 +347,17 @@ extern int slurmd_reserve_resources(uint
>   */
>  extern int slurmd_suspend_job(uint32_t job_id)
>  {
> +     int i, rc = SLURM_SUCCESS;
> +
>       if (slurmd_task_init())
>               return SLURM_ERROR;
>  
> -     return (*(g_task_context->ops.slurmd_suspend_job))(job_id);
> +     slurm_mutex_lock( &g_task_context_lock );
> +     for (i = 0; ((i < g_task_context_num) && (rc == SLURM_SUCCESS)); i++)
> +             rc = (*(g_task_context[i]->ops.slurmd_suspend_job))(job_id);
> +     slurm_mutex_unlock( &g_task_context_lock );
> +
> +     return (rc);
>  }
>  
>  /*
> @@ -305,10 +367,17 @@ extern int slurmd_suspend_job(uint32_t j
>   */
>  extern int slurmd_resume_job(uint32_t job_id)
>  {
> +     int i, rc = SLURM_SUCCESS;
> +
>       if (slurmd_task_init())
>               return SLURM_ERROR;
>  
> -     return (*(g_task_context->ops.slurmd_resume_job))(job_id);
> +     slurm_mutex_lock( &g_task_context_lock );
> +     for (i = 0; ((i < g_task_context_num) && (rc == SLURM_SUCCESS)); i++)
> +             rc = (*(g_task_context[i]->ops.slurmd_resume_job))(job_id);
> +     slurm_mutex_unlock( &g_task_context_lock );
> +
> +     return (rc);
>  }
>  
>  /*
> @@ -318,10 +387,17 @@ extern int slurmd_resume_job(uint32_t jo
>   */
>  extern int slurmd_release_resources(uint32_t job_id)
>  {
> +     int i, rc = SLURM_SUCCESS;
> +
>       if (slurmd_task_init())
>               return SLURM_ERROR;
>  
> -     return (*(g_task_context->ops.slurmd_release_resources))(job_id);
> +     slurm_mutex_lock( &g_task_context_lock );
> +     for (i = 0; ((i < g_task_context_num) && (rc == SLURM_SUCCESS)); i++)
> +             rc = (*(g_task_context[i]->ops.slurmd_release_resources))(job_id);
> +     slurm_mutex_unlock( &g_task_context_lock );
> +
> +     return (rc);
>  }
>  
>  /*
> @@ -332,10 +408,17 @@ extern int slurmd_release_resources(uint
>   */
>  extern int pre_setuid(slurmd_job_t *job)
>  {
> +     int i, rc = SLURM_SUCCESS;
> +
>       if (slurmd_task_init())
>               return SLURM_ERROR;
>  
> -     return (*(g_task_context->ops.pre_setuid))(job);
> +     slurm_mutex_lock( &g_task_context_lock );
> +     for (i = 0; ((i < g_task_context_num) && (rc == SLURM_SUCCESS)); i++)
> +             rc = (*(g_task_context[i]->ops.pre_setuid))(job);
> +     slurm_mutex_unlock( &g_task_context_lock );
> +
> +     return (rc);
>  }
>  
>  /*
> @@ -345,10 +428,17 @@ extern int pre_setuid(slurmd_job_t *job)
>   */
>  extern int pre_launch(slurmd_job_t *job)
>  {
> +     int i, rc = SLURM_SUCCESS;
> +
>       if (slurmd_task_init())
>               return SLURM_ERROR;
>  
> -     return (*(g_task_context->ops.pre_launch))(job);
> +     slurm_mutex_lock( &g_task_context_lock );
> +     for (i = 0; ((i < g_task_context_num) && (rc == SLURM_SUCCESS)); i++)
> +             rc = (*(g_task_context[i]->ops.pre_launch))(job);
> +     slurm_mutex_unlock( &g_task_context_lock );
> +
> +     return (rc);
>  }
>  
>  /*
> @@ -358,8 +448,15 @@ extern int pre_launch(slurmd_job_t *job)
>   */
>  extern int post_term(slurmd_job_t *job)
>  {
> +     int i, rc = SLURM_SUCCESS;
> +
>       if (slurmd_task_init())
>               return SLURM_ERROR;
>  
> -     return (*(g_task_context->ops.post_term))(job);
> +     slurm_mutex_lock( &g_task_context_lock );
> +     for (i = 0; ((i < g_task_context_num) && (rc == SLURM_SUCCESS)); i++)
> +             rc = (*(g_task_context[i]->ops.post_term))(job);
> +     slurm_mutex_unlock( &g_task_context_lock );
> +
> +     return (rc);
>  }
