On Thu, 9 Jun 2011 06:05:47 -0700, "Andrej N. Gritsenko" <[email protected]>
wrote:
Non-text part: multipart/mixed
> Hello there!
>
> Currently there are no stackable plugins in slurmd, but sometimes it's
> crucial to do something in system environment before slurmstepd is even
> forked. Apparently the task plugin is the best one to make stackable, and
> the attached patch makes it so. It's still fully compatible with the old behavior
> but there is new one - TaskPlugin variable in slurm.conf can be a list of
> plugins with comma as separator, each element can be either 'task/plugin'
> or just a 'plugin' name, as it is for the JobSubmitPlugins variable. No
> changes were made to the documentation, nor to the TaskPluginParams
> variable; it still applies only to task/affinity.
There are stackable plugins in SLURM, see the spank(8) manpage.
In fact, replacing the "task" plugin is the main reason the spank
framework was initially developed.
mark
> Hope to see it in next SLURM.
> Andriy.> diff -udpr slurm-2.2.6.orig/src/slurmd/common/task_plugin.c
> slurm-2.2.6/src/slurmd/common/task_plugin.c
> --- slurm-2.2.6.orig/src/slurmd/common/task_plugin.c 2011-05-27
> 21:25:06.000000000 +0300
> +++ slurm-2.2.6/src/slurmd/common/task_plugin.c 2011-06-08
> 14:52:13.000000000 +0300
> @@ -73,7 +73,8 @@ typedef struct slurmd_task_context {
> slurmd_task_ops_t ops;
> } slurmd_task_context_t;
>
> -static slurmd_task_context_t *g_task_context = NULL;
> +static slurmd_task_context_t **g_task_context = NULL;
> +static int g_task_context_num = -1;
> static pthread_mutex_t g_task_context_lock =
> PTHREAD_MUTEX_INITIALIZER;
>
>
> @@ -195,34 +196,57 @@ _slurmd_task_context_destroy(slurmd_task
> */
> extern int slurmd_task_init(void)
> {
> - int retval = SLURM_SUCCESS;
> + int retval = SLURM_SUCCESS, i;
> char *task_plugin_type = NULL;
> + char *last = NULL, *task_plugin_list, *task_plugin = NULL;
>
> slurm_mutex_lock( &g_task_context_lock );
>
> - if ( g_task_context )
> + if ( g_task_context_num >= 0 )
> goto done;
>
> task_plugin_type = slurm_get_task_plugin();
> - g_task_context = _slurmd_task_context_create( task_plugin_type );
> - if ( g_task_context == NULL ) {
> - error( "cannot create task context for %s",
> - task_plugin_type );
> - retval = SLURM_ERROR;
> + g_task_context_num = 0; /* mark it before anything else */
> + if (task_plugin_type == NULL || task_plugin_type[0] == '\0')
> goto done;
> - }
>
> - if ( _slurmd_task_get_ops( g_task_context ) == NULL ) {
> - error( "cannot resolve task plugin operations" );
> - _slurmd_task_context_destroy( g_task_context );
> - g_task_context = NULL;
> - retval = SLURM_ERROR;
> + task_plugin_list = task_plugin_type;
> + while ((task_plugin = strtok_r(task_plugin_list, ",", &last))) {
> + i = g_task_context_num++;
> + xrealloc(g_task_context,
> + (sizeof(slurmd_task_context_t *) *
> g_task_context_num));
> + if (strncmp(task_plugin, "task/", 5) == 0)
> + task_plugin += 5; /* backward compatibility */
> + task_plugin = xstrdup_printf("task/%s", task_plugin);
> + g_task_context[i] = _slurmd_task_context_create( task_plugin );
> + if ( g_task_context[i] == NULL ) {
> + error( "cannot create task context for %s",
> + task_plugin );
> + goto error;
> + }
> +
> + if ( _slurmd_task_get_ops( g_task_context[i] ) == NULL ) {
> + error( "cannot resolve task plugin operations for %s",
> + task_plugin );
> + goto error;
> + }
> + xfree(task_plugin);
> + task_plugin_list = NULL; /* for next iteration */
> }
>
> done:
> slurm_mutex_unlock( &g_task_context_lock );
> xfree(task_plugin_type);
> return retval;
> +
> +error:
> + xfree(task_plugin);
> + retval = SLURM_ERROR;
> + for (i = 0; i < g_task_context_num; i++)
> + if (g_task_context[i])
> + _slurmd_task_context_destroy(g_task_context[i]);
> + xfree(g_task_context);
> + goto done;
> }
>
> /*
> @@ -232,13 +256,21 @@ extern int slurmd_task_init(void)
> */
> extern int slurmd_task_fini(void)
> {
> - int rc;
> + int i, rc = SLURM_SUCCESS;
>
> + slurm_mutex_lock( &g_task_context_lock );
> if (!g_task_context)
> - return SLURM_SUCCESS;
> + goto done;
>
> - rc = _slurmd_task_context_destroy(g_task_context);
> - g_task_context = NULL;
> + for (i = 0; i < g_task_context_num; i++)
> + if (_slurmd_task_context_destroy(g_task_context[i]) !=
> SLURM_SUCCESS)
> + rc = SLURM_ERROR;
> +
> + xfree(g_task_context);
> + g_task_context_num = -1;
> +
> +done:
> + slurm_mutex_unlock( &g_task_context_lock );
> return rc;
> }
>
> @@ -249,10 +281,17 @@ extern int slurmd_task_fini(void)
> */
> extern int slurmd_batch_request(uint32_t job_id, batch_job_launch_msg_t *req)
> {
> + int i, rc = SLURM_SUCCESS;
> +
> if (slurmd_task_init())
> return SLURM_ERROR;
>
> - return (*(g_task_context->ops.slurmd_batch_request))(job_id, req);
> + slurm_mutex_lock( &g_task_context_lock );
> + for (i = 0; ((i < g_task_context_num) && (rc == SLURM_SUCCESS)); i++)
> + rc = (*(g_task_context[i]->ops.slurmd_batch_request))(job_id,
> req);
> + slurm_mutex_unlock( &g_task_context_lock );
> +
> + return (rc);
> }
>
> /*
> @@ -264,10 +303,18 @@ extern int slurmd_launch_request(uint32_
> launch_tasks_request_msg_t *req,
> uint32_t node_id)
> {
> + int i, rc = SLURM_SUCCESS;
> +
> if (slurmd_task_init())
> return SLURM_ERROR;
>
> - return (*(g_task_context->ops.slurmd_launch_request))(job_id, req,
> node_id);
> + slurm_mutex_lock( &g_task_context_lock );
> + for (i = 0; ((i < g_task_context_num) && (rc == SLURM_SUCCESS)); i++)
> + rc = (*(g_task_context[i]->ops.slurmd_launch_request))(job_id,
> + req, node_id);
> + slurm_mutex_unlock( &g_task_context_lock );
> +
> + return (rc);
> }
>
> /*
> @@ -279,10 +326,18 @@ extern int slurmd_reserve_resources(uint
> launch_tasks_request_msg_t *req,
> uint32_t node_id )
> {
> + int i, rc = SLURM_SUCCESS;
> +
> if (slurmd_task_init())
> return SLURM_ERROR;
>
> - return (*(g_task_context->ops.slurmd_reserve_resources))(job_id, req,
> node_id);
> + slurm_mutex_lock( &g_task_context_lock );
> + for (i = 0; ((i < g_task_context_num) && (rc == SLURM_SUCCESS)); i++)
> + rc =
> (*(g_task_context[i]->ops.slurmd_reserve_resources))(job_id,
> + req, node_id);
> + slurm_mutex_unlock( &g_task_context_lock );
> +
> + return (rc);
> }
>
> /*
> @@ -292,10 +347,17 @@ extern int slurmd_reserve_resources(uint
> */
> extern int slurmd_suspend_job(uint32_t job_id)
> {
> + int i, rc = SLURM_SUCCESS;
> +
> if (slurmd_task_init())
> return SLURM_ERROR;
>
> - return (*(g_task_context->ops.slurmd_suspend_job))(job_id);
> + slurm_mutex_lock( &g_task_context_lock );
> + for (i = 0; ((i < g_task_context_num) && (rc == SLURM_SUCCESS)); i++)
> + rc = (*(g_task_context[i]->ops.slurmd_suspend_job))(job_id);
> + slurm_mutex_unlock( &g_task_context_lock );
> +
> + return (rc);
> }
>
> /*
> @@ -305,10 +367,17 @@ extern int slurmd_suspend_job(uint32_t j
> */
> extern int slurmd_resume_job(uint32_t job_id)
> {
> + int i, rc = SLURM_SUCCESS;
> +
> if (slurmd_task_init())
> return SLURM_ERROR;
>
> - return (*(g_task_context->ops.slurmd_resume_job))(job_id);
> + slurm_mutex_lock( &g_task_context_lock );
> + for (i = 0; ((i < g_task_context_num) && (rc == SLURM_SUCCESS)); i++)
> + rc = (*(g_task_context[i]->ops.slurmd_resume_job))(job_id);
> + slurm_mutex_unlock( &g_task_context_lock );
> +
> + return (rc);
> }
>
> /*
> @@ -318,10 +387,17 @@ extern int slurmd_resume_job(uint32_t jo
> */
> extern int slurmd_release_resources(uint32_t job_id)
> {
> + int i, rc = SLURM_SUCCESS;
> +
> if (slurmd_task_init())
> return SLURM_ERROR;
>
> - return (*(g_task_context->ops.slurmd_release_resources))(job_id);
> + slurm_mutex_lock( &g_task_context_lock );
> + for (i = 0; ((i < g_task_context_num) && (rc == SLURM_SUCCESS)); i++)
> + rc =
> (*(g_task_context[i]->ops.slurmd_release_resources))(job_id);
> + slurm_mutex_unlock( &g_task_context_lock );
> +
> + return (rc);
> }
>
> /*
> @@ -332,10 +408,17 @@ extern int slurmd_release_resources(uint
> */
> extern int pre_setuid(slurmd_job_t *job)
> {
> + int i, rc = SLURM_SUCCESS;
> +
> if (slurmd_task_init())
> return SLURM_ERROR;
>
> - return (*(g_task_context->ops.pre_setuid))(job);
> + slurm_mutex_lock( &g_task_context_lock );
> + for (i = 0; ((i < g_task_context_num) && (rc == SLURM_SUCCESS)); i++)
> + rc = (*(g_task_context[i]->ops.pre_setuid))(job);
> + slurm_mutex_unlock( &g_task_context_lock );
> +
> + return (rc);
> }
>
> /*
> @@ -345,10 +428,17 @@ extern int pre_setuid(slurmd_job_t *job)
> */
> extern int pre_launch(slurmd_job_t *job)
> {
> + int i, rc = SLURM_SUCCESS;
> +
> if (slurmd_task_init())
> return SLURM_ERROR;
>
> - return (*(g_task_context->ops.pre_launch))(job);
> + slurm_mutex_lock( &g_task_context_lock );
> + for (i = 0; ((i < g_task_context_num) && (rc == SLURM_SUCCESS)); i++)
> + rc = (*(g_task_context[i]->ops.pre_launch))(job);
> + slurm_mutex_unlock( &g_task_context_lock );
> +
> + return (rc);
> }
>
> /*
> @@ -358,8 +448,15 @@ extern int pre_launch(slurmd_job_t *job)
> */
> extern int post_term(slurmd_job_t *job)
> {
> + int i, rc = SLURM_SUCCESS;
> +
> if (slurmd_task_init())
> return SLURM_ERROR;
>
> - return (*(g_task_context->ops.post_term))(job);
> + slurm_mutex_lock( &g_task_context_lock );
> + for (i = 0; ((i < g_task_context_num) && (rc == SLURM_SUCCESS)); i++)
> + rc = (*(g_task_context[i]->ops.post_term))(job);
> + slurm_mutex_unlock( &g_task_context_lock );
> +
> + return (rc);
> }