Thanks again Ralph for your reply.
> There's your problem - that module is run in the daemon, where the
> orte_job_data pointer array isn't used. You have to use the
> orte_local_jobdata and orte_local_children lists instead. So once the HNP
> replies with the jobid, you look up the orte_odls_job_t for that job from
> the orte_local_jobdata list.
>

I'm now sending you all of the code involved. At the beginning I do something along the lines of what you describe. Then, once I have the child info, I ask the HNP for the jobdata of the child, but I still get no data about the child (that is, the dead process). I'm trying to get this info so that I can ask another orted to restart the failed process.

> I'm not sure what you are trying to accomplish, so I can't give further
> advice. Note that daemons have limited knowledge of application processes
> that are not their own immediate children. What little they know regarding
> processes other than their own is stored in the nidmap/pidmap arrays -
> limited to location, local rank, and node rank. They have no storage
> currently allocated for things like the state of a non-local process.
>

I want to restart the process on another node; that's why I need the jobdata. So the HNP cannot do something like *jdata = orte_get_job_data_object(proc.jobid)* when the proc doesn't belong to it? Where can I obtain this information, then? I have been assuming that I cannot ask the dead process's daemon about it (because I assumed that the daemon is dead too, although that is not actually the case). I was supposing that in the HNP I could execute the statement above.

I'm attaching all the code involved in the situation described. I have made some changes since my first email, but what I'm trying to do is basically the same. At line 23 of the orted_comm.c that I'm sending, I always get NULL as a result, so I can't obtain the jdata.

Thanks a lot again for your help.

Best regards,
Hugo Meyer
case ORTE_DAEMON_UPDATE_STATE_CMD: if (orte_debug_daemons_flag) { opal_output(0, "%s orted_recv: (RADIC)update state request from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender)); } /* Unpack the buffer... */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc, &n, ORTE_NAME))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &state, &n, OPAL_UINT16))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } /* OK. Update status, but what is the new status? */ if (ORTE_PROC_STATE_FAULT == state) { if (ORTE_PROC_IS_HNP) { opal_output(0, "Intentando recuperar, soy el HNP"); /* Load the structure for application or orted job */ if (NULL == (jdata = orte_get_job_data_object(proc.jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto CLEANUP; } procs = (orte_proc_t**)jdata->procs->addr; if (NULL == procs[proc.vpid] ) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto CLEANUP; } /* load the nodes structure */ nodes = (orte_node_t**)orte_node_pool->addr; /* Is it a new request or just a new detection? 
*/ if (procs[proc.vpid]->state == state || ORTE_PROC_STATE_RESTORE == procs[proc.vpid]->state) { if (orte_debug_daemons_flag) { opal_output(0, "%s orted_update_state: the process %s is already restarting (state=%d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc), (int)state); } break; } else { if (orte_debug_daemons_flag) { opal_output(0, "%s orted_update_state: updating state (%d) for process %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)state, ORTE_NAME_PRINT(&proc)); } procs[proc.vpid]->state = state; } if (orte_debug_daemons_flag) { opal_output(0, "%s orted_update_state: update state request to %s (orted)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc)); } /* If and ORTED has fault, we need to update the nodes table to inform that the node dies */ if (orte_debug_daemons_flag) { opal_output(0, "%s orted_update_state: updating state (%d) for node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)node_state, procs[proc.vpid]->node->name); } node_state = ORTE_NODE_STATE_DOWN; procs[proc.vpid]->node->state = node_state; /* to all process on the faulty node... */ procs_rec = (orte_proc_t**)procs[proc.vpid]->node->procs->addr; for (i=0; i < (int)procs[proc.vpid]->node->num_procs; i++) { /* ...request a restart to its protector */ if (orte_debug_daemons_flag) { opal_output(0, "%s orted_update_state: requesting to %s a (protector) restart for application %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(procs_rec[i]->protector)), ORTE_NAME_PRINT(&proc)); } request_restart(&(procs_rec[i]->name), &(procs_rec[i]->protector), 0); } //} } /* end if HNP */ }
/* find this proc in the local children */ for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; if (child->name->jobid == proc->jobid && child->name->vpid == proc->vpid) { /*Si encontramos el child lo enviamos de momento al protector, para eso lo empaquetamos primero con el comando ORTE_DAEMON_RESTORE_CHECKPOINT_CMD y luego lo enviamos con rml_send*/ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,"CHILD a restaurar %s",ORTE_NAME_PRINT(child->name))); command = ORTE_DAEMON_UPDATE_STATE_CMD; buffer = OBJ_NEW(opal_buffer_t); if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buffer); goto CLEANUP; } orte_proc_state_t state = ORTE_PROC_STATE_FAULT; /* Pack the faulty vpid */ if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &proc, 1, ORTE_NAME))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } /* Pack the state */ if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &state, 1, OPAL_UINT16))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } /*Aqui se envia el buffer al lugar donde se restaurĂ¡.*/ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "ENVIANDO AL HNP")); if (0 > orte_rml.send_buffer(ORTE_PROC_MY_HNP, buffer, ORTE_RML_TAG_DAEMON, 0)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); exit_status = ORTE_ERR_COMM_FAILURE; } return ORTE_SUCCESS; CLEANUP: OBJ_DESTRUCT(&buffer); } } OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:orted Children NOT found!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));