On Thu, 14 Feb 2008, Pete Wyckoff wrote:
> [EMAIL PROTECTED] wrote on Thu, 14 Feb 2008 00:19 -0500:
> > [EMAIL PROTECTED] wrote on Wed, 13 Feb 2008 21:25 -0600:
> > > What happens when you restart the client daemon? Does the segfault occur
> > > with bmi_tcp?
> >
> > Yeah, I'm getting the same sort of thing, with TCP. 1 client, 1
> > md+data server. 2.6.24-rc6. A few "ls -la /pvfs" will crash
> > client-core, and it automatically is restarted. Similar sort of
> > backtrace. Valgrind doesn't show anything before where it all goes
> > bad in Troy's traces.
> >
> > ==7517== Invalid read of size 8
> > ==7517== at 0x4C6989E: qlist_empty (quicklist.h:117)
> > ==7517== by 0x4C697DD: PINT_sm_frame (state-machine-fns.c:595)
> > ==7517== by 0x4C270AB: completion_list_retrieve_completed
> > (client-state-machine.c:140)
> > ==7517== by 0x4C281DB: PINT_client_state_machine_testsome
> > (client-state-machine.c:694)
> > ==7517== by 0x4C285EB: PVFS_sys_testsome (client-state-machine.c:907)
> > ==7517== by 0x407BED: process_vfs_requests (pvfs2-client-core.c:2943)
> > ==7517== by 0x40A284: main (pvfs2-client-core.c:3379)
> > ==7517== Address 0x53BD870 is 80 bytes inside a block of size 176 free'd
> > ==7517== at 0x4A0560B: free (vg_replace_malloc.c:233)
> > ==7517== by 0x4C696D7: PINT_smcb_free (state-machine-fns.c:551)
> > ==7517== by 0x4C2768A: PINT_client_state_machine_post
> > (client-state-machine.c:395)
> > ==7517== by 0x4C29FE5: PVFS_isys_getattr (sys-getattr.sm:211)
> > ==7517== by 0x403D94: post_getattr_request (pvfs2-client-core.c:558)
> > ==7517== by 0x408648: handle_unexp_vfs_request (pvfs2-client-core.c:2708)
> > ==7517== by 0x407D70: process_vfs_requests (pvfs2-client-core.c:2990)
> > ==7517== by 0x40A284: main (pvfs2-client-core.c:3379)
> >
> > (Parse the second half of this first:)
> >
> > handle_unexp_vfs_request goes off to post a getattr.
> > PVFS_isys_getattr allocs a new smcb. PINT_client_state_machine_post
> > starts the SM and it must have finished immediately through a
> > successful acache lookup. PINT_client_state_machine_post frees the
> > smcb.
> >
> > (Now the top half:)
> >
> > Later testsome decides it has a completed smcb. The same one that
> > had been freed as above. Although maybe not related.
> >
> > This is the CVS head _before_ the big cleanup Sam did today. Are
> > we forgetting to initialize smcb->frames somewhere related? Looking
> > back for suspicious changes.
> >
> > There's the memmove() fix on s_completion_list[] by Phil back on 15
> > jan, but that's obviously a big fix, and it's probably not getting
> > triggered either. And a bunch of locking changes that are harmless.
> >
> > Needs more debugging.
>
> It's definitely this commit that causes the problem. Gotta love
> git-bisect. Does it offer any clues?
>
Yep, I'm looking into that. Thanks Pete!
-sam
> -- Pete
>
> commit 3d0ba5f2ee107c9cec913d6ba1a56b571c87c2b7
> Author: slang <slang>
> Date: Thu Feb 7 16:16:35 2008 +0000
>
> fixes to sm code for parallel jumps.
>
> diff --git a/src/common/misc/state-machine-fns.c
> b/src/common/misc/state-machine-fns.c
> index 6098ea0..c512db1 100644
> --- a/src/common/misc/state-machine-fns.c
> +++ b/src/common/misc/state-machine-fns.c
> @@ -70,30 +70,40 @@ int PINT_state_machine_halt(void)
> */
> int PINT_state_machine_terminate(struct PINT_smcb *smcb, job_status_s *r)
> {
> - struct PINT_frame_s *current_frame;
> + struct PINT_frame_s *my_frame, *f;
> job_id_t id;
>
> /* notify parent */
> if (smcb->parent_smcb)
> {
> - assert(smcb->parent_smcb->children_running > 0);
> -
> - current_frame = qlist_entry(
> - &smcb->frames.next, struct PINT_frame_s, link);
> - current_frame->error = r->error_code;
> -
> - if (--smcb->parent_smcb->children_running > 0)
> - {
> - /* SM is still deferred */
> - return SM_ACTION_DEFERRED;
> - }
> - else
> + gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> + "[SM Terminating Child]: (%p) %s:%s (error_code: %d)\n",
> + smcb,
> + /* skip pvfs2_ */
> + PINT_state_machine_current_machine_name(smcb),
> + PINT_state_machine_current_state_name(smcb),
> + (int32_t)r->error_code);
> + assert(smcb->parent_smcb->children_running > 0);
> +
> + my_frame = qlist_entry(
> + smcb->frames.next, struct PINT_frame_s, link);
> + qlist_for_each_entry(f, &smcb->parent_smcb->frames, link)
> + {
> + if(my_frame->frame == f->frame)
> + {
> + f->error = r->error_code;
> + break;
> + }
> + }
> +
> + if (--smcb->parent_smcb->children_running <= 0)
> {
> /* no more child state machines running, so we can
> * start up the parent state machine again
> */
> job_null(0, smcb->parent_smcb, 0, r, &id, smcb->context);
> }
> + return SM_ACTION_DEFERRED;
> }
> /* call state machine completion function */
> if (smcb->terminate_fn)
> @@ -127,10 +137,6 @@ PINT_sm_action PINT_state_machine_invoke(struct
> PINT_smcb *smcb,
> return SM_ERROR;
> }
>
> - /* print pre-call debugging info */
> - gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> - "SM invoke smcb %p op %d\n",smcb,(smcb)->op);
> -
> state_name = PINT_state_machine_current_state_name(smcb);
> machine_name = PINT_state_machine_current_machine_name(smcb);
>
> @@ -148,17 +154,10 @@ PINT_sm_action PINT_state_machine_invoke(struct
> PINT_smcb *smcb,
> switch (retval)
> {
> case SM_ACTION_TERMINATE :
> - gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> - "SM Terminates (%p)\n", smcb);
> smcb->op_terminate = 1;
> break;
> case SM_ACTION_COMPLETE :
> - gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> - "SM Returns Complete (%p)\n", smcb);
> - break;
> case SM_ACTION_DEFERRED :
> - gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> - "SM Returns Deferred (%p)\n", smcb);
> break;
> default :
> /* error */
> @@ -169,7 +168,7 @@ PINT_sm_action PINT_state_machine_invoke(struct PINT_smcb
> *smcb,
>
> /* print post-call debugging info */
> gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> - "[SM Exiting]: (%p) %s:%s (error code: %d), (sm action:
> %s)\n",
> + "[SM Exiting]: (%p) %s:%s (error code: %d), (action: %s)\n",
> smcb,
> /* skip pvfs2_ */
> machine_name,
> @@ -181,9 +180,6 @@ PINT_sm_action PINT_state_machine_invoke(struct PINT_smcb
> *smcb,
> {
> /* start child SMs */
> PINT_sm_start_child_frames(smcb);
> - gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> - "SM (%p) started %d child frames\n",
> - smcb, smcb->children_running);
> if (smcb->children_running > 0)
> retval = SM_ACTION_DEFERRED;
> else
> @@ -210,7 +206,7 @@ PINT_sm_action PINT_state_machine_start(struct PINT_smcb
> *smcb, job_status_s *r)
> if (ret == SM_ACTION_COMPLETE || ret == SM_ACTION_TERMINATE)
> {
> /* keep running until state machine deferrs or terminates */
> - ret = PINT_state_machine_next(smcb, r);
> + ret = PINT_state_machine_continue(smcb, r);
>
> /* note that if ret == SM_ACTION_TERMINATE, we _don't_ call
> * PINT_state_machine_terminate here because that adds the smcb
> @@ -239,8 +235,6 @@ PINT_sm_action PINT_state_machine_next(struct PINT_smcb
> *smcb, job_status_s *r)
> gossip_err("SM next called on invald smcb\n");
> return -1;
> }
> - gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> - "SM next smcb %p op %d\n",smcb,(smcb)->op);
> /* loop while invoke of new state returns COMPLETED */
> do {
> /* loop while returning from nested SM */
> @@ -254,12 +248,12 @@ PINT_sm_action PINT_state_machine_next(struct PINT_smcb
> *smcb, job_status_s *r)
> return -1;
> }
> transtbl = smcb->current_state->trtbl;
> -
> - /* for each entry in the transition table there is a return
> - * code followed by a next state pointer to the new state.
> - * This loops through each entry, checking for a match on the
> - * return address, and then sets the new current_state and calls
> - * the new state action function */
> +
> + /* for each entry in the transition table there is a return
> + * code followed by a next state pointer to the new state.
> + * This loops through each entry, checking for a match on the
> + * return address, and then sets the new current_state and calls
> + * the new state action function */
> for (i = 0; transtbl[i].return_value != DEFAULT_ERROR; i++)
> {
> if (transtbl[i].return_value == r->error_code)
> @@ -296,10 +290,8 @@ PINT_sm_action PINT_state_machine_next(struct PINT_smcb
> *smcb, job_status_s *r)
> if(!smcb->current_state ||
> smcb->current_state->trtbl[0].flag == SM_TERM)
> {
> - /* assume nested machine was invoked without
> - * a parent, or nested machine completion results
> - * in immediate termination
> - */
> + /* assume nested state machine was invoked without
> + * a parent */
> return SM_ACTION_TERMINATE;
> }
> }
> @@ -355,6 +347,8 @@ int PINT_state_machine_locate(struct PINT_smcb *smcb)
> {
> struct PINT_state_s *current_tmp;
> struct PINT_state_machine_s *op_sm;
> + const char *state_name;
> + const char *machine_name;
>
> /* check for valid inputs */
> if (!smcb || smcb->op < 0 || !smcb->op_get_state_machine)
> @@ -363,7 +357,7 @@ int PINT_state_machine_locate(struct PINT_smcb *smcb)
> return -PVFS_EINVAL;
> }
> gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> - "SM locate smcb %p op %d\n",smcb,(smcb)->op);
> + "[SM Locating]: (%p) op-id: %d\n",smcb,(smcb)->op);
> /* this is a the usage dependant routine to look up the SM */
> op_sm = (*smcb->op_get_state_machine)(smcb->op);
> if (op_sm != NULL)
> @@ -379,6 +373,14 @@ int PINT_state_machine_locate(struct PINT_smcb *smcb)
> current_tmp->action.nested)->first_state;
> }
> smcb->current_state = current_tmp;
> +
> + state_name = PINT_state_machine_current_state_name(smcb);
> + machine_name = PINT_state_machine_current_machine_name(smcb);
> +
> + gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> + "[SM Locating]: (%p) located: %s:%s\n",
> + smcb, machine_name, state_name);
> +
> return 1; /* indicates successful locate */
> }
> gossip_err("State machine not found for operation %d\n",smcb->op);
> @@ -393,8 +395,6 @@ int PINT_state_machine_locate(struct PINT_smcb *smcb)
> */
> int PINT_smcb_set_op(struct PINT_smcb *smcb, int op)
> {
> - gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> - "SM set op smcb %p op %d\n",smcb,op);
> smcb->op = op;
> return PINT_state_machine_locate(smcb);
> }
> @@ -499,8 +499,6 @@ int PINT_smcb_alloc(
> {
> return -PVFS_ENOMEM;
> }
> - gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> - "SM allocate smcb %p op %d\n",*smcb,op);
> /* zero out all members */
> memset(*smcb, 0, sizeof(struct PINT_smcb));
>
> @@ -540,8 +538,6 @@ void PINT_smcb_free(struct PINT_smcb *smcb)
> {
> struct PINT_frame_s *frame_entry, *tmp;
> assert(smcb);
> - gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> - "SM free smcb %p op %d\n", smcb, smcb->op);
> qlist_for_each_entry_safe(frame_entry, tmp, &smcb->frames, link)
> {
> if (frame_entry->frame && frame_entry->task_id == 0)
> @@ -564,8 +560,10 @@ void PINT_smcb_free(struct PINT_smcb *smcb)
> */
> static struct PINT_state_s *PINT_pop_state(struct PINT_smcb *smcb)
> {
> - assert(smcb->stackptr > 0);
> -
> + if(smcb->stackptr == 0)
> + {
> + return NULL;
> + }
> return smcb->state_stack[--smcb->stackptr];
> }
>
> @@ -611,9 +609,6 @@ void *PINT_sm_frame(struct PINT_smcb *smcb, int index)
> next = next->next;
> }
> frame_entry = qlist_entry(next, struct PINT_frame_s, link);
> - gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> - "FRAME GET smcb %p index %d -> frame: %p\n",
> - smcb, index, frame_entry->frame);
> return frame_entry->frame;
> }
> }
> @@ -627,9 +622,8 @@ int PINT_sm_push_frame(struct PINT_smcb *smcb, int
> task_id, void *frame_p)
> {
> struct PINT_frame_s *newframe;
> gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> - "PUSH FRAME %p onto smcb %p\n",
> - frame_p, smcb);
> -
> + "[SM Frame PUSH]: (%p) frame: %p\n",
> + smcb, frame_p);
> newframe = malloc(sizeof(struct PINT_frame_s));
> if(!newframe)
> {
> @@ -638,15 +632,22 @@ int PINT_sm_push_frame(struct PINT_smcb *smcb, int
> task_id, void *frame_p)
> newframe->task_id = task_id;
> newframe->frame = frame_p;
> qlist_add(&newframe->link, &smcb->frames);
> + smcb->frame_count++;
> return 0;
> }
>
> /* Function: PINT_sm_pop_frame
> - * Params: pointer to an smcb pointer
> + * Params: smcb - pointer to an smcb pointer
> + * task_id - the task id of this frame
> + * error_code - the frame's error if there was one.
> + * remaining - count of remaining frames on the smcb.
> * Returns: frame pointer
> * Synopsis: pops a frame pointer from the frame_stack and returns it
> */
> -void *PINT_sm_pop_frame(struct PINT_smcb *smcb, int *error_code)
> +void *PINT_sm_pop_frame(struct PINT_smcb *smcb,
> + int *task_id,
> + int *error_code,
> + int *remaining)
> {
> struct PINT_frame_s *frame_entry;
> void *frame;
> @@ -656,17 +657,24 @@ void *PINT_sm_pop_frame(struct PINT_smcb *smcb, int
> *error_code)
> return NULL;
> }
>
> - frame_entry = qlist_entry(&smcb->frames.next, struct PINT_frame_s, link);
> + frame_entry = qlist_entry(smcb->frames.next, struct PINT_frame_s, link);
> qlist_del(smcb->frames.next);
> + smcb->frame_count--;
> +
> + if(remaining)
> + {
> + *remaining = smcb->frame_count;
> + }
>
> frame = frame_entry->frame;
> *error_code = frame_entry->error;
> + *task_id = frame_entry->task_id;
>
> free(frame_entry);
>
> gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> - "POP FRAME %p from smcb %p\n",
> - frame, smcb);
> + "[SM Frame POP]: (%p) frame: %p\n",
> + smcb, frame);
> return frame;
> }
>
> @@ -710,11 +718,22 @@ static void PINT_sm_start_child_frames(struct PINT_smcb
> *smcb)
> struct PINT_smcb *new_sm;
> struct PINT_frame_s *frame_entry;
> job_status_s r;
> + struct qlist_head *f;
>
> assert(smcb);
>
> - qlist_for_each_entry(frame_entry, &smcb->frames, link)
> + memset(&r, 0, sizeof(job_status_s));
> +
> + qlist_for_each(f, &smcb->frames)
> {
> + /* skip the last since its the parent frame */
> + if(f->next == &smcb->frames)
> + {
> + break;
> + }
> +
> + frame_entry = qlist_entry(f, struct PINT_frame_s, link);
> +
> /* allocate smcb */
> PINT_smcb_alloc(&new_sm, smcb->op, 0, NULL,
> child_sm_frame_terminate, smcb->context);
> diff --git a/src/common/misc/state-machine.h b/src/common/misc/state-machine.h
> index 86f3220..cddaead 100644
> --- a/src/common/misc/state-machine.h
> +++ b/src/common/misc/state-machine.h
> @@ -68,6 +68,7 @@ typedef struct PINT_smcb
> struct PINT_state_s *state_stack[PINT_STATE_STACK_SIZE];
>
> struct qlist_head frames;
> + int frame_count;
>
> /* usage specific routinet to look up SM from OP */
> struct PINT_state_machine_s *(*op_get_state_machine)(int);
> @@ -152,14 +153,6 @@ enum {
> #define SM_STATE_RETURN -1
> #define SM_NESTED_STATE 1
>
> -#define SM_NONE 0
> -#define SM_NEXT 1
> -#define SM_RETURN 2
> -#define SM_EXTERN 3
> -#define SM_NESTED 5
> -#define SM_JUMP 6
> -#define SM_TERMINATE 7
> -
> /* Prototypes for functions provided by user */
> int PINT_state_machine_complete(void *);
>
> @@ -196,9 +189,10 @@ int PINT_smcb_alloc(struct PINT_smcb **, int, int,
> void PINT_smcb_free(struct PINT_smcb *);
> void *PINT_sm_frame(struct PINT_smcb *, int);
> int PINT_sm_push_frame(struct PINT_smcb *smcb, int task_id, void *frame_p);
> -void *PINT_sm_pop_frame(struct PINT_smcb *smcb, int *error_code);
> -
> -int PINT_sm_pop_error(PINT_smcb *smcb, PVFS_error ret);
> +void *PINT_sm_pop_frame(struct PINT_smcb *smcb,
> + int *task_id,
> + int *error_code,
> + int *remaining);
>
> /* This macro is used in calls to PINT_sm_fram() */
> #define PINT_FRAME_CURRENT 0
>
>
_______________________________________________
Pvfs2-developers mailing list
[email protected]
http://www.beowulf-underground.org/mailman/listinfo/pvfs2-developers