On Thu, 14 Feb 2008, Pete Wyckoff wrote:

> [EMAIL PROTECTED] wrote on Thu, 14 Feb 2008 00:19 -0500:
> > [EMAIL PROTECTED] wrote on Wed, 13 Feb 2008 21:25 -0600:
> > > What happens when you restart the client daemon?  Does the segfault occur
> > > with bmi_tcp?
> >
> > Yeah, I'm getting the same sort of thing, with TCP.  1 client, 1
> > md+data server.  2.6.24-rc6.  A few "ls -la /pvfs" will crash
> > client-core, and it automatically is restarted.  Similar sort of
> > backtrace.  Valgrind doesn't show anything before where it all goes
> > bad in Troy's traces.
> >
> > ==7517== Invalid read of size 8
> > ==7517==    at 0x4C6989E: qlist_empty (quicklist.h:117)
> > ==7517==    by 0x4C697DD: PINT_sm_frame (state-machine-fns.c:595)
> > ==7517==    by 0x4C270AB: completion_list_retrieve_completed (client-state-machine.c:140)
> > ==7517==    by 0x4C281DB: PINT_client_state_machine_testsome (client-state-machine.c:694)
> > ==7517==    by 0x4C285EB: PVFS_sys_testsome (client-state-machine.c:907)
> > ==7517==    by 0x407BED: process_vfs_requests (pvfs2-client-core.c:2943)
> > ==7517==    by 0x40A284: main (pvfs2-client-core.c:3379)
> > ==7517==  Address 0x53BD870 is 80 bytes inside a block of size 176 free'd
> > ==7517==    at 0x4A0560B: free (vg_replace_malloc.c:233)
> > ==7517==    by 0x4C696D7: PINT_smcb_free (state-machine-fns.c:551)
> > ==7517==    by 0x4C2768A: PINT_client_state_machine_post (client-state-machine.c:395)
> > ==7517==    by 0x4C29FE5: PVFS_isys_getattr (sys-getattr.sm:211)
> > ==7517==    by 0x403D94: post_getattr_request (pvfs2-client-core.c:558)
> > ==7517==    by 0x408648: handle_unexp_vfs_request (pvfs2-client-core.c:2708)
> > ==7517==    by 0x407D70: process_vfs_requests (pvfs2-client-core.c:2990)
> > ==7517==    by 0x40A284: main (pvfs2-client-core.c:3379)
> >
> > (Parse the second half of this first:)
> >
> > handle_unexp_vfs_request goes off to post a getattr.
> > PVFS_isys_getattr allocs a new smcb.  PINT_client_state_machine_post
> > starts the SM and it must have finished immediately through a
> > successful acache lookup.  PINT_client_state_machine_post frees the
> > smcb.
> >
> > (Now the top half:)
> >
> > Later testsome decides it has a completed smcb.  The same one that
> > had been freed as above.  Although maybe not related.
> >
> > This is the CVS head _before_ the big cleanup Sam did today.  Are
> > we forgetting to initialize smcb->frames somewhere related?  Looking
> > back for suspicious changes.
> >
> > There's the memmove() fix on s_completion_list[] by Phil back on 15
> > Jan, but that's obviously a bug fix, and it's probably not getting
> > triggered either.  And a bunch of locking changes that are harmless.
> >
> > Needs more debugging.
>
> It's definitely this commit that causes the problem.  Gotta love
> git-bisect.  Does it offer any clues?
>

Yep, I'm looking into that.  Thanks Pete!
-sam

>               -- Pete
>
> commit 3d0ba5f2ee107c9cec913d6ba1a56b571c87c2b7
> Author: slang <slang>
> Date:   Thu Feb 7 16:16:35 2008 +0000
>
>     fixes to sm code for parallel jumps.
>
> diff --git a/src/common/misc/state-machine-fns.c b/src/common/misc/state-machine-fns.c
> index 6098ea0..c512db1 100644
> --- a/src/common/misc/state-machine-fns.c
> +++ b/src/common/misc/state-machine-fns.c
> @@ -70,30 +70,40 @@ int PINT_state_machine_halt(void)
>   */
>  int PINT_state_machine_terminate(struct PINT_smcb *smcb, job_status_s *r)
>  {
> -    struct PINT_frame_s *current_frame;
> +    struct PINT_frame_s *my_frame, *f;
>      job_id_t id;
>
>      /* notify parent */
>      if (smcb->parent_smcb)
>      {
> -        assert(smcb->parent_smcb->children_running > 0);
> -
> -        current_frame = qlist_entry(
> -            &smcb->frames.next, struct PINT_frame_s, link);
> -        current_frame->error = r->error_code;
> -
> -        if (--smcb->parent_smcb->children_running > 0)
> -        {
> -            /* SM is still deferred */
> -            return SM_ACTION_DEFERRED;
> -        }
> -        else
> +        gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> +                     "[SM Terminating Child]: (%p) %s:%s (error_code: %d)\n",
> +                     smcb,
> +                     /* skip pvfs2_ */
> +                     PINT_state_machine_current_machine_name(smcb),
> +                     PINT_state_machine_current_state_name(smcb),
> +                     (int32_t)r->error_code);
> +         assert(smcb->parent_smcb->children_running > 0);
> +
> +         my_frame = qlist_entry(
> +            smcb->frames.next, struct PINT_frame_s, link);
> +         qlist_for_each_entry(f, &smcb->parent_smcb->frames, link)
> +         {
> +             if(my_frame->frame == f->frame)
> +             {
> +                 f->error = r->error_code;
> +                 break;
> +             }
> +         }
> +
> +        if (--smcb->parent_smcb->children_running <= 0)
>          {
>              /* no more child state machines running, so we can
>               * start up the parent state machine again
>               */
>              job_null(0, smcb->parent_smcb, 0, r, &id, smcb->context);
>          }
> +        return SM_ACTION_DEFERRED;
>      }
>      /* call state machine completion function */
>      if (smcb->terminate_fn)
> @@ -127,10 +137,6 @@ PINT_sm_action PINT_state_machine_invoke(struct PINT_smcb *smcb,
>          return SM_ERROR;
>      }
>
> -    /* print pre-call debugging info */
> -    gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> -            "SM invoke smcb %p op %d\n",smcb,(smcb)->op);
> -
>      state_name = PINT_state_machine_current_state_name(smcb);
>      machine_name = PINT_state_machine_current_machine_name(smcb);
>
> @@ -148,17 +154,10 @@ PINT_sm_action PINT_state_machine_invoke(struct PINT_smcb *smcb,
>      switch (retval)
>      {
>      case SM_ACTION_TERMINATE :
> -            gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> -                    "SM Terminates (%p)\n", smcb);
>              smcb->op_terminate = 1;
>              break;
>      case SM_ACTION_COMPLETE :
> -            gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> -                    "SM Returns Complete (%p)\n", smcb);
> -            break;
>      case SM_ACTION_DEFERRED :
> -            gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> -                    "SM Returns Deferred (%p)\n", smcb);
>              break;
>      default :
>              /* error */
> @@ -169,7 +168,7 @@ PINT_sm_action PINT_state_machine_invoke(struct PINT_smcb *smcb,
>
>      /* print post-call debugging info */
>      gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> -                 "[SM Exiting]: (%p) %s:%s (error code: %d), (sm action: %s)\n",
> +                 "[SM Exiting]: (%p) %s:%s (error code: %d), (action: %s)\n",
>                   smcb,
>                   /* skip pvfs2_ */
>                   machine_name,
> @@ -181,9 +180,6 @@ PINT_sm_action PINT_state_machine_invoke(struct PINT_smcb *smcb,
>      {
>          /* start child SMs */
>          PINT_sm_start_child_frames(smcb);
> -        gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> -                "SM (%p) started %d child frames\n",
> -                smcb, smcb->children_running);
>          if (smcb->children_running > 0)
>              retval = SM_ACTION_DEFERRED;
>          else
> @@ -210,7 +206,7 @@ PINT_sm_action PINT_state_machine_start(struct PINT_smcb *smcb, job_status_s *r)
>      if (ret == SM_ACTION_COMPLETE || ret == SM_ACTION_TERMINATE)
>      {
>          /* keep running until state machine deferrs or terminates */
> -        ret = PINT_state_machine_next(smcb, r);
> +        ret = PINT_state_machine_continue(smcb, r);
>
>          /* note that if ret == SM_ACTION_TERMINATE, we _don't_ call
>           * PINT_state_machine_terminate here because that adds the smcb
> @@ -239,8 +235,6 @@ PINT_sm_action PINT_state_machine_next(struct PINT_smcb *smcb, job_status_s *r)
>          gossip_err("SM next called on invald smcb\n");
>          return -1;
>      }
> -    gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> -            "SM next smcb %p op %d\n",smcb,(smcb)->op);
>      /* loop while invoke of new state returns COMPLETED */
>      do {
>          /* loop while returning from nested SM */
> @@ -254,12 +248,12 @@ PINT_sm_action PINT_state_machine_next(struct PINT_smcb *smcb, job_status_s *r)
>                  return -1;
>              }
>              transtbl = smcb->current_state->trtbl;
> -
> -         /* for each entry in the transition table there is a return
> -         * code followed by a next state pointer to the new state.
> -         * This loops through each entry, checking for a match on the
> -         * return address, and then sets the new current_state and calls
> -         * the new state action function */
> +
> +            /* for each entry in the transition table there is a return
> +             * code followed by a next state pointer to the new state.
> +             * This loops through each entry, checking for a match on the
> +             * return address, and then sets the new current_state and calls
> +             * the new state action function */
>              for (i = 0; transtbl[i].return_value != DEFAULT_ERROR; i++)
>              {
>                  if (transtbl[i].return_value == r->error_code)
> @@ -296,10 +290,8 @@ PINT_sm_action PINT_state_machine_next(struct PINT_smcb *smcb, job_status_s *r)
>                  if(!smcb->current_state ||
>                     smcb->current_state->trtbl[0].flag == SM_TERM)
>                  {
> -                    /* assume nested machine was invoked without
> -                     * a parent, or nested machine completion results
> -                     * in immediate termination
> -                     */
> +                    /* assume nested state machine was invoked without
> +                     * a parent */
>                      return SM_ACTION_TERMINATE;
>                  }
>           }
> @@ -355,6 +347,8 @@ int PINT_state_machine_locate(struct PINT_smcb *smcb)
>  {
>      struct PINT_state_s *current_tmp;
>      struct PINT_state_machine_s *op_sm;
> +    const char *state_name;
> +    const char *machine_name;
>
>      /* check for valid inputs */
>      if (!smcb || smcb->op < 0 || !smcb->op_get_state_machine)
> @@ -363,7 +357,7 @@ int PINT_state_machine_locate(struct PINT_smcb *smcb)
>       return -PVFS_EINVAL;
>      }
>      gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> -            "SM locate smcb %p op %d\n",smcb,(smcb)->op);
> +            "[SM Locating]: (%p) op-id: %d\n",smcb,(smcb)->op);
>      /* this is a the usage dependant routine to look up the SM */
>      op_sm = (*smcb->op_get_state_machine)(smcb->op);
>      if (op_sm != NULL)
> @@ -379,6 +373,14 @@ int PINT_state_machine_locate(struct PINT_smcb *smcb)
>                             current_tmp->action.nested)->first_state;
>       }
>          smcb->current_state = current_tmp;
> +
> +        state_name = PINT_state_machine_current_state_name(smcb);
> +        machine_name = PINT_state_machine_current_machine_name(smcb);
> +
> +        gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> +                     "[SM Locating]: (%p) located: %s:%s\n",
> +                     smcb, machine_name, state_name);
> +
>       return 1; /* indicates successful locate */
>      }
>      gossip_err("State machine not found for operation %d\n",smcb->op);
> @@ -393,8 +395,6 @@ int PINT_state_machine_locate(struct PINT_smcb *smcb)
>   */
>  int PINT_smcb_set_op(struct PINT_smcb *smcb, int op)
>  {
> -    gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> -            "SM set op smcb %p op %d\n",smcb,op);
>      smcb->op = op;
>      return PINT_state_machine_locate(smcb);
>  }
> @@ -499,8 +499,6 @@ int PINT_smcb_alloc(
>      {
>          return -PVFS_ENOMEM;
>      }
> -    gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> -            "SM allocate smcb %p op %d\n",*smcb,op);
>      /* zero out all members */
>      memset(*smcb, 0, sizeof(struct PINT_smcb));
>
> @@ -540,8 +538,6 @@ void PINT_smcb_free(struct PINT_smcb *smcb)
>  {
>      struct PINT_frame_s *frame_entry, *tmp;
>      assert(smcb);
> -    gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> -             "SM free smcb %p op %d\n", smcb, smcb->op);
>      qlist_for_each_entry_safe(frame_entry, tmp, &smcb->frames, link)
>      {
>          if (frame_entry->frame && frame_entry->task_id == 0)
> @@ -564,8 +560,10 @@ void PINT_smcb_free(struct PINT_smcb *smcb)
>   */
>  static struct PINT_state_s *PINT_pop_state(struct PINT_smcb *smcb)
>  {
> -    assert(smcb->stackptr > 0);
> -
> +    if(smcb->stackptr == 0)
> +    {
> +        return NULL;
> +    }
>      return smcb->state_stack[--smcb->stackptr];
>  }
>
> @@ -611,9 +609,6 @@ void *PINT_sm_frame(struct PINT_smcb *smcb, int index)
>              next = next->next;
>          }
>          frame_entry = qlist_entry(next, struct PINT_frame_s, link);
> -        gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> -                     "FRAME GET smcb %p index %d -> frame: %p\n",
> -                     smcb, index, frame_entry->frame);
>          return frame_entry->frame;
>      }
>  }
> @@ -627,9 +622,8 @@ int PINT_sm_push_frame(struct PINT_smcb *smcb, int task_id, void *frame_p)
>  {
>      struct PINT_frame_s *newframe;
>      gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> -                 "PUSH FRAME %p onto smcb %p\n",
> -                 frame_p, smcb);
> -
> +                 "[SM Frame PUSH]: (%p) frame: %p\n",
> +                 smcb, frame_p);
>      newframe = malloc(sizeof(struct PINT_frame_s));
>      if(!newframe)
>      {
> @@ -638,15 +632,22 @@ int PINT_sm_push_frame(struct PINT_smcb *smcb, int task_id, void *frame_p)
>      newframe->task_id = task_id;
>      newframe->frame = frame_p;
>      qlist_add(&newframe->link, &smcb->frames);
> +    smcb->frame_count++;
>      return 0;
>  }
>
>  /* Function: PINT_sm_pop_frame
> - * Params: pointer to an smcb pointer
> + * Params: smcb - pointer to an smcb pointer
> + *         task_id - the task id of this frame
> + *         error_code - the frame's error if there was one.
> + *         remaining - count of remaining frames on the smcb.
>   * Returns: frame pointer
>   * Synopsis: pops a frame pointer from the frame_stack and returns it
>   */
> -void *PINT_sm_pop_frame(struct PINT_smcb *smcb, int *error_code)
> +void *PINT_sm_pop_frame(struct PINT_smcb *smcb,
> +                        int *task_id,
> +                        int *error_code,
> +                        int *remaining)
>  {
>      struct PINT_frame_s *frame_entry;
>      void *frame;
> @@ -656,17 +657,24 @@ void *PINT_sm_pop_frame(struct PINT_smcb *smcb, int *error_code)
>          return NULL;
>      }
>
> -    frame_entry = qlist_entry(&smcb->frames.next, struct PINT_frame_s, link);
> +    frame_entry = qlist_entry(smcb->frames.next, struct PINT_frame_s, link);
>      qlist_del(smcb->frames.next);
> +    smcb->frame_count--;
> +
> +    if(remaining)
> +    {
> +        *remaining = smcb->frame_count;
> +    }
>
>      frame = frame_entry->frame;
>      *error_code = frame_entry->error;
> +    *task_id = frame_entry->task_id;
>
>      free(frame_entry);
>
>      gossip_debug(GOSSIP_STATE_MACHINE_DEBUG,
> -            "POP FRAME %p from smcb %p\n",
> -            frame, smcb);
> +            "[SM Frame POP]: (%p) frame: %p\n",
> +            smcb, frame);
>      return frame;
>  }
>
> @@ -710,11 +718,22 @@ static void PINT_sm_start_child_frames(struct PINT_smcb *smcb)
>      struct PINT_smcb *new_sm;
>      struct PINT_frame_s *frame_entry;
>      job_status_s r;
> +    struct qlist_head *f;
>
>      assert(smcb);
>
> -    qlist_for_each_entry(frame_entry, &smcb->frames, link)
> +    memset(&r, 0, sizeof(job_status_s));
> +
> +    qlist_for_each(f, &smcb->frames)
>      {
> +        /* skip the last since its the parent frame */
> +        if(f->next == &smcb->frames)
> +        {
> +            break;
> +        }
> +
> +        frame_entry = qlist_entry(f, struct PINT_frame_s, link);
> +
>          /* allocate smcb */
>          PINT_smcb_alloc(&new_sm, smcb->op, 0, NULL,
>                  child_sm_frame_terminate, smcb->context);
> diff --git a/src/common/misc/state-machine.h b/src/common/misc/state-machine.h
> index 86f3220..cddaead 100644
> --- a/src/common/misc/state-machine.h
> +++ b/src/common/misc/state-machine.h
> @@ -68,6 +68,7 @@ typedef struct PINT_smcb
>      struct PINT_state_s *state_stack[PINT_STATE_STACK_SIZE];
>
>      struct qlist_head frames;
> +    int frame_count;
>
>      /* usage specific routinet to look up SM from OP */
>      struct PINT_state_machine_s *(*op_get_state_machine)(int);
> @@ -152,14 +153,6 @@ enum {
>  #define SM_STATE_RETURN -1
>  #define SM_NESTED_STATE 1
>
> -#define SM_NONE   0
> -#define SM_NEXT   1
> -#define SM_RETURN 2
> -#define SM_EXTERN 3
> -#define SM_NESTED 5
> -#define SM_JUMP   6
> -#define SM_TERMINATE 7
> -
>  /* Prototypes for functions provided by user */
>  int PINT_state_machine_complete(void *);
>
> @@ -196,9 +189,10 @@ int PINT_smcb_alloc(struct PINT_smcb **, int, int,
>  void PINT_smcb_free(struct PINT_smcb *);
>  void *PINT_sm_frame(struct PINT_smcb *, int);
>  int PINT_sm_push_frame(struct PINT_smcb *smcb, int task_id, void *frame_p);
> -void *PINT_sm_pop_frame(struct PINT_smcb *smcb, int *error_code);
> -
> -int PINT_sm_pop_error(PINT_smcb *smcb, PVFS_error ret);
> +void *PINT_sm_pop_frame(struct PINT_smcb *smcb,
> +                        int *task_id,
> +                        int *error_code,
> +                        int *remaining);
>
>  /* This macro is used in calls to PINT_sm_fram() */
>  #define PINT_FRAME_CURRENT 0
>
>
_______________________________________________
Pvfs2-developers mailing list
[email protected]
http://www.beowulf-underground.org/mailman/listinfo/pvfs2-developers

Reply via email to