Any chance you can add a patch to perlreguts that includes part of
this commit message?

Yves

On 17 January 2013 12:28, Nicholas Clark <[email protected]> wrote:
> In perl.git, the branch smoke-me/remove-regcomp-setjmp has been updated
>
> <http://perl5.git.perl.org/perl.git/commitdiff/2c4e40455f15d9b1cdc122814d345bdacdd5fad8?hp=1f605877dc7b1d1b806d6f0e30dc1d5adbfe6ef3>
>
> - Log -----------------------------------------------------------------
> commit 2c4e40455f15d9b1cdc122814d345bdacdd5fad8
> Author: Nicholas Clark <[email protected]>
> Date:   Thu Jan 17 11:47:13 2013 +0100
>
>     Document when and why S_reg{,branch,piece,atom,class}() return NULL.
>
>     As documented in pod/perlreguts.pod, the call graph for regex parsing
>     involves several levels of functions in regcomp.c, sometimes recursing 
> more
>     than once.
>
>     The top level compiling function, S_reg(), calls S_regbranch() to parse 
> each
>     single branch of an alternation. In turn, that calls S_regpiece() to parse
>     a simple pattern followed by quantifier, which calls S_regatom() to parse
>     that simple pattern. S_regatom() can call S_regclass() to handle classes,
>     but can also recurse into S_reg() to handle subpatterns and some other
>     constructions. Some other routines call call S_reg(), sometimes using an
>     alternative pattern that they generate dynamically to represent their 
> input.
>
>     These routines all return a pointer to a regnode structure, and take a
>     pointer to an integer that holds flags, which is also used to return
>     information.
>
>     Historically, it has not been clear when and why they return NULL, and
>     whether the return value can be ignored. In particular, "Jumbo regexp 
> patch"
>     (commit c277df42229d99fe, from Nov 1997), added code with two calls from
>     S_reg() to S_regbranch(), one of which checks the return value and 
> generates
>     a LONGJMP node if it returns NULL, the other of which is called in void
>     context, and so both ignores any return value, or the possibility that it 
> is
>     NULL.
>
>     After some analysis I have untangled the possible return values from these
>     5 functions (and related functions which call S_reg()).
>
>     Starting from the top:
>     S_reg() will return NULL and set the flags to TRYAGAIN at the end of 
> pragma-
>     line constructions that it handles. Otherwise, historically it would 
> return
>     NULL if S_regbranch() returned NULL. In turn, S_regbranch() would return
>     NULL if S_regpiece() returned NULL without setting TRYAGAIN. If 
> S_regpiece()
>     returns TRYAGAIN, S_regbranch() loops, and ultimately will not return 
> NULL.
>
>     S_regpiece() returns NULL with TRYAGAIN if S_regatom() returns NULL with
>     TRYAGAIN, but (historically) if S_regatom() returns NULL without setting
>     the flags to TRYAGAIN, S_regpiece() would to. Where S_regatom() calls
>     S_reg() it has similar behaviour when passing back return values, although
>     often it is able to loop instead on getting a TRYAGAIN.
>
>     Which gets us back to S_reg(), which can only *generate* NULL in 
> conjunction
>     with TRYAGAIN. NULL without TRYAGAIN could only be returned if a routine 
> it
>     called generated it. All other functions that these call that return 
> regnode
>     structures cannot return NULL. Hence
>
>     1) in the loop of functions called, there is no source for a return value 
> of
>        NULL without the TRYAGAIN flag being set
>     2) a return value of NULL with TRYAGAIN set from an inner function does 
> not
>        propagate out past S_regbranch()
>
>     Hence the only return values that most functions can generate are 
> non-NULL,
>     or NULL with TRYAGAIN set, and as S_regbranch() catches these, it cannot
>     return NULL. The longest sequence of functions that can return NULL (with
>     TRYAGAIN set) is S_reg() -> S_regatom() -> S_regpiece() -> S_regbranch().
>     Rapidly returning right round the loop back to S_reg() is not possible.
>
>     Hence code added by commit c277df42229d99fe to handle a NULL return from
>     S_regbranch(), along with some other code is dead.
>
>     I have replaced all unreachable code with FAIL()s that panic.
> -----------------------------------------------------------------------
>
> Summary of changes:
>  regcomp.c |   31 ++++++++++++++++++++++++-------
>  1 files changed, 24 insertions(+), 7 deletions(-)
>
> diff --git a/regcomp.c b/regcomp.c
> index 22a3191..2c0cd97 100644
> --- a/regcomp.c
> +++ b/regcomp.c
> @@ -8321,6 +8321,9 @@ S__invlistEQ(pTHX_ SV* const a, SV* const b, bool 
> complement_b)
>  #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
>  #endif
>
> +/* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
> +   flags. Otherwise would only return NULL if regbranch() returns NULL, which
> +   cannot happen.  */
>  STATIC regnode *
>  S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
>      /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
> @@ -8868,7 +8871,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 
> *flagp,U32 depth)
>                      REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 
> 0));
>                      br = regbranch(pRExC_state, &flags, 1,depth+1);
>                     if (br == NULL)
> -                       br = reganode(pRExC_state, LONGJMP, 0);
> +                        FAIL("panic: regbranch returned NULL");
>                     else
>                          REGTAIL(pRExC_state, br, reganode(pRExC_state, 
> LONGJMP, 0));
>                     c = *nextchar(pRExC_state);
> @@ -8878,7 +8881,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 
> *flagp,U32 depth)
>                         if (is_define)
>                             vFAIL("(?(DEFINE)....) does not allow branches");
>                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake 
> one for optimizer. */
> -                        regbranch(pRExC_state, &flags, 1,depth+1);
> +                        if (!regbranch(pRExC_state, &flags, 1,depth+1))
> +                            FAIL("panic: regbranch returned NULL");
>                          REGTAIL(pRExC_state, ret, lastbr);
>                         if (flags&HASWIDTH)
>                             *flagp |= HASWIDTH;
> @@ -9128,7 +9132,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 
> *flagp,U32 depth)
>      /*     branch_len = (paren != 0); */
>
>      if (br == NULL)
> -       return(NULL);
> +        FAIL("panic: regbranch returned NULL");
>      if (*RExC_parse == '|') {
>         if (!SIZE_ONLY && RExC_extralen) {
>             reginsert(pRExC_state, BRANCHJ, br, depth+1);
> @@ -9168,7 +9172,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 
> *flagp,U32 depth)
>          br = regbranch(pRExC_state, &flags, 0, depth+1);
>
>         if (br == NULL)
> -           return(NULL);
> +            FAIL("panic: regbranch returned NULL");
>          REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. 
> */
>         lastbr = br;
>         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
> @@ -9325,6 +9329,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 
> *flagp,U32 depth)
>   - regbranch - one alternative of an | operator
>   *
>   * Implements the concatenation operator.
> + *
> + * would only return NULL if regpiece() returns NULL, which cannot happen.
>   */
>  STATIC regnode *
>  S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 
> depth)
> @@ -9364,7 +9370,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 
> *flagp, I32 first, U32 depth)
>         if (latest == NULL) {
>             if (flags & TRYAGAIN)
>                 continue;
> -           return(NULL);
> +            FAIL("panic: regpiece returned NULL");
>         }
>         else if (ret == NULL)
>             ret = latest;
> @@ -9398,6 +9404,9 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 
> *flagp, I32 first, U32 depth)
>   * both the endmarker for their branch list and the body of the last branch.
>   * It might seem that this node could be dispensed with entirely, but the
>   * endmarker role is not redundant.
> + *
> + * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
> + * TRYAGAIN.
>   */
>  STATIC regnode *
>  S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
> @@ -9428,6 +9437,8 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
> U32 depth)
>      if (ret == NULL) {
>         if (flags & TRYAGAIN)
>             *flagp |= TRYAGAIN;
> +        else
> +            FAIL("panic: regatom returned NULL without setting TRYAGAIN");
>         return(NULL);
>      }
>
> @@ -10048,6 +10059,9 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t 
> *pRExC_state, regnode *node, I32
>     escape sequences, with the one for handling literal escapes requiring
>     a dummy entry for all of the special escapes that are actually handled
>     by the other.
> +
> +   Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
> +   TRYAGAIN.  Otherwise does not return NULL.
>  */
>
>  STATIC regnode *
> @@ -10131,7 +10145,7 @@ tryagain:
>                     }
>                     goto tryagain;
>                 }
> -               return(NULL);
> +                FAIL("panic: reg returned NULL without setting TRYAGAIN");
>         }
>         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
>         break;
> @@ -11766,7 +11780,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 
> *flagp, U32 depth,
>       * corresponding bit set if that character is in the list.  For 
> characters
>       * above 255, a range list or swash is used.  There are extra bits for 
> \w,
>       * etc. in locale ANYOFs, as what these match is not determinable at
> -     * compile time */
> +     * compile time
> +     *
> +     * Never returns NULL.
> +     */
>
>      dVAR;
>      UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
>
> --
> Perl5 Master Repository



-- 
perl -Mre=debug -e "/just|another|perl|hacker/"

Reply via email to