[perl.git] branch blead, updated. v5.21.8-48-g2b2266b

Karl Williamson Fri, 23 Jan 2015 20:55:29 -0800

In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/2b2266b19c5f9a3d80040e731c02927e6be1dace?hp=10a51195f4a059336d372c2e632f5556281f3806>


- Log -----------------------------------------------------------------
commit 2b2266b19c5f9a3d80040e731c02927e6be1dace
Author: Karl Williamson <[email protected]>
Date:   Fri Jan 23 12:26:42 2015 -0700

    regcomp.c: Clarify comment

M       regcomp.c

commit 6635f04f27bfaa7296536aad8659044e8bed8df3
Author: Karl Williamson <[email protected]>
Date:   Fri Jan 23 12:00:49 2015 -0700

    handy.h: EXTERN_C-ize PL_charclass
    
    See thread http://nntp.perl.org/group/perl.perl5.porters/224999

M       handy.h

commit 7aa20b428680dffac746f9c0947535a222940bda
Author: Karl Williamson <[email protected]>
Date:   Fri Jan 23 11:57:14 2015 -0700

    regcomp.c: Another minor optimization
    
    The [:cased:] internal class now handles [:upper:] and/or [:lower:]
    under /i matching.  This code previously skipped possible optimizations
    because it didn't take advantage of that class.

M       regcomp.c

commit f59fa626ecf8377ec531b277e5bd1c0f5958916d
Author: Karl Williamson <[email protected]>
Date:   Fri Jan 23 11:20:30 2015 -0700

    regcomp.c: Minor optimizations
    
    \d, [:digit:], and [:xdigit:] don't match anything in the upper Latin1
    range.  Therefore whether or not the target string is UTF-8 doesn't
    change what they match, hence the /d modifier acts exactly like
    the /u modifier for them.  At run-time /u executes fewer branches
    because it doesn't have to test if the target string is in UTF-8 or not,
    so treating these as if /u had instead been specified saves some
    runtime.

M       regcomp.c

commit c52b8b12af74c72a617744f9e9dfef7ea49a16c7
Author: Karl Williamson <[email protected]>
Date:   Fri Jan 23 11:19:23 2015 -0700

    regexec.c, regcomp.c: White-space only
    
    This changes some labels to be outdented 2 spaces from surrounding code

M       regcomp.c
M       regexec.c
-----------------------------------------------------------------------

Summary of changes:
 handy.h   |  2 ++
 regcomp.c | 75 +++++++++++++++++++++++++++++++++++++++------------------------
 regexec.c | 26 +++++++++++-----------
 3 files changed, 61 insertions(+), 42 deletions(-)

diff --git a/handy.h b/handy.h
index 1256ea1..faa9f7a 100644
--- a/handy.h
+++ b/handy.h
@@ -1020,6 +1020,7 @@ static const char* const swash_property_names[] = {
 };
 #endif
 
+START_EXTERN_C
 #  ifdef DOINIT
 EXTCONST  U32 PL_charclass[] = {
 #    include "l1_char_class_tab.h"
@@ -1028,6 +1029,7 @@ EXTCONST  U32 PL_charclass[] = {
 #  else /* ! DOINIT */
 EXTCONST U32 PL_charclass[];
 #  endif
+END_EXTERN_C
 
     /* The 1U keeps Solaris from griping when shifting sets the uppermost bit 
*/
 #   define _CC_mask(classnum) (1U << (classnum))
diff --git a/regcomp.c b/regcomp.c
index b0256b7..3851d34 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -8447,7 +8447,7 @@ Perl__invlist_populate_swatch(SV* const invlist,
             swatch[offset >> 3] |= 1 << (offset & 7);
         }
 
-    join_end_of_list:
+      join_end_of_list:
 
        /* Quit if at the end of the list */
         if (i >= len) {
@@ -9619,7 +9619,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t 
*pRExC_state)
                      : REGEX_DEPENDS_CHARSET;
                 has_charset_modifier = DEPENDS_PAT_MOD;
                 break;
-            excess_modifier:
+              excess_modifier:
                 RExC_parse++;
                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of 
twice", ASCII_RESTRICT_PAT_MOD);
@@ -9632,7 +9632,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t 
*pRExC_state)
                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually 
exclusive", has_charset_modifier, *(RExC_parse - 1));
                 }
                 NOT_REACHED; /*NOTREACHED*/
-            neg_modifier:
+              neg_modifier:
                 RExC_parse++;
                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
                                     *(RExC_parse - 1));
@@ -9704,7 +9704,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t 
*pRExC_state)
                 return;
                 /*NOTREACHED*/
             default:
-            fail_modifiers:
+              fail_modifiers:
                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
                /* diag_listed_as: Sequence (?%s...) not recognized in regex; 
marked by <-- HERE in m/%s/ */
                 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
@@ -10849,7 +10849,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth)
                 nextchar(pRExC_state);
             }
 
-       do_curly:
+         do_curly:
            if ((flags&SIMPLE)) {
                 MARK_NAUGHTY_EXP(2, 2);
                reginsert(pRExC_state, CURLY, ret, depth+1);
@@ -11791,7 +11791,15 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth)
             /* FALLTHROUGH */
        case 'd':
             arg = ANYOF_DIGIT;
-            goto join_posix;
+            if (! DEPENDS_SEMANTICS) {
+                goto join_posix;
+            }
+
+            /* \d doesn't have any matches in the upper Latin1 range, hence /d
+             * is equivalent to /u.  Changing to /u saves some branches at
+             * runtime */
+            op = POSIXU;
+            goto join_posix_op_known;
 
        case 'R':
            ret = reg_node(pRExC_state, LNBREAK);
@@ -11820,7 +11828,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth)
        case 's':
             arg = ANYOF_SPACE;
 
-        join_posix:
+          join_posix:
 
            op = POSIXD + get_regex_charset(RExC_flags);
             if (op > POSIXA) {  /* /aa is same as /a */
@@ -11830,7 +11838,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth)
                 RExC_contains_locale = 1;
             }
 
-        join_posix_op_known:
+          join_posix_op_known:
 
             if (invert) {
                 op += NPOSIXD - POSIXD;
@@ -11844,7 +11852,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth)
            *flagp |= HASWIDTH|SIMPLE;
             /* FALLTHROUGH */
 
-         finish_meta_pat:
+          finish_meta_pat:
            nextchar(pRExC_state);
             Set_Node_Length(ret, 2); /* MJD */
            break;
@@ -11900,7 +11908,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth)
             }
             break;
        case 'k':    /* Handle \k<NAME> and \k'NAME' */
-       parse_named_seq:
+      parse_named_seq:
         {
             char ch= RExC_parse[1];
            if (ch != '<' && ch != '\'' && ch != '{') {
@@ -12066,7 +12074,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth)
 
            RExC_parse++;
 
-       defchar: {
+         defchar: {
            STRLEN len = 0;
            UV ender = 0;
            char *p;
@@ -12105,7 +12113,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth)
 
             s0 = s;
 
-       reparse:
+         reparse:
 
             /* We do the EXACTFish to EXACT node only if folding.  (And we
              * don't need to figure this out until pass 2) */
@@ -12341,7 +12349,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth)
                        if (IN_ENCODING && ender < 0x100)
                            goto recode_encoding;
                        break;
-                   recode_encoding:
+                     recode_encoding:
                        if (! RExC_override_recoding) {
                            SV* enc = _get_encoding();
                            ender = reg_recode((const char)(U8)ender, &enc);
@@ -12724,8 +12732,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth)
                 }
            }   /* End of verifying node ends with an appropriate char */
 
-       loopdone:   /* Jumped to when encounters something that shouldn't be in
-                      the node */
+          loopdone:   /* Jumped to when encounters something that shouldn't be
+                         in the node */
 
             /* I (khw) don't know if you can get here with zero length, but the
              * old code handled this situation by creating a zero-length EXACT
@@ -13193,7 +13201,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, 
SV** return_invlist,
             RExC_parse++;
         }
 
-        no_close:
+      no_close:
         FAIL("Syntax error in (?[...])");
     }
 
@@ -13891,7 +13899,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth,
             break;
         }
 
-    charclassloop:
+      charclassloop:
 
        namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
         save_value = value;
@@ -14258,7 +14266,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth,
                        goto recode_encoding;
                    break;
                }
-           recode_encoding:
+             recode_encoding:
                if (! RExC_override_recoding) {
                    SV* enc = _get_encoding();
                    value = reg_recode((const char)(U8)value, &enc);
@@ -14886,7 +14894,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth,
 
                 /* The actual POSIXish node for all the rest depends on the
                  * charset modifier.  The ones in the first set depend only on
-                 * ASCII or, if available on this platform, locale */
+                 * ASCII or, if available on this platform, also locale */
                 case ANYOF_ASCII:
                 case ANYOF_NASCII:
 #ifdef HAS_ISASCII
@@ -14896,19 +14904,27 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 
*flagp, U32 depth,
 #endif
                     goto join_posix;
 
-                case ANYOF_NCASED:
+                /* The following don't have any matches in the upper Latin1
+                 * range, hence /d is equivalent to /u for them.  Making it /u
+                 * saves some branches at runtime */
+                case ANYOF_DIGIT:
+                case ANYOF_NDIGIT:
+                case ANYOF_XDIGIT:
+                case ANYOF_NXDIGIT:
+                    if (! DEPENDS_SEMANTICS) {
+                        goto treat_as_default;
+                    }
+
+                    op = POSIXU;
+                    goto join_posix;
+
+                /* The following change to CASED under /i */
                 case ANYOF_LOWER:
                 case ANYOF_NLOWER:
                 case ANYOF_UPPER:
                 case ANYOF_NUPPER:
-                    /* under /a could be alpha */
                     if (FOLD) {
-                        if (ASCII_RESTRICTED) {
-                            namedclass = ANYOF_ALPHA + (namedclass % 2);
-                        }
-                        else if (! LOC) {
-                            break;
-                        }
+                        namedclass = ANYOF_CASED + (namedclass % 2);
                     }
                     /* FALLTHROUGH */
 
@@ -14916,12 +14932,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 
*flagp, U32 depth,
                  * We take advantage of the enum ordering of the charset
                  * modifiers to get the exact node type, */
                 default:
+                  treat_as_default:
                     op = POSIXD + get_regex_charset(RExC_flags);
                     if (op > POSIXA) { /* /aa is same as /a */
                         op = POSIXA;
                     }
 
-                join_posix:
+                  join_posix:
                     /* The odd numbered ones are the complements of the
                      * next-lower even number one */
                     if (namedclass % 2 == 1) {
@@ -16638,7 +16655,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const 
regnode *o, const regmatch_
                         sv_catpv(sv, t);
                     }
 
-                out_dump:
+                  out_dump:
 
                     Safefree(origs);
                     SvREFCNT_dec_NN(lv);
diff --git a/regexec.c b/regexec.c
index 6a209ad..dc940c3 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1845,7 +1845,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, 
char *s,
 
         /* FALLTHROUGH */
 
-    do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
+      do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
                            are no glitches with fold-length differences
                            between the target string and pattern */
 
@@ -1879,8 +1879,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, 
char *s,
         }
         break;
 
-    do_exactf_utf8:
-    {
+      do_exactf_utf8:
+      {
         unsigned expansion;
 
         /* If one of the operands is in utf8, we can't use the simpler folding
@@ -2021,7 +2021,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, 
char *s,
         }
         else {
 
-      posix_utf8:
+          posix_utf8:
             classnum = (_char_class_number) FLAGS(c);
             if (classnum < _FIRST_NON_SWASH_CC) {
                 while (s < strend) {
@@ -4704,7 +4704,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, 
regnode *prog)
 
        case BOUNDA:  /*  /\b/a  */
 
-      bound_ascii_match_only:
+          bound_ascii_match_only:
             /* Here the string isn't utf8, or is utf8 and only ascii characters
              * are to match \w.  In the latter case looking at the byte just
              * prior to the current one may be just the final byte of a
@@ -4887,7 +4887,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, 
regnode *prog)
                 locinput += 2;
             }
             else {  /* Handle above Latin-1 code points */
-          utf8_posix_above_latin1:
+              utf8_posix_above_latin1:
                 classnum = (_char_class_number) FLAGS(scan);
                 if (classnum < _FIRST_NON_SWASH_CC) {
 
@@ -5143,7 +5143,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, 
regnode *prog)
                         locinput += UTF8SKIP(locinput);
                     }
                }
-            exit_utf8:
+              exit_utf8:
                if (locinput > reginfo->strend) sayNO;
            }
            break;
@@ -5580,7 +5580,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, 
regnode *prog)
                 maxopenparen = 0;
                 /* run the pattern returned from (??{...}) */
 
-        eval_recurse_doit: /* Share code with GOSUB below this line
+              eval_recurse_doit: /* Share code with GOSUB below this line
                             * At this point we expect the stack context to be
                             * set up correctly */
 
@@ -6682,7 +6682,7 @@ NULL
             /* NOTREACHED */
            NOT_REACHED;
 
-       curly_try_B_max:
+          curly_try_B_max:
            /* a successful greedy match: now try to match B */
             if (cur_eval && cur_eval->u.eval.close_paren &&
                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
@@ -6732,7 +6732,7 @@ NULL
 #undef ST
 
        case END: /*  last op of main pattern  */
-           fake_end:
+          fake_end:
            if (cur_eval) {
                /* we've just finished A in /(??{A})B/; now continue with B */
 
@@ -6983,7 +6983,7 @@ NULL
 
         /* this is a point to jump to in order to increment
          * locinput by one character */
-        increment_locinput:
+          increment_locinput:
             assert(!NEXTCHR_IS_EOS);
             if (utf8_target) {
                 locinput += PL_utf8skip[nextchr];
@@ -7376,7 +7376,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const 
regnode *p,
     case EXACTFU:
        utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
 
-    do_exactf: {
+      do_exactf: {
         int c1, c2;
         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
 
@@ -7537,7 +7537,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const 
regnode *p,
             }
        }
        else {
-      utf8_posix:
+          utf8_posix:
             classnum = (_char_class_number) FLAGS(p);
             if (classnum < _FIRST_NON_SWASH_CC) {
 

--
Perl5 Master Repository

[perl.git] branch blead, updated. v5.21.8-48-g2b2266b

Reply via email to