[perl.git] branch blead, updated. v5.17.4-84-g8ebfc29

Karl Williamson Wed, 26 Sep 2012 10:50:35 -0700

In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/8ebfc29a8e2bb27462b73f5459f31ca6287da41f?hp=2dc12fc0f84e0e6151d5266f167d197b834631ca>


- Log -----------------------------------------------------------------
commit 8ebfc29a8e2bb27462b73f5459f31ca6287da41f
Author: Karl Williamson <[email protected]>
Date:   Wed Sep 12 10:57:54 2012 -0600

    regcomp.c: Add a less confusing #define alias
    
    ALNUM (meaning \w) is too close to ALNUMC ([[:alnum:]]) for comfort

M       regcomp.c
M       regcomp.h

commit 963df1c93d511b0f1b6c15ae32dae824c66183ba
Author: Karl Williamson <[email protected]>
Date:   Wed Sep 12 10:04:29 2012 -0600

    regcomp.c: Properly handle no isblank(), isascii()
    
    Configure probes whether or not these two C library functions are
    present or not.  (However until commit
    1c6eef9acffe4b512210edba79119e423ea4874a it didn't find isblank() even
    if present.)
    
    However the code changed by this commit always presumed both functions
    were present.  That there were no failure reports from the field
    indicates that Perl is being run on systems where they are present.

M       regcomp.c

commit bc943be5e61ddc40e1b4dccb3f22d5adc6a48173
Author: Karl Williamson <[email protected]>
Date:   Mon Sep 17 15:47:17 2012 -0600

    pod/perlrecharclass.pod: Small corrections, typos

M       pod/perlrecharclass.pod

commit 94df5432700afa9b1cda1919857f958a0af99066
Author: Karl Williamson <[email protected]>
Date:   Wed Sep 12 19:43:02 2012 -0600

    perlvar.pod: Document UTF8CACHE

M       pod/perlvar.pod
-----------------------------------------------------------------------

Summary of changes:
 pod/perlrecharclass.pod |   22 ++++++------
 pod/perlvar.pod         |    4 ++-
 regcomp.c               |   89 ++++++++++++++++++++++++++++++++++++++--------
 regcomp.h               |    6 ++-
 4 files changed, 91 insertions(+), 30 deletions(-)

diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod
index 06d206b..a273a77 100644
--- a/pod/perlrecharclass.pod
+++ b/pod/perlrecharclass.pod
@@ -345,9 +345,9 @@ C</\pLl/> is valid, but means something different.
 It matches a two character string: a letter (Unicode property C<\pL>),
 followed by a lowercase C<l>.
 
-If neither the C</a> modifier nor locale rules are in effect, the use of
+If locale rules are not in effect, the use of
 a Unicode property will force the regular expression into using Unicode
-rules.
+rules, if it isn't already.
 
 Note that almost all properties are immune to case-insensitive matching.
 That is, adding a C</i> regular expression modifier does not change what
@@ -707,7 +707,7 @@ plus 127 (C<DEL>) are control characters.
 
 On EBCDIC platforms, it is likely that the code page will define C<[[:cntrl:]]>
 to be the EBCDIC equivalents of the ASCII controls, plus the controls
-that in Unicode have code pointss from 128 through 159.
+that in Unicode have code points from 128 through 159.
 
 =item [3]
 
@@ -840,11 +840,11 @@ either construct raises an exception.
  /[01[:lower:]]/          # Matches a character that is either a
                           # lowercase letter, or '0' or '1'.
  /[[:digit:][:^xdigit:]]/ # Matches a character that can be anything
-                         # except the letters 'a' to 'f'.  This is
-                         # because the main character class is composed
-                         # of two POSIX character classes that are ORed
-                         # together, one that matches any digit, and
-                         # the other that matches anything that isn't a
-                         # hex digit.  The result matches all
-                         # characters except the letters 'a' to 'f' and
-                         # 'A' to 'F'.
+                          # except the letters 'a' to 'f' and 'A' to
+                          # 'F'.  This is because the main character
+                          # class is composed of two POSIX character
+                          # classes that are ORed together, one that
+                          # matches any digit, and the other that
+                          # matches anything that isn't a hex digit.
+                          # The OR adds the digits, leaving only the
+                          # letters 'a' to 'f' and 'A' to 'F' excluded.
diff --git a/pod/perlvar.pod b/pod/perlvar.pod
index fc99b8e..3fb9899 100644
--- a/pod/perlvar.pod
+++ b/pod/perlvar.pod
@@ -2082,7 +2082,9 @@ This variable controls the state of the internal UTF-8 
offset caching code.
 1 for on (the default), 0 for off, -1 to debug the caching code by checking
 all its results against linear scans, and panicking on any discrepancy.
 
-This variable was added in Perl v5.8.9.
+This variable was added in Perl v5.8.9.  It is subject to change or
+removal without notice, but is currently used to avoid recalculating the
+boundaries of multi-byte UTF-8-encoded characters.
 
 =item ${^UTF8LOCALE}
 X<${^UTF8LOCALE}>
diff --git a/regcomp.c b/regcomp.c
index 61b52c9..9d4fa71 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -91,6 +91,12 @@ extern const struct regexp_engine my_reg_engine;
 #include "inline_invlist.c"
 #include "unicode_constants.h"
 
+#ifdef HAS_ISBLANK
+#   define hasISBLANK 1
+#else
+#   define hasISBLANK 0
+#endif
+
 #define HAS_NONLATIN1_FOLD_CLOSURE(i) 
_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 #define IS_NON_FINAL_FOLD(c) 
_IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 
@@ -4209,7 +4215,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                case ALNUM:
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
-                           ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
+                           
ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NWORDCHAR);
                             if (OP(scan) == ALNUMU) {
                                 for (value = 0; value < 256; value++) {
                                     if (!isWORDCHAR_L1(value)) {
@@ -4227,7 +4233,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                    }
                    else {
                        if (data->start_class->flags & ANYOF_LOCALE)
-                           ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
+                           ANYOF_CLASS_SET(data->start_class,ANYOF_WORDCHAR);
 
                        /* Even if under locale, set the bits for non-locale
                         * in case it isn't a true locale-node.  This will
@@ -4250,7 +4256,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                case NALNUM:
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
-                           ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
+                           ANYOF_CLASS_CLEAR(data->start_class,ANYOF_WORDCHAR);
                             if (OP(scan) == NALNUMU) {
                                 for (value = 0; value < 256; value++) {
                                     if (isWORDCHAR_L1(value)) {
@@ -4268,7 +4274,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                    }
                    else {
                        if (data->start_class->flags & ANYOF_LOCALE)
-                           ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
+                           ANYOF_CLASS_SET(data->start_class,ANYOF_NWORDCHAR);
 
                        /* Even if under locale, set the bits for non-locale in
                         * case it isn't a true locale-node.  This will create
@@ -11118,7 +11124,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 
value)
                    switch (skip) {
                    case 4:
                        if (memEQ(posixcc, "word", 4)) /* this is not POSIX, 
this is the Perl \w */
-                           namedclass = ANYOF_ALNUM;
+                           namedclass = ANYOF_WORDCHAR;
                        break;
                    case 5:
                        /* Names all of length 5.  */
@@ -11559,8 +11565,8 @@ parseit:
             * A similar issue a little bit later when switching on
             * namedclass. --jhi */
            switch ((I32)value) {
-           case 'w':   namedclass = ANYOF_ALNUM;       break;
-           case 'W':   namedclass = ANYOF_NALNUM;      break;
+           case 'w':   namedclass = ANYOF_WORDCHAR;    break;
+           case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
            case 's':   namedclass = ANYOF_SPACE;       break;
            case 'S':   namedclass = ANYOF_NSPACE;      break;
            case 'd':   namedclass = ANYOF_DIGIT;       break;
@@ -11854,32 +11860,83 @@ parseit:
                         runtime_posix_matches_above_Unicode);
                    break;
                case ANYOF_ASCII:
+#ifdef HAS_ISASCII
                    if (LOC) {
                        ANYOF_CLASS_SET(ret, namedclass);
                    }
-                    else {
+                    else
+#endif  /* Not isascii(); just use the hard-coded definition for it */
                         _invlist_union(posixes, PL_ASCII, &posixes);
-                    }
                    break;
                case ANYOF_NASCII:
+#ifdef HAS_ISASCII
                    if (LOC) {
                        ANYOF_CLASS_SET(ret, namedclass);
                    }
                     else {
+#endif
                         _invlist_union_complement_2nd(posixes,
                                                     PL_ASCII, &posixes);
                         if (DEPENDS_SEMANTICS) {
                             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
                         }
+#ifdef HAS_ISASCII
                     }
+#endif
                    break;
                case ANYOF_BLANK:
-                    DO_POSIX(ret, namedclass, posixes,
+                    if (hasISBLANK || ! LOC) {
+                        DO_POSIX(ret, namedclass, posixes,
                                             PL_PosixBlank, PL_XPosixBlank);
+                    }
+                    else { /* There is no isblank() and we are in locale:  We
+                              use the ASCII range and the above-Latin1 range
+                              code points */
+                        SV* scratch_list = NULL;
+
+                        /* Include all above-Latin1 blanks */
+                        _invlist_intersection(PL_AboveLatin1,
+                                              PL_XPosixBlank,
+                                              &scratch_list);
+                        /* Add it to the running total of posix classes */
+                        if (! posixes) {
+                            posixes = scratch_list;
+                        }
+                        else {
+                            _invlist_union(posixes, scratch_list, &posixes);
+                            SvREFCNT_dec(scratch_list);
+                        }
+                        /* Add the ASCII-range blanks to the running total. */
+                        _invlist_union(posixes, PL_PosixBlank, &posixes);
+                    }
                    break;
                case ANYOF_NBLANK:
-                    DO_N_POSIX(ret, namedclass, posixes,
-                                            PL_PosixBlank, PL_XPosixBlank);
+                    if (hasISBLANK || ! LOC) {
+                        DO_N_POSIX(ret, namedclass, posixes,
+                                                PL_PosixBlank, PL_XPosixBlank);
+                    }
+                    else { /* There is no isblank() and we are in locale */
+                        SV* scratch_list = NULL;
+
+                        /* Include all above-Latin1 non-blanks */
+                        _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, 
&scratch_list);
+
+                        /* Add them to the running total of posix classes */
+                        _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, 
&scratch_list);
+                        if (! posixes) {
+                            posixes = scratch_list;
+                        }
+                        else {
+                            _invlist_union(posixes, scratch_list, &posixes);
+                            SvREFCNT_dec(scratch_list);
+                        }
+
+                        /* Get the list of all non-ASCII-blanks in Latin 1, and
+                         * add them to the running total */
+                        _invlist_subtract(PL_Latin1, PL_PosixBlank, 
&scratch_list);
+                        _invlist_union(posixes, scratch_list, &posixes);
+                        SvREFCNT_dec(scratch_list);
+                    }
                    break;
                case ANYOF_CNTRL:
                     DO_POSIX(ret, namedclass, posixes,
@@ -12015,11 +12072,11 @@ parseit:
                    }
                    break;
                }
-               case ANYOF_ALNUM:   /* Really is 'Word' */
+               case ANYOF_WORDCHAR:
                    DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", 
listsv);
                    break;
-               case ANYOF_NALNUM:
+               case ANYOF_NWORDCHAR:
                    DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
                             runtime_posix_matches_above_Unicode);
@@ -12150,10 +12207,10 @@ parseit:
                  * modifier to the regex.  We first calculate the base node
                  * type, and if it should be inverted */
 
-                case ANYOF_NALNUM:
+                case ANYOF_NWORDCHAR:
                     invert = ! invert;
                     /* FALLTHROUGH */
-                case ANYOF_ALNUM:
+                case ANYOF_WORDCHAR:
                     op = ALNUM;
                     goto join_charset_classes;
 
diff --git a/regcomp.h b/regcomp.h
index 6eb13f2..b54ac84 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -387,8 +387,8 @@ struct regnode_charclass_class {
 /* Should be synchronized with a table in regprop() */
 /* 2n should be the normal one, paired with its complement at 2n+1 */
 
-#define ANYOF_ALNUM    ((_CC_WORDCHAR) * 2)  /* \w, PL_utf8_alnum, 
utf8::IsWord, ALNUM */
-#define ANYOF_NALNUM   ((ANYOF_ALNUM) + 1)
+#define ANYOF_WORDCHAR ((_CC_WORDCHAR) * 2)  /* \w, PL_utf8_alnum, 
utf8::IsWord, ALNUM */
+#define ANYOF_NWORDCHAR   ((ANYOF_WORDCHAR) + 1)
 #define ANYOF_SPACE    ((_CC_SPACE) * 2)     /* \s */
 #define ANYOF_NSPACE   ((ANYOF_SPACE) + 1)
 #define ANYOF_DIGIT    ((_CC_DIGIT) * 2)     /* \d */
@@ -437,6 +437,8 @@ struct regnode_charclass_class {
 #define ANYOF_NALNUML   ANYOF_NALNUM
 #define ANYOF_SPACEL    ANYOF_SPACE
 #define ANYOF_NSPACEL   ANYOF_NSPACE
+#define ANYOF_ALNUM ANYOF_WORDCHAR
+#define ANYOF_NALNUM ANYOF_NWORDCHAR
 
 /* Utility macros for the bitmap and classes of ANYOF */
 

--
Perl5 Master Repository

[perl.git] branch blead, updated. v5.17.4-84-g8ebfc29

Reply via email to