In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/8ebfc29a8e2bb27462b73f5459f31ca6287da41f?hp=2dc12fc0f84e0e6151d5266f167d197b834631ca>
- Log ----------------------------------------------------------------- commit 8ebfc29a8e2bb27462b73f5459f31ca6287da41f Author: Karl Williamson <[email protected]> Date: Wed Sep 12 10:57:54 2012 -0600 regcomp.c: Add a less confusing #define alias ALNUM (meaning \w) is too close to ALNUMC ([[:alnum:]]) for comfort M regcomp.c M regcomp.h commit 963df1c93d511b0f1b6c15ae32dae824c66183ba Author: Karl Williamson <[email protected]> Date: Wed Sep 12 10:04:29 2012 -0600 regcomp.c: Properly handle no isblank(), isascii() Configure probes whether or not these two C library functions are present or not. (However until commit 1c6eef9acffe4b512210edba79119e423ea4874a it didn't find isblank() even if present.) However the code changed by this commit always presumed both functions were present. That there were no failure reports from the field indicates that Perl is being run on systems where they are present. M regcomp.c commit bc943be5e61ddc40e1b4dccb3f22d5adc6a48173 Author: Karl Williamson <[email protected]> Date: Mon Sep 17 15:47:17 2012 -0600 pod/perlrecharclass.pod: Small corrections, typos M pod/perlrecharclass.pod commit 94df5432700afa9b1cda1919857f958a0af99066 Author: Karl Williamson <[email protected]> Date: Wed Sep 12 19:43:02 2012 -0600 perlvar.pod: Document UTF8CACHE M pod/perlvar.pod ----------------------------------------------------------------------- Summary of changes: pod/perlrecharclass.pod | 22 ++++++------ pod/perlvar.pod | 4 ++- regcomp.c | 89 ++++++++++++++++++++++++++++++++++++++-------- regcomp.h | 6 ++- 4 files changed, 91 insertions(+), 30 deletions(-) diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod index 06d206b..a273a77 100644 --- a/pod/perlrecharclass.pod +++ b/pod/perlrecharclass.pod @@ -345,9 +345,9 @@ C</\pLl/> is valid, but means something different. It matches a two character string: a letter (Unicode property C<\pL>), followed by a lowercase C<l>. -If neither the C</a> modifier nor locale rules are in effect, the use of +If locale rules are not in effect, the use of a Unicode property will force the regular expression into using Unicode -rules. +rules, if it isn't already. Note that almost all properties are immune to case-insensitive matching. That is, adding a C</i> regular expression modifier does not change what @@ -707,7 +707,7 @@ plus 127 (C<DEL>) are control characters. On EBCDIC platforms, it is likely that the code page will define C<[[:cntrl:]]> to be the EBCDIC equivalents of the ASCII controls, plus the controls -that in Unicode have code pointss from 128 through 159. +that in Unicode have code points from 128 through 159. =item [3] @@ -840,11 +840,11 @@ either construct raises an exception. /[01[:lower:]]/ # Matches a character that is either a # lowercase letter, or '0' or '1'. /[[:digit:][:^xdigit:]]/ # Matches a character that can be anything - # except the letters 'a' to 'f'. This is - # because the main character class is composed - # of two POSIX character classes that are ORed - # together, one that matches any digit, and - # the other that matches anything that isn't a - # hex digit. The result matches all - # characters except the letters 'a' to 'f' and - # 'A' to 'F'. + # except the letters 'a' to 'f' and 'A' to + # 'F'. This is because the main character + # class is composed of two POSIX character + # classes that are ORed together, one that + # matches any digit, and the other that + # matches anything that isn't a hex digit. + # The OR adds the digits, leaving only the + # letters 'a' to 'f' and 'A' to 'F' excluded. diff --git a/pod/perlvar.pod b/pod/perlvar.pod index fc99b8e..3fb9899 100644 --- a/pod/perlvar.pod +++ b/pod/perlvar.pod @@ -2082,7 +2082,9 @@ This variable controls the state of the internal UTF-8 offset caching code. 1 for on (the default), 0 for off, -1 to debug the caching code by checking all its results against linear scans, and panicking on any discrepancy. -This variable was added in Perl v5.8.9. +This variable was added in Perl v5.8.9. It is subject to change or +removal without notice, but is currently used to avoid recalculating the +boundaries of multi-byte UTF-8-encoded characters. =item ${^UTF8LOCALE} X<${^UTF8LOCALE}> diff --git a/regcomp.c b/regcomp.c index 61b52c9..9d4fa71 100644 --- a/regcomp.c +++ b/regcomp.c @@ -91,6 +91,12 @@ extern const struct regexp_engine my_reg_engine; #include "inline_invlist.c" #include "unicode_constants.h" +#ifdef HAS_ISBLANK +# define hasISBLANK 1 +#else +# define hasISBLANK 0 +#endif + #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) @@ -4209,7 +4215,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, case ALNUM: if (flags & SCF_DO_STCLASS_AND) { if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM); + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NWORDCHAR); if (OP(scan) == ALNUMU) { for (value = 0; value < 256; value++) { if (!isWORDCHAR_L1(value)) { @@ -4227,7 +4233,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } else { if (data->start_class->flags & ANYOF_LOCALE) - ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM); + ANYOF_CLASS_SET(data->start_class,ANYOF_WORDCHAR); /* Even if under locale, set the bits for non-locale * in case it isn't a true locale-node. This will @@ -4250,7 +4256,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, case NALNUM: if (flags & SCF_DO_STCLASS_AND) { if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM); + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_WORDCHAR); if (OP(scan) == NALNUMU) { for (value = 0; value < 256; value++) { if (isWORDCHAR_L1(value)) { @@ -4268,7 +4274,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } else { if (data->start_class->flags & ANYOF_LOCALE) - ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM); + ANYOF_CLASS_SET(data->start_class,ANYOF_NWORDCHAR); /* Even if under locale, set the bits for non-locale in * case it isn't a true locale-node. This will create @@ -11118,7 +11124,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) switch (skip) { case 4: if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */ - namedclass = ANYOF_ALNUM; + namedclass = ANYOF_WORDCHAR; break; case 5: /* Names all of length 5. */ @@ -11559,8 +11565,8 @@ parseit: * A similar issue a little bit later when switching on * namedclass. --jhi */ switch ((I32)value) { - case 'w': namedclass = ANYOF_ALNUM; break; - case 'W': namedclass = ANYOF_NALNUM; break; + case 'w': namedclass = ANYOF_WORDCHAR; break; + case 'W': namedclass = ANYOF_NWORDCHAR; break; case 's': namedclass = ANYOF_SPACE; break; case 'S': namedclass = ANYOF_NSPACE; break; case 'd': namedclass = ANYOF_DIGIT; break; @@ -11854,32 +11860,83 @@ parseit: runtime_posix_matches_above_Unicode); break; case ANYOF_ASCII: +#ifdef HAS_ISASCII if (LOC) { ANYOF_CLASS_SET(ret, namedclass); } - else { + else +#endif /* Not isascii(); just use the hard-coded definition for it */ _invlist_union(posixes, PL_ASCII, &posixes); - } break; case ANYOF_NASCII: +#ifdef HAS_ISASCII if (LOC) { ANYOF_CLASS_SET(ret, namedclass); } else { +#endif _invlist_union_complement_2nd(posixes, PL_ASCII, &posixes); if (DEPENDS_SEMANTICS) { ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; } +#ifdef HAS_ISASCII } +#endif break; case ANYOF_BLANK: - DO_POSIX(ret, namedclass, posixes, + if (hasISBLANK || ! LOC) { + DO_POSIX(ret, namedclass, posixes, PL_PosixBlank, PL_XPosixBlank); + } + else { /* There is no isblank() and we are in locale: We + use the ASCII range and the above-Latin1 range + code points */ + SV* scratch_list = NULL; + + /* Include all above-Latin1 blanks */ + _invlist_intersection(PL_AboveLatin1, + PL_XPosixBlank, + &scratch_list); + /* Add it to the running total of posix classes */ + if (! posixes) { + posixes = scratch_list; + } + else { + _invlist_union(posixes, scratch_list, &posixes); + SvREFCNT_dec(scratch_list); + } + /* Add the ASCII-range blanks to the running total. */ + _invlist_union(posixes, PL_PosixBlank, &posixes); + } break; case ANYOF_NBLANK: - DO_N_POSIX(ret, namedclass, posixes, - PL_PosixBlank, PL_XPosixBlank); + if (hasISBLANK || ! LOC) { + DO_N_POSIX(ret, namedclass, posixes, + PL_PosixBlank, PL_XPosixBlank); + } + else { /* There is no isblank() and we are in locale */ + SV* scratch_list = NULL; + + /* Include all above-Latin1 non-blanks */ + _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, &scratch_list); + + /* Add them to the running total of posix classes */ + _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, &scratch_list); + if (! posixes) { + posixes = scratch_list; + } + else { + _invlist_union(posixes, scratch_list, &posixes); + SvREFCNT_dec(scratch_list); + } + + /* Get the list of all non-ASCII-blanks in Latin 1, and + * add them to the running total */ + _invlist_subtract(PL_Latin1, PL_PosixBlank, &scratch_list); + _invlist_union(posixes, scratch_list, &posixes); + SvREFCNT_dec(scratch_list); + } break; case ANYOF_CNTRL: DO_POSIX(ret, namedclass, posixes, @@ -12015,11 +12072,11 @@ parseit: } break; } - case ANYOF_ALNUM: /* Really is 'Word' */ + case ANYOF_WORDCHAR: DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv); break; - case ANYOF_NALNUM: + case ANYOF_NWORDCHAR: DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv, runtime_posix_matches_above_Unicode); @@ -12150,10 +12207,10 @@ parseit: * modifier to the regex. We first calculate the base node * type, and if it should be inverted */ - case ANYOF_NALNUM: + case ANYOF_NWORDCHAR: invert = ! invert; /* FALLTHROUGH */ - case ANYOF_ALNUM: + case ANYOF_WORDCHAR: op = ALNUM; goto join_charset_classes; diff --git a/regcomp.h b/regcomp.h index 6eb13f2..b54ac84 100644 --- a/regcomp.h +++ b/regcomp.h @@ -387,8 +387,8 @@ struct regnode_charclass_class { /* Should be synchronized with a table in regprop() */ /* 2n should be the normal one, paired with its complement at 2n+1 */ -#define ANYOF_ALNUM ((_CC_WORDCHAR) * 2) /* \w, PL_utf8_alnum, utf8::IsWord, ALNUM */ -#define ANYOF_NALNUM ((ANYOF_ALNUM) + 1) +#define ANYOF_WORDCHAR ((_CC_WORDCHAR) * 2) /* \w, PL_utf8_alnum, utf8::IsWord, ALNUM */ +#define ANYOF_NWORDCHAR ((ANYOF_WORDCHAR) + 1) #define ANYOF_SPACE ((_CC_SPACE) * 2) /* \s */ #define ANYOF_NSPACE ((ANYOF_SPACE) + 1) #define ANYOF_DIGIT ((_CC_DIGIT) * 2) /* \d */ @@ -437,6 +437,8 @@ struct regnode_charclass_class { #define ANYOF_NALNUML ANYOF_NALNUM #define ANYOF_SPACEL ANYOF_SPACE #define ANYOF_NSPACEL ANYOF_NSPACE +#define ANYOF_ALNUM ANYOF_WORDCHAR +#define ANYOF_NALNUM ANYOF_NWORDCHAR /* Utility macros for the bitmap and classes of ANYOF */ -- Perl5 Master Repository
