In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/0abd0d78a73da1c4d13b1c700526b7e5d03b32d4?hp=fe88edf0c4ada4230b84bdb5417029b8f766694a>

- Log -----------------------------------------------------------------
commit 0abd0d78a73da1c4d13b1c700526b7e5d03b32d4
Author: Yves Orton <[email protected]>
Date:   Sun Oct 25 20:37:08 2009 +0100

    disable non-unicode case insensitive trie matching
    
    Also revert 8902bb05b18c9858efa90229ca1ee42b17277554 as it merely
    masked one symptom of the deeper problems.
    
    Also fixes RT #69973, which was a segfault which was exposed by
    8902bb05, see the ticket for further details.
    
    http://rt.perl.org/rt3//Public/Bug/Display.html?id=69973
    
    At the core of this is the problem that in unicode matching a bunch
    of code points have case folding rules beyond just A-Z/a-z. Since
    the case folding rules are decided at runtime by the string, we can't
    use the same TRIE tables for both unicode/non-unicode matching.
    
    Until this is reconciled or some other solution is found, case-insensitive
    matching only gets the TRIE optimisation when the pattern is unicode.
    
    From CaseFolding.txt:
    
    00B5; C; 03BC; # MICRO SIGN
    00C0; C; 00E0; # LATIN CAPITAL LETTER A WITH GRAVE
    00C1; C; 00E1; # LATIN CAPITAL LETTER A WITH ACUTE
    00C2; C; 00E2; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX
    00C3; C; 00E3; # LATIN CAPITAL LETTER A WITH TILDE
    00C4; C; 00E4; # LATIN CAPITAL LETTER A WITH DIAERESIS
    00C5; C; 00E5; # LATIN CAPITAL LETTER A WITH RING ABOVE
    00C6; C; 00E6; # LATIN CAPITAL LETTER AE
    00C7; C; 00E7; # LATIN CAPITAL LETTER C WITH CEDILLA
    00C8; C; 00E8; # LATIN CAPITAL LETTER E WITH GRAVE
    00C9; C; 00E9; # LATIN CAPITAL LETTER E WITH ACUTE
    00CA; C; 00EA; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX
    00CB; C; 00EB; # LATIN CAPITAL LETTER E WITH DIAERESIS
    00CC; C; 00EC; # LATIN CAPITAL LETTER I WITH GRAVE
    00CD; C; 00ED; # LATIN CAPITAL LETTER I WITH ACUTE
    00CE; C; 00EE; # LATIN CAPITAL LETTER I WITH CIRCUMFLEX
    00CF; C; 00EF; # LATIN CAPITAL LETTER I WITH DIAERESIS
    00D0; C; 00F0; # LATIN CAPITAL LETTER ETH
    00D1; C; 00F1; # LATIN CAPITAL LETTER N WITH TILDE
    00D2; C; 00F2; # LATIN CAPITAL LETTER O WITH GRAVE
    00D3; C; 00F3; # LATIN CAPITAL LETTER O WITH ACUTE
    00D4; C; 00F4; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX
    00D5; C; 00F5; # LATIN CAPITAL LETTER O WITH TILDE
    00D6; C; 00F6; # LATIN CAPITAL LETTER O WITH DIAERESIS
    00D8; C; 00F8; # LATIN CAPITAL LETTER O WITH STROKE
    00D9; C; 00F9; # LATIN CAPITAL LETTER U WITH GRAVE
    00DA; C; 00FA; # LATIN CAPITAL LETTER U WITH ACUTE
    00DB; C; 00FB; # LATIN CAPITAL LETTER U WITH CIRCUMFLEX
    00DC; C; 00FC; # LATIN CAPITAL LETTER U WITH DIAERESIS
    00DD; C; 00FD; # LATIN CAPITAL LETTER Y WITH ACUTE
    00DE; C; 00FE; # LATIN CAPITAL LETTER THORN
    00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
-----------------------------------------------------------------------

Summary of changes:
 ext/re/t/regop.t |   12 ++++++------
 regcomp.c        |   17 +++++++++++------
 regexec.c        |    9 ++-------
 3 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/ext/re/t/regop.t b/ext/re/t/regop.t
index 9118bf6..46e6ec0 100644
--- a/ext/re/t/regop.t
+++ b/ext/re/t/regop.t
@@ -231,12 +231,12 @@ anchored "ABC" at 0
 #Freeing REx: 
"(\\.COM|\\.EXE|\\.BAT|\\.CMD|\\.VBS|\\.VBE|\\.JS|\\.JSE|\\."......
 %MATCHED%
 floating ""$ at 3..4 (checking floating)
-1:1[1] 3:2[1] 5:2[64] 45:83[1] 47:84[1] 48:85[0]
-stclass EXACTF <.> minlen 3
-Found floating substr ""$ at offset 30...
-Does not contradict STCLASS...
-Guessed: match at offset 26
-Matching stclass EXACTF <.> against ".exe"
+#1:1[1] 3:2[1] 5:2[64] 45:83[1] 47:84[1] 48:85[0]
+#stclass EXACTF <.> minlen 3
+#Found floating substr ""$ at offset 30...
+#Does not contradict STCLASS...
+#Guessed: match at offset 26
+#Matching stclass EXACTF <.> against ".exe"
 ---
 #Compiling REx "[q]"
 #size 12 nodes Got 100 bytes for offset annotations.
diff --git a/regcomp.c b/regcomp.c
index 6e9fa26..eb5f12f 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2833,13 +2833,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                                 }
                             } else {
 /* 
-    Currently we assume that the trie can handle unicode and ascii
-    matches fold cased matches. If this proves true then the following
-    define will prevent tries in this situation. 
-    
-    #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
-*/
+    Currently we do not believe that the trie logic can
+    handle case insensitive matching properly when the
+    pattern is not unicode (thus forcing unicode semantics).
+
+    If/when this is fixed the following define can be swapped
+    in below to fully enable trie logic.
+
 #define TRIE_TYPE_IS_SAFE 1
+
+*/
+#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
+
                                 if ( last && TRIE_TYPE_IS_SAFE ) {
                                     make_trie( pRExC_state, 
                                             startbranch, first, cur, tail, 
count, 
diff --git a/regexec.c b/regexec.c
index 402ede3..ec09c28 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1105,16 +1105,15 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, 
char *strpos,
 
 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
-    UV uvc_unfolded = 0;                                                   \
     switch (trie_type) {                                                    \
     case trie_utf8_fold:                                                    \
        if ( foldlen>0 ) {                                                  \
-           uvc_unfolded = uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, 
uniflags ); \
+           uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
            foldlen -= len;                                                 \
            uscan += len;                                                   \
            len=0;                                                          \
        } else {                                                            \
-           uvc_unfolded = uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, 
uniflags ); \
+           uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
            uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
            foldlen -= UNISKIP( uvc );                                      \
            uscan = foldbuf + UNISKIP( uvc );                               \
@@ -1140,7 +1139,6 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START {     
                  \
        uvc = (UV)*uc;                                                      \
        len = 1;                                                            \
     }                                                                       \
-                                                                           \
     if (uvc < 256) {                                                        \
        charid = trie->charmap[ uvc ];                                      \
     }                                                                       \
@@ -1153,9 +1151,6 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START {     
                  \
                charid = (U16)SvIV(*svpp);                                  \
        }                                                                   \
     }                                                                       \
-    if (!charid && trie_type == trie_utf8_fold && !UTF) {                  \
-       charid = trie->charmap[uvc_unfolded];                               \
-    }                                                                      \
 } STMT_END
 
 #define REXEC_FBC_EXACTISH_CHECK(CoNd)                 \

--
Perl5 Master Repository

Reply via email to