Change 34769: create new unicode props as defined in POSIX spec (optionally use them in the regex engine)

Yves Orton Fri, 07 Nov 2008 12:30:18 -0800

Change 34769 by [EMAIL PROTECTED] on 2008/11/07 20:20:21

        create new unicode props as defined in POSIX spec (optionally use them 
in the regex engine)
        
        Perlbug #60156 and #49302 (and probably others) resolve down to the 
problem
        that the definition of \s and \w and \d and the POSIX charclasses are 
different
        for unicode strings and for non-unicode strings. This broke the 
character class 
        logic in the regex engine. The easiest fix to make the character class 
logic sane
        again is to define new properties which do match.
        
        This change creates new property classes that can be used instead of the
        traditional ones (it does not change the previously defined ones). If 
the
        define in regcomp.h:
        
        #define PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS 1
        
        is changed to 0, then the new mappings will be used. This will fix a 
bunch 
        of bugs that are reported as TODO items in the new reg_posixcc.t test 
file.


Affected files ...

... //depot/perl/MANIFEST#1745 edit
... //depot/perl/lib/unicore/mktables#48 edit
... //depot/perl/regcomp.c#674 edit
... //depot/perl/regcomp.h#136 edit
... //depot/perl/t/op/pat.t#315 edit
... //depot/perl/t/op/reg_posixcc.t#1 add

Differences ...

==== //depot/perl/MANIFEST#1745 (text) ====
Index: perl/MANIFEST
--- perl/MANIFEST#1744~34761~   2008-11-07 02:32:32.000000000 -0800
+++ perl/MANIFEST       2008-11-07 12:20:21.000000000 -0800
@@ -4032,6 +4032,7 @@
 t/op/recurse.t                 See if deep recursion works
 t/op/ref.t                     See if refs and objects work
 t/op/reg_email.t               See if regex recursion works by parsing email 
addresses
+t/op/reg_posixcc.t             See if posix characterclasses behave 
consistantly
 t/op/reg_email_thr.t           See if regex recursion works by parsing email 
addresses in another thread
 t/op/regexp_noamp.t            See if regular expressions work with 
optimizations
 t/op/regexp_notrie.t           See if regular expressions work without trie 
optimisation

==== //depot/perl/lib/unicore/mktables#48 (text) ====
Index: perl/lib/unicore/mktables
--- perl/lib/unicore/mktables#47~31026~ 2007-04-23 02:04:31.000000000 -0700
+++ perl/lib/unicore/mktables   2008-11-07 12:20:21.000000000 -0800
@@ -779,6 +779,31 @@
 ##
 ## Process UnicodeData.txt (Categories, etc.)
 ##
+# These are the character mappings as defined in the POSIX standard
+# and in the case of PerlSpace and PerlWord as is defined in the test macros
+# for binary strings. IOW, PerlWord is [A-Za-z_] and PerlSpace is [\f\r\n\t ]
+# This differs from Word and the existing SpacePerl (note the prefix/suffix 
difference)
+# which is basically the Unicode WhiteSpace without the vertical tab included
+#
+my %TRUE_POSIX_PERL_CC= (
+    PosixAlnum => { map { $_ => 1 } ( 0x0030..0x0039, 0x0041..0x005a, 
0x0061..0x007a )},
+    PosixAlpha => { map { $_ => 1 } ( 0x0041..0x005a, 0x0061..0x007a )},
+    # Not Needed: Ascii => { map { $_ => 1 } ( 0x0000..0x007f )},
+    PosixBlank => { map { $_ => 1 } ( 0x0009, 0x0020 )},
+    PosixCntrl => { map { $_ => 1 } ( 0x0000..0x001f, 0x007f )},
+    PosixGraph => { map { $_ => 1 } ( 0x0021..0x007e )},
+    PosixLower => { map { $_ => 1 } ( 0x0061..0x007a )},
+    PosixPrint => { map { $_ => 1 } ( 0x0020..0x007e )},
+    PosixPunct => { map { $_ => 1 } ( 0x0021..0x002f, 0x003a..0x0040, 
0x005b..0x0060, 0x007b..0x007e )},
+    PosixSpace => { map { $_ => 1 } ( 0x0009..0x000d, 0x0020 )},
+    PosixUpper => { map { $_ => 1 } ( 0x0041..0x005a )},
+    # Not needed:  PosixXdigit => { map { $_ => 1 } ( 0x0030..0x0039, 
0x0041..0x0046, 0x0061..0x0066 )},
+    PosixDigit => { map { $_ => 1 } ( 0x0030..0x0039 )},
+    
+    PerlSpace  => { map { $_ => 1 } ( 0x0009..0x000a, 0x000c..0x000d, 0x0020 
)},
+    PerlWord   => { map { $_ => 1 } ( 0x0030..0x0039, 0x0041..0x005a, 0x005f, 
0x0061..0x007a )},
+);
+
 sub UnicodeData_Txt()
 {
     my $Bidi     = Table->New();
@@ -795,7 +820,7 @@
     $DC{can} = Table->New();
     $DC{com} = Table->New();
 
-    ## Initialize Perl-generated categories
+    ## Initialize Broken Perl-generated categories
     ## (Categories from UnicodeData.txt are auto-initialized in gencat)
     $Cat{Alnum}  =
        Table->New(Is => 'Alnum',  Desc => "[[:Alnum:]]",  Fuzzy => 0);
@@ -839,6 +864,10 @@
     $To{Title} = Table->New();
     $To{Digit} = Table->New();
 
+    foreach my $cat (keys %TRUE_POSIX_PERL_CC) {
+        $Cat{$cat} = Table->New(Is=>$cat, Fuzzy => 0);
+    }
+
     sub gencat($$$$)
     {
         my ($name, ## Name ("LATIN CAPITAL LETTER A")
@@ -920,6 +949,13 @@
         $Cat{XDigit}->$op($code) if ($code >= 0x30 && $code <= 0x39)  ## 0..9
                                  || ($code >= 0x41 && $code <= 0x46)  ## A..F
                                  || ($code >= 0x61 && $code <= 0x66); ## a..f
+        if ($code<=0x7F) {
+            foreach my $cat (keys %TRUE_POSIX_PERL_CC) {
+                if ($TRUE_POSIX_PERL_CC{$cat}{$code}) {
+                    $Cat{$cat}->$op($code);
+                }
+            }
+        }
     }
 
     ## open ane read file.....

==== //depot/perl/regcomp.c#674 (text) ====
Index: perl/regcomp.c
--- perl/regcomp.c#673~34766~   2008-11-07 03:45:22.000000000 -0800
+++ perl/regcomp.c      2008-11-07 12:20:21.000000000 -0800
@@ -7804,6 +7804,22 @@
     what = WORD;                                        \
     break
 
+/* 
+   We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
+   so that it is possible to override the option here without having to 
+   rebuild the entire core. as we are required to do if we change regcomp.h
+   which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
+*/
+#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
+#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
+#endif
+
+#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
+#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
+#else
+#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
+#endif
+
 /*
    parse a class specification and produce either an ANYOF node that
    matches the pattern or if the pattern matches a single char only and
@@ -8092,18 +8108,24 @@
                 * A similar issue a little earlier when switching on value.
                 * --jhi */
                switch ((I32)namedclass) {
+               
+               case _C_C_T_(ALNUMC, isALNUMC(value), 
POSIX_CC_UNI_NAME("Alnum"));
+               case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
+               case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
+               case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
+               case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
+               case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
+               case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
+               case _C_C_T_(PSXSPC, isPSXSPC(value), 
POSIX_CC_UNI_NAME("Space"));
+               case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
+               case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
+#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
                case _C_C_T_(ALNUM, isALNUM(value), "Word");
-               case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
-               case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
-               case _C_C_T_(BLANK, isBLANK(value), "Blank");
-               case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
-               case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
-               case _C_C_T_(LOWER, isLOWER(value), "Lower");
-               case _C_C_T_(PRINT, isPRINT(value), "Print");
-               case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
-               case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
                case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
-               case _C_C_T_(UPPER, isUPPER(value), "Upper");
+#else
+               case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
+               case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
+#endif         
                case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
                case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), 
"VertSpace");
                case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), 
"HorizSpace");
@@ -8150,7 +8172,7 @@
                            ANYOF_BITMAP_SET(ret, value);
                    }
                    yesno = '+';
-                   what = "Digit";
+                   what = POSIX_CC_UNI_NAME("Digit");
                    break;
                case ANYOF_NDIGIT:
                    if (LOC)
@@ -8163,7 +8185,7 @@
                            ANYOF_BITMAP_SET(ret, value);
                    }
                    yesno = '!';
-                   what = "Digit";
+                   what = POSIX_CC_UNI_NAME("Digit");
                    break;              
                case ANYOF_MAX:
                    /* this is to handle \p and \P */

==== //depot/perl/regcomp.h#136 (text) ====
Index: perl/regcomp.h
--- perl/regcomp.h#135~34755~   2008-11-06 10:48:28.000000000 -0800
+++ perl/regcomp.h      2008-11-07 12:20:21.000000000 -0800
@@ -18,6 +18,24 @@
 /* Be really agressive about optimising patterns with trie sequences? */
 #define PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 1
 
+/* Use old style unicode mappings for perl and posix character classes
+ *
+ * NOTE: Enabling this essentially breaks character class matching against 
unicode 
+ * strings, so that POSIX char classes match when they shouldn't, and \d 
matches 
+ * way more than 10 characters, and sometimes a charclass and its complement 
either
+ * both match or neither match.
+ * NOTE: Disabling this will cause various backwards compatibility issues to 
rear 
+ * their head, and tests to fail. However it will make the charclass behaviour 
+ * consistant regardless of internal string type, and make character class 
inversions
+ * consistant. The tests that fail in the regex engine are basically broken 
tests.
+ *
+ * Personally I think 5.12 should disable this for sure. Its a bit more 
debatable for
+ * 5.10, so for now im leaving it enabled.
+ *
+ * -demerphq
+ */
+#define PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS 1
+
 /* Should the optimiser take positive assertions into account? */
 #define PERL_ENABLE_POSITIVE_ASSERTION_STUDY 0
 

==== //depot/perl/t/op/pat.t#315 (xtext) ====
Index: perl/t/op/pat.t
--- perl/t/op/pat.t#314~34581~  2008-10-25 02:12:05.000000000 -0700
+++ perl/t/op/pat.t     2008-11-07 12:20:21.000000000 -0800
@@ -4617,6 +4617,9 @@
 }
 
 SKIP: {
+    # XXX: This set of tests is essentially broken, POSIX character classes
+    # should not have differing definitions under unicode. 
+    # There are property names for that.
     unless ($ordA == 65) { skip("Assumes ASCII", 4) }
 
     my @notIsPunct = grep {/[[:punct:]]/ and not /\p{IsPunct}/}

==== //depot/perl/t/op/reg_posixcc.t#1 (text) ====
Index: perl/t/op/reg_posixcc.t
--- /dev/null   2008-11-04 07:18:13.288883315 -0800
+++ perl/t/op/reg_posixcc.t     2008-11-07 12:20:21.000000000 -0800
@@ -0,0 +1,127 @@
+#!perl
+
+BEGIN {
+    chdir 't' if -d 't';
+    @INC = '../lib';
+}
+
+use strict;
+use warnings;
+use Test::More tests => 1;
+my @pats=(
+            "\\w",
+           "\\W",
+           "\\s",
+           "\\S",
+           "\\d",
+           "\\D",
+           "[:alnum:]",
+           "[:^alnum:]",
+           "[:alpha:]",
+           "[:^alpha:]",
+           "[:ascii:]",
+           "[:^ascii:]",
+           "[:cntrl:]",
+           "[:^cntrl:]",
+           "[:graph:]",
+           "[:^graph:]",
+           "[:lower:]",
+           "[:^lower:]",
+           "[:print:]",
+           "[:^print:]",
+           "[:punct:]",
+           "[:^punct:]",
+           "[:upper:]",
+           "[:^upper:]",
+           "[:xdigit:]",
+           "[:^xdigit:]",
+           "[:space:]",
+           "[:^space:]",
+           "[:blank:]",
+           "[:^blank:]" );
+sub rangify {
+    my $ary= shift;
+    my $fmt= shift || '%d';
+    my $sep= shift || ' ';
+    my $rng= shift || '..';
+    
+    
+    my $first= $ary->[0];
+    my $last= $ary->[0];
+    my $ret= sprintf $fmt, $first;
+    for my $idx (1..$#$ary) {
+        if ( $ary->[$idx] != $last + 1) {
+            if ($last!=$first) {
+                $ret.=sprintf "%s$fmt",$rng, $last;
+            }             
+            $first= $last= $ary->[$idx];
+            $ret.=sprintf "%s$fmt",$sep,$first;
+         } else {
+            $last= $ary->[$idx];
+         }
+    }
+    if ( $last != $first) {
+        $ret.=sprintf "%s$fmt",$rng, $last;
+    }
+    return $ret;
+}
+
+my $description = "";
+while (@pats) {
+    my ($yes,$no)= splice @pats,0,2;
+    
+    my %err_by_type;
+    my %singles;
+    foreach my $b (0..255) {
+        my %got;
+        for my $type ('unicode','not-unicode') {
+            my $str=chr($b).chr($b);
+            if ($type eq 'unicode') {
+                $str.=chr(256);
+                chop $str;
+            }
+            if ($str=~/[$yes][$no]/) {
+                push @{$err_by_type{$type}},$b;
+            }
+            $got{"[$yes]"}{$type} = $str=~/[$yes]/ ? 1 : 0;
+            $got{"[$no]"}{$type} = $str=~/[$no]/ ? 1 : 0;
+            $got{"[^$yes]"}{$type} = $str=~/[^$yes]/ ? 1 : 0;
+            $got{"[^$no]"}{$type} = $str=~/[^$no]/ ? 1 : 0;
+        }
+        foreach my $which ("[$yes]","[$no]","[^$yes]","[^$no]") {
+            if ($got{$which}{'unicode'} != $got{$which}{'not-unicode'}) {
+                push @{$singles{$which}},$b;
+            }
+        }
+    }
+    
+    
+    if (%err_by_type || %singles) {
+        $description||=" Error:\n";
+        $description .= "/[$yes][$no]/\n";
+        if (%err_by_type) {
+            foreach my $type (keys %err_by_type) {
+                $description .= "\tmatches $type codepoints:\t";
+                $description .= rangify($err_by_type{$type});
+                $description .= "\n";
+            }
+            $description .= "\n";
+        }
+        if (%singles) {
+            $description .= "Unicode/Nonunicode mismatches:\n";
+            foreach my $type (keys %singles) {
+                $description .= "\t$type:\t";
+                $description .= rangify($singles{$type});
+                $description .= "\n";
+            }
+            $description .= "\n";
+        }
+     
+    }
+    
+}
+TODO: {
+    local $TODO = "Only works under PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS = 
0";
+    is( $description, "", "POSIX and perl charclasses should not depend on 
string type");
+};
+__DATA__
End of Patch.

Change 34769: create new unicode props as defined in POSIX spec (optionally use them in the regex engine)

Reply via email to