In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/d5944336d74c819152158dabfd806d49ad0ecb21?hp=8eb023a9cc1ea46c4dc9b9bb6dd651817ac32889>
- Log ----------------------------------------------------------------- commit d5944336d74c819152158dabfd806d49ad0ecb21 Author: Karl Williamson <pub...@khwilliamson.com> Date: Sat Oct 30 10:13:48 2010 -0600 Add consistent synonyms for \p{PosxFOO} This patch adds a set of synonyms \p{XPosixFOO} for the full extended Unicode version of \p{PosixFOO}, so only one rule need be remembered. Similarly, \p{XPerlSpace} is added to preserve the rule for the one similar class that doesn't have Posix in its name. M lib/unicore/mktables M pod/perlrecharclass.pod commit b6dac59a93d03037bfa91e14bd72ebe78feb54ea Author: Karl Williamson <pub...@khwilliamson.com> Date: Sat Oct 30 10:13:35 2010 -0600 perlrecharclass: Nits M pod/perlrecharclass.pod commit f3a73f6e2e4498dda2550f2149d95e42cd551095 Author: Karl Williamson <pub...@khwilliamson.com> Date: Sat Oct 30 09:53:06 2010 -0600 mktables: Clarify \d description for perluniprops M lib/unicore/mktables commit 45e32b91012d25c005eeed1854b16d65b27931cb Author: Karl Williamson <pub...@khwilliamson.com> Date: Sat Oct 30 09:43:50 2010 -0600 mktables: Add tests for wrong equivalence attempts mktables allows for multiple tables to be made equivalent, which in Unix terminology means that they are essentially symbolic links. However this should happen only when they have the same code points in them to begin with. This adds a little more error checking. M lib/unicore/mktables ----------------------------------------------------------------------- Summary of changes: lib/unicore/mktables | 61 ++++++++++++++++++++++++++++++++++++---------- pod/perlrecharclass.pod | 57 +++++++++++++++++++++++++------------------ 2 files changed, 80 insertions(+), 38 deletions(-) diff --git a/lib/unicore/mktables b/lib/unicore/mktables index b7cda64..8a5c89a 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -6560,12 +6560,21 @@ sub trace { return main::trace(@_); } my $addr = do { no overloading; pack 'J', $self; }; my $current_leader = ($related) ? 
$parent{$addr} : $leader{$addr}; - if ($related && - ! $other->perl_extension - && ! $current_leader->perl_extension) - { - Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other"); - $related = 0; + if ($related) { + if ($current_leader->perl_extension) { + if ($other->perl_extension) { + Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent."); + return; + } + } elsif (! $other->perl_extension) { + Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other"); + $related = 0; + } + } + + if (! $self->is_empty && ! $self->matches_identically_to($other)) { + Carp::my_carp_bug("$self should be empty or match identically to $other. Not setting equivalent"); + return; } my $leader = do { no overloading; pack 'J', $current_leader; }; @@ -11121,7 +11130,8 @@ sub compile_perl() { # range, with their names prefaced by 'Posix', to signify that these match # what the Posix standard says they should match. A couple are # effectively this, but the name doesn't have 'Posix' in it because there - # just isn't any Posix equivalent. + # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended + # to the full Unicode range, by our guesses as to what is appropriate. # 'Any' is all code points. 
As an error check, instead of just setting it # to be that, construct it to be the union of all the major categories @@ -11186,6 +11196,7 @@ sub compile_perl() { $Lower->set_equivalent_to($gc->table('Lowercase_Letter'), Related => 1); } + $Lower->add_alias('XPosixLower'); $perl->add_match_table("PosixLower", Description => "[a-z]", Initialize => $Lower & $ASCII, @@ -11200,6 +11211,7 @@ sub compile_perl() { $Upper->set_equivalent_to($gc->table('Uppercase_Letter'), Related => 1); } + $Upper->add_alias('XPosixUpper'); $perl->add_match_table("PosixUpper", Description => "[A-Z]", Initialize => $Upper & $ASCII, @@ -11294,6 +11306,7 @@ sub compile_perl() { $Alpha += $gc->table('Nl') if defined $gc->table('Nl'); $Alpha->add_description('Alphabetic'); } + $Alpha->add_alias('XPosixAlpha'); $perl->add_match_table("PosixAlpha", Description => "[A-Za-z]", Initialize => $Alpha & $ASCII, @@ -11303,6 +11316,7 @@ sub compile_perl() { Description => 'Alphabetic and (Decimal) Numeric', Initialize => $Alpha + $gc->table('Decimal_Number'), ); + $Alnum->add_alias('XPosixAlnum'); $perl->add_match_table("PosixAlnum", Description => "[A-Za-z0-9]", Initialize => $Alnum & $ASCII, @@ -11312,14 +11326,16 @@ sub compile_perl() { Description => '\w, including beyond ASCII', Initialize => $Alnum + $gc->table('Mark'), ); + $Word->add_alias('XPosixWord'); my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1 $Word += $Pc if defined $Pc; # This is a Perl extension, so the name doesn't begin with Posix. - $perl->add_match_table('PerlWord', + my $PerlWord = $perl->add_match_table('PerlWord', Description => '\w, restricted to ASCII = [A-Za-z0-9_]', Initialize => $Word & $ASCII, ); + $PerlWord->add_alias('PosixWord'); my $Blank = $perl->add_match_table('Blank', Description => '\h, Horizontal white space', @@ -11332,6 +11348,7 @@ sub compile_perl() { - 0x200B, # ZWSP ); $Blank->add_alias('HorizSpace'); # Another name for it. 
+ $Blank->add_alias('XPosixBlank'); $perl->add_match_table("PosixBlank", Description => "\\t and ' '", Initialize => $Blank & $ASCII, @@ -11353,24 +11370,28 @@ sub compile_perl() { Description => '\s including beyond ASCII plus vertical tab', Initialize => $Blank + $VertSpace, ); + $Space->add_alias('XPosixSpace'); $perl->add_match_table("PosixSpace", Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)", Initialize => $Space & $ASCII, ); # Perl's traditional space doesn't include Vertical Tab - my $SpacePerl = $perl->add_match_table('SpacePerl', + my $XPerlSpace = $perl->add_match_table('XPerlSpace', Description => '\s, including beyond ASCII', Initialize => $Space - 0x000B, ); - $perl->add_match_table('PerlSpace', + $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym + my $PerlSpace = $perl->add_match_table('PerlSpace', Description => '\s, restricted to ASCII', - Initialize => $SpacePerl & $ASCII, + Initialize => $XPerlSpace & $ASCII, ); + my $Cntrl = $perl->add_match_table('Cntrl', Description => 'Control characters'); $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1); + $Cntrl->add_alias('XPosixCntrl'); $perl->add_match_table("PosixCntrl", Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ES ... 
[28 chars truncated] Initialize => $Cntrl & $ASCII, @@ -11387,6 +11408,7 @@ sub compile_perl() { Description => 'Characters that are graphical', Initialize => ~ ($Space + $controls), ); + $Graph->add_alias('XPosixGraph'); $perl->add_match_table("PosixGraph", Description => '[-!"#$%&\'()*+,./:;<>?...@[\\\]^_`{|}~0-9A-Za-z]', @@ -11397,6 +11419,7 @@ sub compile_perl() { Description => 'Characters that are graphical plus space characters (but no controls)', Initialize => $Blank + $Graph - $gc->table('Control'), ); + $print->add_alias('XPosixPrint'); $perl->add_match_table("PosixPrint", Description => '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?...@[\\\]^_`{|}~]', @@ -11407,15 +11430,20 @@ sub compile_perl() { $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1); # \p{punct} doesn't include the symbols, which posix does + my $XPosixPunct = $perl->add_match_table('XPosixPunct', + Description => '\p{Punct} + ASCII-range \p{Symbol}', + Initialize => $gc->table('Punctuation') + + ($ASCII & $gc->table('Symbol')), + ); $perl->add_match_table('PosixPunct', Description => '[-!"#$%&\'()*+,./:;<>?...@[\\\]^_`{|}~]', - Initialize => $ASCII & ($gc->table('Punctuation') - + $gc->table('Symbol')), + Initialize => $ASCII & $XPosixPunct, ); my $Digit = $perl->add_match_table('Digit', - Description => '\d, extended beyond just [0-9]'); + Description => '[0-9] + all other decimal digits'); $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1); + $Digit->add_alias('XPosixDigit'); my $PosixDigit = $perl->add_match_table("PosixDigit", Description => '[0-9]', Initialize => $Digit & $ASCII, @@ -11423,6 +11451,7 @@ sub compile_perl() { # Hex_Digit was not present in first release my $Xdigit = $perl->add_match_table('XDigit'); + $Xdigit->add_alias('XPosixXDigit'); my $Hex = property_ref('Hex_Digit'); if (defined $Hex && ! 
$Hex->is_empty) { $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1); @@ -11434,6 +11463,10 @@ sub compile_perl() { 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]); $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO'); } + $perl->add_match_table('PosixXDigit', + Initialize => $ASCII & $Xdigit, + Description => '[0-9A-Fa-f]', + ); my $dt = property_ref('Decomposition_Type'); $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical', diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod index 7cb2f78..7f96b4b 100644 --- a/pod/perlrecharclass.pod +++ b/pod/perlrecharclass.pod @@ -522,7 +522,8 @@ The other counterpart, in the column labelled "Full-range Unicode", matches any appropriate characters in the full Unicode character set. For example, C<\p{Alpha}> will match not just the ASCII alphabetic characters, but any character in the entire Unicode character set that is considered to be -alphabetic. +alphabetic. The backslash sequence column is a (short) synonym for +the Full-range Unicode form. (Each of the counterparts has various synonyms as well. L<perluniprops/Properties accessible through \p{} and \P{}> lists all the @@ -548,25 +549,25 @@ EBCDIC code page is present, they will behave in accordance with those; if absent, the classes will match only their ASCII-range counterparts. If you disagree with this proposal, send email to C<perl5-port...@perl.org>. 
- [[:...:]] ASCII-range Full-range backslash Note - Unicode Unicode sequence + [[:...:]] ASCII-range Full-range backslash Note + Unicode Unicode sequence ----------------------------------------------------- - alpha \p{PosixAlpha} \p{Alpha} - alnum \p{PosixAlnum} \p{Alnum} + alpha \p{PosixAlpha} \p{XPosixAlpha} + alnum \p{PosixAlnum} \p{XPosixAlnum} ascii \p{ASCII} - blank \p{PosixBlank} \p{Blank} = [1] - \p{HorizSpace} \h [1] - cntrl \p{PosixCntrl} \p{Cntrl} [2] - digit \p{PosixDigit} \p{Digit} \d - graph \p{PosixGraph} \p{Graph} [3] - lower \p{PosixLower} \p{Lower} - print \p{PosixPrint} \p{Print} [4] - punct \p{PosixPunct} \p{Punct} [5] - \p{PerlSpace} \p{SpacePerl} \s [6] - space \p{PosixSpace} \p{Space} [6] - upper \p{PosixUpper} \p{Upper} - word \p{PerlWord} \p{Word} \w - xdigit \p{ASCII_Hex_Digit} \p{XDigit} + blank \p{PosixBlank} \p{XPosixBlank} \h [1] + or \p{HorizSpace} [1] + cntrl \p{PosixCntrl} \p{XPosixCntrl} [2] + digit \p{PosixDigit} \p{XPosixDigit} \d + graph \p{PosixGraph} \p{XPosixGraph} [3] + lower \p{PosixLower} \p{XPosixLower} + print \p{PosixPrint} \p{XPosixPrint} [4] + punct \p{PosixPunct} \p{XPosixPunct} [5] + \p{PerlSpace} \p{XPerlSpace} \s [6] + space \p{PosixSpace} \p{XPosixSpace} [6] + upper \p{PosixUpper} \p{XPosixUpper} + word \p{PosixWord} \p{XPosixWord} \w + xdigit \p{ASCII_Hex_Digit} \p{XPosixXDigit} =over 4 @@ -595,7 +596,7 @@ of all the alphanumerical characters and all punctuation characters. All printable characters, which is the set of all the graphical characters plus whitespace characters that are not also controls. -=item [5] (punct) +=item [5] C<\p{PosixPunct}> and C<[[:punct:]]> in the ASCII range match all the non-controls, non-alphanumeric, non-space characters: @@ -621,6 +622,11 @@ matches the vertical tab, C<\cK>. Same for the two ASCII-only range forms. =back +There are various other synonyms that can be used for these besides +C<\p{HorizSpace}> and C<\p{XPosixBlank}>.
For example +C<\p{XPosixAlpha}> can be written as C<\p{Alpha}>. All are listed +in L<perluniprops/Properties accessible through \p{} and \P{}>. + =head4 Negation X<character class, negation> @@ -631,10 +637,12 @@ Some examples: POSIX ASCII-range Full-range backslash Unicode Unicode sequence ----------------------------------------------------- - [[:^digit:]] \P{PosixDigit} \P{Digit} \D - [[:^space:]] \P{PosixSpace} \P{Space} - \P{PerlSpace} \P{SpacePerl} \S - [[:^word:]] \P{PerlWord} \P{Word} \W + [[:^digit:]] \P{PosixDigit} \P{XPosixDigit} \D + [[:^space:]] \P{PosixSpace} \P{XPosixSpace} + \P{PerlSpace} \P{XPerlSpace} \S + [[:^word:]] \P{PosixWord} \P{XPosixWord} \W + +Again, the backslash sequence means Full-range Unicode. =head4 [= =] and [. .] @@ -683,7 +691,8 @@ A regular expression is marked for Unicode semantics if it is encoded in utf8 (usually as a result of including a literal character whose code point is above 255), or if it contains a C<\N{U+...}> or C<\N{I<name>}> construct, or (starting in Perl 5.14) if it was compiled in the scope of a -C<S<use feature "unicode_strings">> pragma. +C<S<use feature "unicode_strings">> pragma, or has the C<"u"> regular +expression modifier. The differences in behavior between locale and non-locale semantics can affect any character whose code point is 255 or less. The -- Perl5 Master Repository