In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/2bdc80de969b38bd35347b37a3c6378bb5e97519?hp=42b68fb1f934ac50514f30d47e6528efa653e54f>
- Log ----------------------------------------------------------------- commit 2bdc80de969b38bd35347b37a3c6378bb5e97519 Author: Karl Williamson <[email protected]> Date: Wed Sep 26 21:40:22 2012 -0600 perlreguts: Fit long verbatim lines to 79 cols M pod/perlreguts.pod M t/porting/known_pod_issues.dat commit 8ee2793fb909a6ffc6d6ad6631ed31f3931b77ed Author: Karl Williamson <[email protected]> Date: Thu Sep 27 10:12:41 2012 -0600 mktables: Mention USourceData in generated pod These files were included by Unicode for the first time in the final version of its version 6.2. They document proposals for encoding Han characters in Unicode. As far as I can tell, they have no real use except to people working on such proposals. They are considered part of the Unicode Character Database, however, and should be mentioned in perluniprops as data that Perl ignores from that database. M lib/unicore/mktables commit caa7539541ed1f0d9fcf590a46c35f3255d8db61 Author: Karl Williamson <[email protected]> Date: Thu Sep 27 10:12:06 2012 -0600 mktables: Nits in comments, generated pod M lib/unicore/mktables ----------------------------------------------------------------------- Summary of changes: lib/unicore/mktables | 8 +++-- pod/perlreguts.pod | 74 +++++++++++++++++++++------------------ t/porting/known_pod_issues.dat | 1 - 3 files changed, 45 insertions(+), 38 deletions(-) diff --git a/lib/unicore/mktables b/lib/unicore/mktables index e779b08..633686f 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -1021,7 +1021,7 @@ if ($v_version ge v4.1.0) { $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. 
All code points previously matched by this have been moved to "Script=Common".'; } if ($v_version ge v6.0.0) { - $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana (or both)"'; + $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)'; $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"'; } @@ -1078,7 +1078,7 @@ END # The input files don't list every code point. Those not listed are to be # defaulted to some value. Below are hard-coded what those values are for # non-binary properties as of 5.1. Starting in 5.0, there are -# machine-parsable comment lines in the files the give the defaults; so this +# machine-parsable comment lines in the files that give the defaults; so this # list shouldn't have to be extended. The claim is that all missing entries # for binary properties will default to 'N'. Unicode tried to change that in # 5.2, but the beta period produced enough protest that they backed off. @@ -1149,6 +1149,8 @@ my %ignored_files = ( 'ReadMe.txt' => 'Documentation', 'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized. This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate U ... 
[48 chars truncated] 'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values', + 'USourceData.txt' => 'Documentation of status and cross reference of proposals for encoding by Unicode of Unihan characters', + 'USourceData.pdf' => 'Documentation of status and cross reference of proposals for encoding by Unicode of Unihan characters', 'auxiliary/WordBreakTest.html' => 'Documentation of validation tests', 'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests', 'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests', @@ -15336,7 +15338,7 @@ the left brace completely changes the meaning of the construct, from "match" (for C<\\p{}>) to "doesn't match" (for C<\\P{}>). Casing in this document is for improved legibility. -Also, white space, hyphens, and underscores are also normally ignored +Also, white space, hyphens, and underscores are normally ignored everywhere between the {braces}, and hence can be freely added or removed even if the C</x> modifier hasn't been specified on the regular expression. But $a_bold_stricter at the beginning of an entry in the table below diff --git a/pod/perlreguts.pod b/pod/perlreguts.pod index ec1c243..75dc6dd 100644 --- a/pod/perlreguts.pod +++ b/pod/perlreguts.pod @@ -182,9 +182,9 @@ POSIX char classes called C<regnode_charclass_class> which has an additional 4-byte (32-bit) bitmap indicating which POSIX char classes have been included. - regnode_charclass_class U32 arg1; - char bitmap[ANYOF_BITMAP_SIZE]; - char classflags[ANYOF_CLASSBITMAP_SIZE]; + regnode_charclass_class U32 arg1; + char bitmap[ANYOF_BITMAP_SIZE]; + char classflags[ANYOF_CLASSBITMAP_SIZE]; =back @@ -354,20 +354,23 @@ simpler form. 
The call graph looks like this: - reg() # parse a top level regex, or inside of parens - regbranch() # parse a single branch of an alternation - regpiece() # parse a pattern followed by a quantifier - regatom() # parse a simple pattern - regclass() # used to handle a class - reg() # used to handle a parenthesised subpattern - .... - ... - regtail() # finish off the branch - ... - regtail() # finish off the branch sequence. Tie each - # branch's tail to the tail of the sequence - # (NEW) In Debug mode this is - # regtail_study(). + reg() # parse a top level regex, or inside of + # parens + regbranch() # parse a single branch of an alternation + regpiece() # parse a pattern followed by a quantifier + regatom() # parse a simple pattern + regclass() # used to handle a class + reg() # used to handle a parenthesised + # subpattern + .... + ... + regtail() # finish off the branch + ... + regtail() # finish off the branch sequence. Tie each + # branch's tail to the tail of the + # sequence + # (NEW) In Debug mode this is + # regtail_study(). A grammar form might be something like this: @@ -489,11 +492,11 @@ Now for something much more complex: C</x(?:foo*|b[a][rR])(foo|bar)$/> atom >)$< 34 tail~ BRANCH (28) 36 tsdy~ BRANCH (END) (31) - ~ attach to CLOSE1 (34) offset to 3 + ~ attach to CLOSE1 (34) offset to 3 tsdy~ EXACT <foo> (EXACT) (29) - ~ attach to CLOSE1 (34) offset to 5 + ~ attach to CLOSE1 (34) offset to 5 tsdy~ EXACT <bar> (EXACT) (32) - ~ attach to CLOSE1 (34) offset to 2 + ~ attach to CLOSE1 (34) offset to 2 >$< tail~ BRANCH (3) ~ BRANCH (9) ~ TAIL (25) @@ -765,7 +768,7 @@ implement things such as the stringification of C<qr//>. 
The other structure is pointed to by the C<regexp> struct's C<pprivate> and is in addition to C<intflags> in the same struct considered to be the property of the regex engine which compiled the -regular expression; +regular expression; The regexp structure contains all the data that perl needs to be aware of to properly work with the regular expression. It includes data about @@ -792,19 +795,22 @@ The following structure is used as the C<pprivate> struct by perl's regex engine. Since it is specific to perl it is only of curiosity value to other engine implementations. - typedef struct regexp_internal { - regexp_paren_ofs *swap; /* Swap copy of *startp / *endp */ - U32 *offsets; /* offset annotations 20001228 MJD - data about mapping the program to the - string*/ - regnode *regstclass; /* Optional startclass as identified or constructed - by the optimiser */ - struct reg_data *data; /* Additional miscellaneous data used by the program. - Used to make it easier to clone and free arbitrary - data that the regops need. Often the ARG field of - a regop is an index into this structure */ - regnode program[1]; /* Unwarranted chumminess with compiler. */ - } regexp_internal; + typedef struct regexp_internal { + regexp_paren_ofs *swap; /* Swap copy of *startp / *endp */ + U32 *offsets; /* offset annotations 20001228 MJD + * data about mapping the program to + * the string*/ + regnode *regstclass; /* Optional startclass as identified or + * constructed by the optimiser */ + struct reg_data *data; /* Additional miscellaneous data used + * by the program. Used to make it + * easier to clone and free arbitrary + * data that the regops need. Often the + * ARG field of a regop is an index + * into this structure */ + regnode program[1]; /* Unwarranted chumminess with + * compiler. 
*/ + } regexp_internal; =over 5 diff --git a/t/porting/known_pod_issues.dat b/t/porting/known_pod_issues.dat index f90ab1f..6274a8d 100644 --- a/t/porting/known_pod_issues.dat +++ b/t/porting/known_pod_issues.dat @@ -267,7 +267,6 @@ pod/perlpodspec.pod Verbatim line length including indents exceeds 79 by 9 pod/perlpodstyle.pod Verbatim line length including indents exceeds 79 by 1 pod/perlrebackslash.pod Verbatim line length including indents exceeds 79 by 1 pod/perlref.pod Verbatim line length including indents exceeds 79 by 1 -pod/perlreguts.pod Verbatim line length including indents exceeds 79 by 17 pod/perlrequick.pod Verbatim line length including indents exceeds 79 by 3 pod/perlretut.pod Verbatim line length including indents exceeds 79 by 13 pod/perlrun.pod Verbatim line length including indents exceeds 79 by 2 -- Perl5 Master Repository
