In perl.git, the branch blead has been updated <https://perl5.git.perl.org/perl.git/commitdiff/21baa9a2e2a9246add77c4670eeee6383cc1ccb9?hp=3edba68397e487b39cca6e7fc0b75ab4a2f6a341>
- Log ----------------------------------------------------------------- commit 21baa9a2e2a9246add77c4670eeee6383cc1ccb9 Author: Karl Williamson <[email protected]> Date: Thu Nov 30 13:47:30 2017 -0700 perluniprops: Display controls sorted by alpha The complete set of C0 controls is listed by standard abbreviation, but it is better to display them alphabetically, and not in ASCII-platform code point order. commit b9ed226ea83c3e6793242859ed66a48c851a6872 Author: Karl Williamson <[email protected]> Date: Thu Nov 30 13:31:52 2017 -0700 mktables: Add safety code This isn't currently necessary to add, but I discovered this deficiency during debugging, and it could come up in some later change. This code only writes one file when two tables match identically. But it could happen that we've got the pointers to the two tables intertwined so that they each think the other one is the one getting written out, so neither of them is. This checks for that. commit 0ab6e880393a97aa9a4748ff7169e2fe3c6cd82e Author: Karl Williamson <[email protected]> Date: Thu Nov 30 13:18:18 2017 -0700 perluniprops: Make sc property refer to scx The scx is an improved version of the sc(ript) property. This changes mktables to generate perluniprops so that the entries for sc tables refer to the equivalent scx ones. commit f3651218fa1e8bce8ea0dd4c0ffbb63cd82aea22 Author: Karl Williamson <[email protected]> Date: Thu Nov 30 12:59:39 2017 -0700 perluniprops/mktables: Fix bad entry I spotted this entry in perluniprops recently: \p{Nko} \p{Script_Extensions=Nko} (NOT \p{NKo}) It's saying Nko is not NKo. But case isn't supposed to matter. It turned out that the bug was doing an eq without first canonicalizing the names to account for case differences. I was expecting there to be more entries that were erroneous, but it was just this one. 
commit 7fdcbfeb6ff2c7b15dffdedc4218e4a0803b28b5 Author: Karl Williamson <[email protected]> Date: Thu Nov 30 12:51:43 2017 -0700 mktables: Comment fixes only commit 8fb542fb61a07463a6c7d43ea17e35ecb0d407f6 Author: Karl Williamson <[email protected]> Date: Thu Nov 30 12:38:34 2017 -0700 mktables: Use global for Script_Extensions object This is used in several places, so make its scope global to the program. commit c4880455a025944bb5d97b712f6eafd21810a711 Author: Karl Williamson <[email protected]> Date: Wed Nov 29 20:42:25 2017 -0700 perluniprops: \p{Greek} is a shortcut for scx:greek Since 5.26.0, this (generated) pod has been wrong. The single-form Perl shortcuts for script names now use the Script_Extensions property instead of the (inferior) plain Script property. ----------------------------------------------------------------------- Summary of changes: charclass_invlists.h | 2 +- lib/unicore/mktables | 94 +++++++++++++++++++++++++++++++++++++--------------- regcharclass.h | 2 +- 3 files changed, 69 insertions(+), 29 deletions(-) diff --git a/charclass_invlists.h b/charclass_invlists.h index 841820512c..b31e91cefd 100644 --- a/charclass_invlists.h +++ b/charclass_invlists.h @@ -97454,7 +97454,7 @@ static const U8 WB_table[24][24] = { * be0f129691d479aa38646e4ca0ec1ee576ae7f75b0300a5624a7fa862fa8abba lib/unicore/extracted/DLineBreak.txt * 92449d354d9f6b6f2f97a292ebb59f6344ffdeb83d120d7d23e569c43ba67cd5 lib/unicore/extracted/DNumType.txt * e3a319527153b0c6c0c549b40fc6f3a01a7a0dcd6620784391db25901df3b154 lib/unicore/extracted/DNumValues.txt - * 7e82d9210fb1c8ffadda5a3a04912fc34a165bfe98ac80c1669c1e67c3de044a lib/unicore/mktables + * d690e26d30064cf6ecf46f003b690bad4668750cbbaccb77175aa9b237a1b3da lib/unicore/mktables * 21653d2744fdd071f9ef138c805393901bb9547cf3e777ebf50215a191f986ea lib/unicore/version * 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c regen/charset_translations.pl * 48418cbf454eb9ef35c73468ed5ef72ad8603490eabe74181ce4fae42ec72579 
regen/mk_invlists.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 8a7be25759..032701b663 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -135,7 +135,7 @@ my $map_directory = 'To'; # Where map files go. # each one of the tens of thousands individually. # # In a match table, the value of a range is irrelevant (and hence the type as -# well, which will always be 0), and arbitrarily set to the null string. +# well, which will always be 0), and arbitrarily set to the empty string. # Using the example above, there would be two match tables for those two # entries, one named Upper would contain the 0x41..0x5A range, and the other # named Lower would contain 0x61..0x7A. @@ -1143,8 +1143,9 @@ my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1; # We work with above-Unicode code points, up to IV_MAX, but we may want to use # sentinels above that number. Therefore for internal use, we use a much # smaller number, translating it to IV_MAX only for output. The exact number -# is immaterial (all Unicode code points are treated exactly the same), but -# the algorithm requires it to be at least 2 * $MAX_UNICODE_CODEPOINTS + 1; +# is immaterial (all above-Unicode code points are treated exactly the same), +# but the algorithm requires it to be at least +# 2 * $MAX_UNICODE_CODEPOINTS + 1 my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8; my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1; my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT); @@ -1453,6 +1454,7 @@ my $Assigned; # All assigned characters in this Unicode release my $DI; # Default_Ignorable_Code_Point property my $NChar; # Noncharacter_Code_Point property my $script; +my $scx; # Script_Extensions property # Are there conflicting names because of beginning with 'In_', or 'Is_' my $has_In_conflicts = 0; @@ -8066,7 +8068,7 @@ sub trace { return main::trace(@_); } # disambiguate with). 
if (defined $conflicting_object) { foreach my $alias ($self->aliases) { - if ($alias->name eq $conflicting_name) { + if (standardize($alias->name) eq standardize($conflicting_name)) { # Here, there is an exact match. This results in # ambiguous comments, so disambiguate by changing the @@ -8159,7 +8161,19 @@ sub trace { return main::trace(@_); } # add_alias() # instead for same # property - && ! $other->perl_extension) + && ! $other->perl_extension + + # We allow the sc and scx properties to be marked as + # related. They are in fact related, and this allows + # the pod to show that better. This test isn't valid + # if this is an early Unicode release without the scx + # property (having that also implies the sc property + # exists, so don't have to test for no 'sc') + && ( ! defined $scx + && ! ( ( $self->property == $script + || $self->property == $scx) + && ( $self->property == $script + || $self->property == $scx)))) { Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other"); $related = 0; @@ -13160,7 +13174,7 @@ sub setup_script_extensions { # The Script_Extensions property starts out with a clone of the Script # property. - my $scx = property_ref("Script_Extensions"); + $scx = property_ref("Script_Extensions"); $scx = Property->new("scx", Full_Name => "Script_Extensions") if ! defined $scx; $scx->_set_format($STRING_WHITE_SPACE_LIST); @@ -13755,7 +13769,6 @@ END # data is retained in the map table for reference, but the spurious match # tables are deleted. - my $scx = property_ref("Script_Extensions"); if (defined $scx) { foreach my $table ($scx->tables) { next unless $table->name =~ /\s/; # All the new and only the new @@ -13768,6 +13781,22 @@ END } $scx->delete_match_table($table); } + + # Mark the scx table as the parent of the corresponding sc table for + # those which are identical. This causes the pod for the script table + # to refer to the corresponding scx one. 
+ # + # This has to be in a separate loop from above, so as to wait until + # the tables are stabilized before checking for equivalency. + if (defined $pod_directory) { + my $sc = property_ref("Script"); + foreach my $table ($scx->tables) { + my $plain_sc_equiv = $sc->table($table->name); + if ($table->matches_identically_to($plain_sc_equiv)) { + $plain_sc_equiv->set_equivalent_to($table, Related => 1); + } + } + } } return; @@ -14582,7 +14611,12 @@ sub compile_perl() { Description => 'Control characters'); $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1); $perl->add_match_table("PosixCntrl", - Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL", + Description => "ASCII control characters " + . "ACK, BEL, BS, CAN, CR, DC1, DC2," + . " DC3, DC4, DEL, DLE, ENQ, EOM," + . " EOT, ESC, ETB, ETX, FF, FS, GS," + . " HT, LF, NAK, NUL, RS, SI, SO," + . " SOH, STX, SUB, SYN, US, VT", Initialize => $Cntrl & $ASCII, ); @@ -15389,7 +15423,6 @@ sub add_perl_synonyms() { # If the version of Unicode includes the Script Extensions (preferably), # or Script property, add its tables - my $scx = property_ref("Script_Extensions"); if (defined $scx) { push @tables, $scx->tables; } @@ -16929,14 +16962,16 @@ constructs, both single and compound forms. B<Compound forms> consist of two components, separated by an equals sign or a colon. The first component is the property name, and the second component is the particular value of the property to match against, for example, -C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters -whose Script property value is Greek. +C<\\p{Script_Extensions: Greek}> and C<\\p{Script_Extensions=Greek}> both mean +to match characters whose Script_Extensions property value is Greek. +(C<Script_Extensions> is an improved version of the C<Script> property.) 
B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for their equivalent compound forms. The table shows these equivalences. (In our -example, C<\\p{Greek}> is a just a shortcut for C<\\p{Script=Greek}>.) -There are also a few Perl-defined single forms that are not shortcuts for a -compound form. One such is C<\\p{Word}>. These are also listed in the table. +example, C<\\p{Greek}> is a just a shortcut for +C<\\p{Script_Extensions=Greek}>). There are also a few Perl-defined single +forms that are not shortcuts for a compound form. One such is C<\\p{Word}>. +These are also listed in the table. In parsing these constructs, Perl always ignores Upper/lower case differences everywhere within the {braces}. Thus C<\\p{Greek}> means the same thing as @@ -18120,18 +18155,19 @@ sub write_all_tables() { make_re_pod_entries($table) if defined $pod_directory; # See if the table matches identical code points with - # something that has already been output. In that case, - # no need to have two files with the same code points in - # them. We use the table's hash() method to store these - # in buckets, so that it is quite likely that if two - # tables are in the same bucket they will be identical, so - # don't have to compare tables frequently. The tables - # have to have the same status to share a file, so add - # this to the bucket hash. (The reason for this latter is - # that Heavy.pl associates a status with a file.) - # We don't check tables that are inverses of others, as it - # would lead to some coding complications, and checking - # all the regular ones should find everything. + # something that has already been processed and is ready + # for output. In that case, no need to have two files + # with the same code points in them. We use the table's + # hash() method to store these in buckets, so that it is + # quite likely that if two tables are in the same bucket + # they will be identical, so don't have to compare tables + # frequently. 
The tables have to have the same status to + # share a file, so add this to the bucket hash. (The + # reason for this latter is that Heavy.pl associates a + # status with a file.) We don't check tables that are + # inverses of others, as it would lead to some coding + # complications, and checking all the regular ones should + # find everything. if ($table->complement == 0) { my $hash = $table->hash . ';' . $table->status; @@ -18140,7 +18176,11 @@ sub write_all_tables() { foreach my $comparison (@{$match_tables_to_write{$hash}}) { - if ($table->matches_identically_to($comparison)) { + # If the table doesn't point back to this one, we + # see if it matches identically + if ( $comparison->leader != $table + && $table->matches_identically_to($comparison)) + { $table->set_equivalent_to($comparison, Related => 0); next TABLE; diff --git a/regcharclass.h b/regcharclass.h index 9ca56f3306..ea9c5d6beb 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -1898,7 +1898,7 @@ * be0f129691d479aa38646e4ca0ec1ee576ae7f75b0300a5624a7fa862fa8abba lib/unicore/extracted/DLineBreak.txt * 92449d354d9f6b6f2f97a292ebb59f6344ffdeb83d120d7d23e569c43ba67cd5 lib/unicore/extracted/DNumType.txt * e3a319527153b0c6c0c549b40fc6f3a01a7a0dcd6620784391db25901df3b154 lib/unicore/extracted/DNumValues.txt - * 7e82d9210fb1c8ffadda5a3a04912fc34a165bfe98ac80c1669c1e67c3de044a lib/unicore/mktables + * d690e26d30064cf6ecf46f003b690bad4668750cbbaccb77175aa9b237a1b3da lib/unicore/mktables * 21653d2744fdd071f9ef138c805393901bb9547cf3e777ebf50215a191f986ea lib/unicore/version * 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c regen/charset_translations.pl * 9ea6338945a7d70e5ea4b31ac7856c0b521df96be002e94b4b3b7d31debbf3ab regen/regcharclass.pl -- Perl5 Master Repository
