In perl.git, the branch blead has been updated <https://perl5.git.perl.org/perl.git/commitdiff/3601832e0e3e0dc07d27e6e3da50ad346500f469?hp=2f58fd735e91ad2cc0f1cb43a17451fea41d9d57>
- Log ----------------------------------------------------------------- commit 3601832e0e3e0dc07d27e6e3da50ad346500f469 Author: Karl Williamson <k...@cpan.org> Date: Thu Dec 6 16:57:17 2018 -0700 regen/mk_invlists.pl: Add new table This table contains all the code points that are in any multi-character fold (not the folded-from character, but what that character folds to). It will be used in a future commit. commit 85b52c7a801df3fe16026c0c2ff86663e8c4132e Author: Karl Williamson <k...@cpan.org> Date: Thu Dec 6 16:53:23 2018 -0700 regen/mk_invlists.pl: Rmv no longer used array ----------------------------------------------------------------------- Summary of changes: charclass_invlists.h | 282 ++++++++++++++++++++++++++++++++++++++++++++++++++- embedvar.h | 2 + perlapi.h | 2 + perlvars.h | 1 + regcomp.c | 1 + regen/mk_invlists.pl | 20 +++- uni_keywords.h | 2 +- 7 files changed, 303 insertions(+), 7 deletions(-) diff --git a/charclass_invlists.h b/charclass_invlists.h index b55581177e..2ec681d31c 100644 --- a/charclass_invlists.h +++ b/charclass_invlists.h @@ -28984,6 +28984,286 @@ static const GCB_enum _Perl_GCB_invmap[] = { /* for EBCDIC 037 */ # if 'A' == 65 /* ASCII/Latin1 */ +static const UV _Perl_Is_In_Multi_Char_Fold_invlist[] = { /* for ASCII/Latin1 */ + 79, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x61, + 0x62, + 0x66, + 0x67, + 0x68, + 0x6B, + 0x6C, + 0x6D, + 0x6E, + 0x6F, + 0x73, + 0x75, + 0x77, + 0x78, + 0x79, + 0x7A, + 0x2BC, + 0x2BD, + 0x2BE, + 0x2BF, + 0x300, + 0x302, + 0x307, + 0x309, + 0x30A, + 0x30B, + 0x30C, + 0x30D, + 0x313, + 0x314, + 0x331, + 0x332, + 0x342, + 0x343, + 0x3AC, + 0x3AD, + 0x3AE, + 0x3AF, + 0x3B1, + 0x3B2, + 0x3B7, + 0x3B8, + 0x3B9, + 0x3BA, + 0x3C1, + 0x3C2, + 0x3C5, + 0x3C6, + 0x3C9, + 0x3CA, + 0x3CE, + 0x3CF, + 0x565, + 0x566, + 0x56B, + 0x56C, + 0x56D, + 0x56E, + 0x574, + 0x575, + 0x576, + 0x577, + 0x57E, + 
0x57F, + 0x582, + 0x583, + 0x1F00, + 0x1F08, + 0x1F20, + 0x1F28, + 0x1F60, + 0x1F68, + 0x1F70, + 0x1F71, + 0x1F74, + 0x1F75, + 0x1F7C, + 0x1F7D +}; + +# endif /* ASCII/Latin1 */ + +# if 'A' == 193 /* EBCDIC 1047 */ \ + && '\\' == 224 && '[' == 173 && ']' == 189 && '{' == 192 && '}' == 208 \ + && '^' == 95 && '~' == 161 && '!' == 90 && '#' == 123 && '|' == 79 \ + && '$' == 91 && '@' == 124 && '`' == 121 + +static const UV _Perl_Is_In_Multi_Char_Fold_invlist[] = { /* for EBCDIC 1047 */ + 81, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x81, + 0x82, + 0x86, + 0x87, + 0x88, + 0x8A, + 0x91, + 0x92, + 0x93, + 0x94, + 0x95, + 0x96, + 0xA2, + 0xA4, + 0xA6, + 0xA7, + 0xA8, + 0xA9, + 0x2BC, + 0x2BD, + 0x2BE, + 0x2BF, + 0x300, + 0x302, + 0x307, + 0x309, + 0x30A, + 0x30B, + 0x30C, + 0x30D, + 0x313, + 0x314, + 0x331, + 0x332, + 0x342, + 0x343, + 0x3AC, + 0x3AD, + 0x3AE, + 0x3AF, + 0x3B1, + 0x3B2, + 0x3B7, + 0x3B8, + 0x3B9, + 0x3BA, + 0x3C1, + 0x3C2, + 0x3C5, + 0x3C6, + 0x3C9, + 0x3CA, + 0x3CE, + 0x3CF, + 0x565, + 0x566, + 0x56B, + 0x56C, + 0x56D, + 0x56E, + 0x574, + 0x575, + 0x576, + 0x577, + 0x57E, + 0x57F, + 0x582, + 0x583, + 0x1F00, + 0x1F08, + 0x1F20, + 0x1F28, + 0x1F60, + 0x1F68, + 0x1F70, + 0x1F71, + 0x1F74, + 0x1F75, + 0x1F7C, + 0x1F7D +}; + +# endif /* EBCDIC 1047 */ + +# if 'A' == 193 /* EBCDIC 037 */ \ + && '\\' == 224 && '[' == 186 && ']' == 187 && '{' == 192 && '}' == 208 \ + && '^' == 176 && '~' == 161 && '!' 
== 90 && '#' == 123 && '|' == 79 \ + && '$' == 91 && '@' == 124 && '`' == 121 + +static const UV _Perl_Is_In_Multi_Char_Fold_invlist[] = { /* for EBCDIC 037 */ + 81, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x81, + 0x82, + 0x86, + 0x87, + 0x88, + 0x8A, + 0x91, + 0x92, + 0x93, + 0x94, + 0x95, + 0x96, + 0xA2, + 0xA4, + 0xA6, + 0xA7, + 0xA8, + 0xA9, + 0x2BC, + 0x2BD, + 0x2BE, + 0x2BF, + 0x300, + 0x302, + 0x307, + 0x309, + 0x30A, + 0x30B, + 0x30C, + 0x30D, + 0x313, + 0x314, + 0x331, + 0x332, + 0x342, + 0x343, + 0x3AC, + 0x3AD, + 0x3AE, + 0x3AF, + 0x3B1, + 0x3B2, + 0x3B7, + 0x3B8, + 0x3B9, + 0x3BA, + 0x3C1, + 0x3C2, + 0x3C5, + 0x3C6, + 0x3C9, + 0x3CA, + 0x3CE, + 0x3CF, + 0x565, + 0x566, + 0x56B, + 0x56C, + 0x56D, + 0x56E, + 0x574, + 0x575, + 0x576, + 0x577, + 0x57E, + 0x57F, + 0x582, + 0x583, + 0x1F00, + 0x1F08, + 0x1F20, + 0x1F28, + 0x1F60, + 0x1F68, + 0x1F70, + 0x1F71, + 0x1F74, + 0x1F75, + 0x1F7C, + 0x1F7D +}; + +# endif /* EBCDIC 037 */ + +# if 'A' == 65 /* ASCII/Latin1 */ + static const UV _Perl_IVCF_invlist[] = { /* for ASCII/Latin1 */ 1297, /* Number of elements */ 148565664, /* Version and data structure type */ @@ -383428,5 +383708,5 @@ static const U8 WB_table[23][23] = { * 7bd6bcbe3813e0cd55e0998053d182b7bc8c97dcfd0b85028e9f7f55af4ad61b lib/unicore/version * 4bb677187a1a64e39d48f2e341b5ecb6c99857e49d7a79cf503bd8a3c709999b regen/charset_translations.pl * 03e51b0f07beebd5da62ab943899aa4934eee1f792fa27c1fb638c33bf4ac6ea regen/mk_PL_charclass.pl - * 743fbd71a854b7898795d351668ad5059d4f07dcfa870904618c97e6b4809e93 regen/mk_invlists.pl + * 35eecb67dfc9b89a150036e4dcd76de5d46f20d6ddd6976188e1df94a4055b7b regen/mk_invlists.pl * ex: set ro: */ diff --git a/embedvar.h b/embedvar.h index 5bd4a4ea9e..8743da7778 100644 --- a/embedvar.h +++ b/embedvar.h @@ -359,6 +359,8 @@ #define PL_GHasMultiCharFold (my_vars->GHasMultiCharFold) #define PL_InBitmap 
(my_vars->GInBitmap) #define PL_GInBitmap (my_vars->GInBitmap) +#define PL_InMultiCharFold (my_vars->GInMultiCharFold) +#define PL_GInMultiCharFold (my_vars->GInMultiCharFold) #define PL_LB_invlist (my_vars->GLB_invlist) #define PL_GLB_invlist (my_vars->GLB_invlist) #define PL_Latin1 (my_vars->GLatin1) diff --git a/perlapi.h b/perlapi.h index af5b042b72..bd1d4348d0 100644 --- a/perlapi.h +++ b/perlapi.h @@ -111,6 +111,8 @@ END_EXTERN_C #define PL_HasMultiCharFold (*Perl_GHasMultiCharFold_ptr(NULL)) #undef PL_InBitmap #define PL_InBitmap (*Perl_GInBitmap_ptr(NULL)) +#undef PL_InMultiCharFold +#define PL_InMultiCharFold (*Perl_GInMultiCharFold_ptr(NULL)) #undef PL_LB_invlist #define PL_LB_invlist (*Perl_GLB_invlist_ptr(NULL)) #undef PL_Latin1 diff --git a/perlvars.h b/perlvars.h index 82bce27886..4f0b6c07d8 100644 --- a/perlvars.h +++ b/perlvars.h @@ -276,6 +276,7 @@ PERLVAR(G, AboveLatin1, SV *) PERLVAR(G, Assigned_invlist, SV *) PERLVAR(G, GCB_invlist, SV *) PERLVAR(G, HasMultiCharFold, SV *) +PERLVAR(G, InMultiCharFold, SV *) PERLVAR(G, Latin1, SV *) PERLVAR(G, LB_invlist, SV *) PERLVAR(G, NonL1NonFinalFold, SV *) diff --git a/regcomp.c b/regcomp.c index f4d7af2926..a0cd4d4db3 100644 --- a/regcomp.c +++ b/regcomp.c @@ -21427,6 +21427,7 @@ Perl_init_uniprops(pTHX) PL_utf8_foldable = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]); PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[ UNI__PERL_FOLDS_TO_MULTI_CHAR]); + PL_InMultiCharFold = _new_invlist_C_array(_Perl_Is_In_Multi_Char_Fold_invlist); PL_NonL1NonFinalFold = _new_invlist_C_array( NonL1_Perl_Non_Final_Folds_invlist); diff --git a/regen/mk_invlists.pl b/regen/mk_invlists.pl index 5219051259..980b90cafe 100644 --- a/regen/mk_invlists.pl +++ b/regen/mk_invlists.pl @@ -887,19 +887,28 @@ die "Could not find inversion map for Case_Folding" unless defined $format; die "Incorrect format '$format' for Case_Folding inversion map" unless $format eq 'al' || $format eq 'a'; -my @has_multi_char_fold; +my 
@is_in_multi_char_fold; my @is_non_final_fold; for my $i (0 .. @$folds_ref - 1) { next unless ref $folds_ref->[$i]; # Skip single-char folds - push @has_multi_char_fold, $cp_ref->[$i]; - # Add to the non-finals list each code point that is in a non-final - # position - for my $j (0 .. @{$folds_ref->[$i]} - 2) { + # Add to the is_in_multis list each code point that is in a + # multi-character fold, and to the non-finals list each code point that is + # in a non-final position + for my $j (0 .. @{$folds_ref->[$i]} - 1) { + push @is_in_multi_char_fold, $folds_ref->[$i][$j]; + last if $j == @{$folds_ref->[$i]} - 1; push @is_non_final_fold, $folds_ref->[$i][$j]; } @is_non_final_fold = uniques @is_non_final_fold; + @is_in_multi_char_fold = uniques @is_in_multi_char_fold; +} + +sub _Perl_Is_In_Multi_Char_Fold { + @is_in_multi_char_fold = sort { $a <=> $b } @is_in_multi_char_fold; + my @return = mk_invlist_from_sorted_cp_list(\@is_in_multi_char_fold); + return \@return; } sub _Perl_Non_Final_Folds { @@ -2340,6 +2349,7 @@ no warnings 'qw'; my @props; push @props, sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw( &NonL1_Perl_Non_Final_Folds + &_Perl_Is_In_Multi_Char_Fold &UpperLatin1 _Perl_GCB,EDGE,E_Base,E_Base_GAZ,E_Modifier,Glue_After_Zwj,LV,Prepend,Regional_Indicator,SpacingMark,ZWJ,XPG_XX _Perl_LB,EDGE,Close_Parenthesis,Hebrew_Letter,Next_Line,Regional_Indicator,ZWJ,Contingent_Break,E_Base,E_Modifier,H2,H3,JL,JT,JV,Word_Joiner diff --git a/uni_keywords.h b/uni_keywords.h index aa7b8f669b..54ae6ade92 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -6994,6 +6994,6 @@ MPH_VALt match_uniprop( const unsigned char * const key, const U16 key_len ) { * 7bd6bcbe3813e0cd55e0998053d182b7bc8c97dcfd0b85028e9f7f55af4ad61b lib/unicore/version * 4bb677187a1a64e39d48f2e341b5ecb6c99857e49d7a79cf503bd8a3c709999b regen/charset_translations.pl * 03e51b0f07beebd5da62ab943899aa4934eee1f792fa27c1fb638c33bf4ac6ea regen/mk_PL_charclass.pl - * 
743fbd71a854b7898795d351668ad5059d4f07dcfa870904618c97e6b4809e93 regen/mk_invlists.pl + * 35eecb67dfc9b89a150036e4dcd76de5d46f20d6ddd6976188e1df94a4055b7b regen/mk_invlists.pl * c42c035b18a0426443184e9f889aa2b16bef5a9add9805cd853c4e2a783712ff regen/mph.pl * ex: set ro: */ -- Perl5 Master Repository