In perl.git, the branch blead has been updated <https://perl5.git.perl.org/perl.git/commitdiff/b2dbdb71e39b92287c295d34124bc97a9229bce6?hp=d339f0618ad0f4e3f2deaf62c06e762f885c8ab4>
- Log ----------------------------------------------------------------- commit b2dbdb71e39b92287c295d34124bc97a9229bce6 Author: Karl Williamson <k...@cpan.org> Date: Tue Dec 18 21:14:45 2018 -0700 regexec.c: Swap parameters to some EQ functions This is in preparation for a future commit where the 2nd parameter will be folded and can use the EQ function calls that allow for that. commit b440891300f1367ed87e991de808a772043db4d8 Author: Karl Williamson <k...@cpan.org> Date: Tue Dec 18 21:08:57 2018 -0700 Generalize foldEQ_utf8_flags() Prior to this commit, the second string parameter had to be UTF-8 encoded on input if it was pre-folded. This commit removes that restriction, but leaves it in place for the first string parameter. Should we ever have both parameters possibly pre-folded, we would make the same change to the first parameter, but now, the parameters can just be swapped if necessary to meet this restriction, with no loss of generality. This saves a few instructions. commit cfd2398321b12901ccf268fac831f8dbc085b44d Author: Karl Williamson <k...@cpan.org> Date: Sun Dec 23 13:05:33 2018 -0700 Fix bug in foldEQ_utf8_flags() We need to pass the flag that says to not allow non-ASCII characters to fold to ASCII on to the code that actually does it. There are apparently no current errors that arise from this bug, but a future commit would otherwise expose this problem. commit 68a23e40ba252d7a69cc8dab05663094b1807109 Author: Karl Williamson <k...@cpan.org> Date: Tue Dec 18 21:05:27 2018 -0700 utf8.c: White-space only Vertically align some conditionals, and add some space for visibility commit a308c7e049aaaf6b8a593459292704ecb29655ac Author: Karl Williamson <k...@cpan.org> Date: Fri Dec 21 21:10:26 2018 -0700 regcomp.c, regexec.c: Rename some related variables The new names are shorter and more meaningful. 
commit d8711abbb076525d5282ec4e1d64fb24f4540072 Author: Karl Williamson <k...@cpan.org> Date: Fri Dec 21 12:55:08 2018 -0700 regcomp.c: Shorten variable name Change 'has_upper_latin1_only_utf8_matches' to 'upper_latin1_only_utf8_matches', as the initial 'has_' is unnecessary and somewhat misleading in spots. commit 712f802a05d71853a2308fec7d80b5a28885e97d Author: Karl Williamson <k...@cpan.org> Date: Wed Dec 19 11:00:10 2018 -0700 Change name of PL_NonL1NonFinalFold The inversion list this refers to now includes the Latin 1 range, so the name was misleading. commit 441f4830ce677a4eb1d149826527fd2eadfd2bda Author: Karl Williamson <k...@cpan.org> Date: Sat Dec 22 22:02:27 2018 -0700 Move 2 property defns to mktables These 2 Unicode-like property definitions used internally by the regular expression compiler are moved by this commit from regen/mk_invlists.pl to lib/unicore/mktables. By placing all these in the same place, maintainers only have to learn one bit of code, instead of two. commit 7e9b4fe4d85e9b669993bf96a7e33ffff3197e20 Author: Karl Williamson <k...@cpan.org> Date: Sat Dec 22 21:32:55 2018 -0700 regen/mk_invlists.pl: Fix bug when 2 ident tables If two tables are identical, the code created a #define of one index of a pointer array to be the other index. But in some cases, that's not sufficient, and the actual pointer must be defined in terms of the other. This showed up in compiling perl with an early Unicode version, but the circumstances could arise again in a future version. commit 8deb65d1e9fb6398838859f9fba258ddc14f34b6 Author: Karl Williamson <k...@cpan.org> Date: Thu Dec 20 11:01:54 2018 -0700 regcomp.c: Avoid a NULL dereference This refactors the code so that it doesn't refer to an object before it makes sure it exists and isn't empty. This hasn't been a problem in the past, but a future commit will call this subroutine with parameters that expose this bug. 
commit ac7b6cfc962deceb2adc1c179141d8a92384e9aa Author: Karl Williamson <k...@cpan.org> Date: Wed Dec 19 10:13:27 2018 -0700 Change name of PL_utf8_foldable variable This variable's name was out-of-date and misleading. It is the name of an inversion list that contains all the code points in the current version of Unicode that participate in any way in a /i type of fold. commit 20fda8317523a13dd63cdafcc2a2209f13d564f8 Author: Karl Williamson <k...@cpan.org> Date: Fri Dec 21 08:59:04 2018 -0700 regcomp.c: qr/[\xFF]/di doesn't have runtime dependencies Prior to this commit, a class containing U+FF, LATIN SMALL LETTER Y WITH DIAERESIS, generated an ANYOFD regnode because it thought that what matched depended on the UTF-8ness of the target string. But it doesn't. No bugs were introduced because when ANYOFD is encountered the code looks at some flags to determine what sorts of dependencies to further look for, and the flags remained clear. But ANYOFD is less desirable than plain ANYOF, because it adds extra branches to execute. Tests for this fix will be added in a future commit. commit 70efdf699a1b0b6fa493d73f244c7c25a45db011 Author: Karl Williamson <k...@cpan.org> Date: Sat Dec 22 21:21:57 2018 -0700 regcomp.c: Add #ifdef If perl is compiled on an early enough Unicode release, these won't be defined, but since they're only used in deprecated functions, we can just #ifdef them away. commit d421ebf05e17c18e735107b7059d5fe88878998f Author: Karl Williamson <k...@cpan.org> Date: Wed Dec 19 12:45:47 2018 -0700 regcomp.c: Fix comment This comment was out-of-date commit 542d7e4abd75683a540c4f4b2680400a659fff68 Author: Karl Williamson <k...@cpan.org> Date: Sat Dec 22 12:14:38 2018 -0700 re/anyof.t: Extract code into a function This is in preparation for a future commit where it will be used in more than one place. 
commit 87cafc451fd288662c07600d10523c676c8b56ef Author: Karl Williamson <k...@cpan.org> Date: Sat Dec 22 11:00:54 2018 -0700 t/re/anyof.t: Add capability to utf8::upgrade() ANYOF nodes can generate different things depending on the UTF-8ness of the pattern. This adds the capability of conveniently specifying in a test that the pattern should be upgraded commit 671d696700da249f5b6eb793178ca4a47178eddf Author: Karl Williamson <k...@cpan.org> Date: Fri Dec 21 22:06:15 2018 -0700 t/re/anyof.t: Add 'strict', 'warnings' pragmas commit 743dd5b8b2dec98efbdefa8d388609f5a8373d19 Author: Karl Williamson <k...@cpan.org> Date: Sat Dec 22 13:55:15 2018 -0700 regexec.c: Make sure variable is initialized I don't think this is required, but I think some compiler complained about it (but this commit is being made too long after I changed the line for me to remember for sure). commit d5d8574042ac6aa91f53a0b601f44b560a0ff540 Author: Karl Williamson <k...@cpan.org> Date: Sat Dec 22 13:50:26 2018 -0700 regexec.c: White space only, comment only This commit removes some obsolete comments ----------------------------------------------------------------------- Summary of changes: charclass_invlists.h | 939 ++-- embedvar.h | 8 +- lib/unicore/mktables | 19 +- perlapi.h | 8 +- perlvars.h | 7 +- regcharclass.h | 2 +- regcomp.c | 154 +- regen/mk_invlists.pl | 44 +- regexec.c | 64 +- t/re/anyof.t | 96 +- uni_keywords.h | 13678 +++++++++++++++++++++++++------------------------ utf8.c | 43 +- 12 files changed, 7559 insertions(+), 7503 deletions(-) diff --git a/charclass_invlists.h b/charclass_invlists.h index 2ec681d31c..fb7e26781d 100644 --- a/charclass_invlists.h +++ b/charclass_invlists.h @@ -18045,180 +18045,6 @@ static const int Lowercase_Mapping_invmap[] = { /* for EBCDIC 037 */ # if 'A' == 65 /* ASCII/Latin1 */ -static const UV NonL1_Perl_Non_Final_Folds_invlist[] = { /* for ASCII/Latin1 */ - 45, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the 
list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0x2BC, - 0x2BD, - 0x308, - 0x309, - 0x313, - 0x314, - 0x342, - 0x343, - 0x3AC, - 0x3AD, - 0x3AE, - 0x3AF, - 0x3B1, - 0x3B2, - 0x3B7, - 0x3B8, - 0x3B9, - 0x3BA, - 0x3C1, - 0x3C2, - 0x3C5, - 0x3C6, - 0x3C9, - 0x3CA, - 0x3CE, - 0x3CF, - 0x565, - 0x566, - 0x574, - 0x575, - 0x57E, - 0x57F, - 0x1F00, - 0x1F08, - 0x1F20, - 0x1F28, - 0x1F60, - 0x1F68, - 0x1F70, - 0x1F71, - 0x1F74, - 0x1F75, - 0x1F7C, - 0x1F7D -}; - -# endif /* ASCII/Latin1 */ - -# if 'A' == 193 /* EBCDIC 1047 */ \ - && '\\' == 224 && '[' == 173 && ']' == 189 && '{' == 192 && '}' == 208 \ - && '^' == 95 && '~' == 161 && '!' == 90 && '#' == 123 && '|' == 79 \ - && '$' == 91 && '@' == 124 && '`' == 121 - -static const UV NonL1_Perl_Non_Final_Folds_invlist[] = { /* for EBCDIC 1047 */ - 45, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0x2BC, - 0x2BD, - 0x308, - 0x309, - 0x313, - 0x314, - 0x342, - 0x343, - 0x3AC, - 0x3AD, - 0x3AE, - 0x3AF, - 0x3B1, - 0x3B2, - 0x3B7, - 0x3B8, - 0x3B9, - 0x3BA, - 0x3C1, - 0x3C2, - 0x3C5, - 0x3C6, - 0x3C9, - 0x3CA, - 0x3CE, - 0x3CF, - 0x565, - 0x566, - 0x574, - 0x575, - 0x57E, - 0x57F, - 0x1F00, - 0x1F08, - 0x1F20, - 0x1F28, - 0x1F60, - 0x1F68, - 0x1F70, - 0x1F71, - 0x1F74, - 0x1F75, - 0x1F7C, - 0x1F7D -}; - -# endif /* EBCDIC 1047 */ - -# if 'A' == 193 /* EBCDIC 037 */ \ - && '\\' == 224 && '[' == 186 && ']' == 187 && '{' == 192 && '}' == 208 \ - && '^' == 176 && '~' == 161 && '!' 
== 90 && '#' == 123 && '|' == 79 \ - && '$' == 91 && '@' == 124 && '`' == 121 - -static const UV NonL1_Perl_Non_Final_Folds_invlist[] = { /* for EBCDIC 037 */ - 45, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0x2BC, - 0x2BD, - 0x308, - 0x309, - 0x313, - 0x314, - 0x342, - 0x343, - 0x3AC, - 0x3AD, - 0x3AE, - 0x3AF, - 0x3B1, - 0x3B2, - 0x3B7, - 0x3B8, - 0x3B9, - 0x3BA, - 0x3C1, - 0x3C2, - 0x3C5, - 0x3C6, - 0x3C9, - 0x3CA, - 0x3CE, - 0x3CF, - 0x565, - 0x566, - 0x574, - 0x575, - 0x57E, - 0x57F, - 0x1F00, - 0x1F08, - 0x1F20, - 0x1F28, - 0x1F60, - 0x1F68, - 0x1F70, - 0x1F71, - 0x1F74, - 0x1F75, - 0x1F7C, - 0x1F7D -}; - -# endif /* EBCDIC 037 */ - -# if 'A' == 65 /* ASCII/Latin1 */ - static const UV _Perl_GCB_invlist[] = { /* for ASCII/Latin1 */ 1767, /* Number of elements */ 148565664, /* Version and data structure type */ @@ -28984,286 +28810,6 @@ static const GCB_enum _Perl_GCB_invmap[] = { /* for EBCDIC 037 */ # if 'A' == 65 /* ASCII/Latin1 */ -static const UV _Perl_Is_In_Multi_Char_Fold_invlist[] = { /* for ASCII/Latin1 */ - 79, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0x61, - 0x62, - 0x66, - 0x67, - 0x68, - 0x6B, - 0x6C, - 0x6D, - 0x6E, - 0x6F, - 0x73, - 0x75, - 0x77, - 0x78, - 0x79, - 0x7A, - 0x2BC, - 0x2BD, - 0x2BE, - 0x2BF, - 0x300, - 0x302, - 0x307, - 0x309, - 0x30A, - 0x30B, - 0x30C, - 0x30D, - 0x313, - 0x314, - 0x331, - 0x332, - 0x342, - 0x343, - 0x3AC, - 0x3AD, - 0x3AE, - 0x3AF, - 0x3B1, - 0x3B2, - 0x3B7, - 0x3B8, - 0x3B9, - 0x3BA, - 0x3C1, - 0x3C2, - 0x3C5, - 0x3C6, - 0x3C9, - 0x3CA, - 0x3CE, - 0x3CF, - 0x565, - 0x566, - 0x56B, - 0x56C, - 0x56D, - 0x56E, - 0x574, - 0x575, - 0x576, - 0x577, - 0x57E, - 0x57F, - 0x582, - 0x583, - 0x1F00, - 0x1F08, - 0x1F20, - 0x1F28, - 0x1F60, - 0x1F68, - 0x1F70, - 0x1F71, - 0x1F74, - 
0x1F75, - 0x1F7C, - 0x1F7D -}; - -# endif /* ASCII/Latin1 */ - -# if 'A' == 193 /* EBCDIC 1047 */ \ - && '\\' == 224 && '[' == 173 && ']' == 189 && '{' == 192 && '}' == 208 \ - && '^' == 95 && '~' == 161 && '!' == 90 && '#' == 123 && '|' == 79 \ - && '$' == 91 && '@' == 124 && '`' == 121 - -static const UV _Perl_Is_In_Multi_Char_Fold_invlist[] = { /* for EBCDIC 1047 */ - 81, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0x81, - 0x82, - 0x86, - 0x87, - 0x88, - 0x8A, - 0x91, - 0x92, - 0x93, - 0x94, - 0x95, - 0x96, - 0xA2, - 0xA4, - 0xA6, - 0xA7, - 0xA8, - 0xA9, - 0x2BC, - 0x2BD, - 0x2BE, - 0x2BF, - 0x300, - 0x302, - 0x307, - 0x309, - 0x30A, - 0x30B, - 0x30C, - 0x30D, - 0x313, - 0x314, - 0x331, - 0x332, - 0x342, - 0x343, - 0x3AC, - 0x3AD, - 0x3AE, - 0x3AF, - 0x3B1, - 0x3B2, - 0x3B7, - 0x3B8, - 0x3B9, - 0x3BA, - 0x3C1, - 0x3C2, - 0x3C5, - 0x3C6, - 0x3C9, - 0x3CA, - 0x3CE, - 0x3CF, - 0x565, - 0x566, - 0x56B, - 0x56C, - 0x56D, - 0x56E, - 0x574, - 0x575, - 0x576, - 0x577, - 0x57E, - 0x57F, - 0x582, - 0x583, - 0x1F00, - 0x1F08, - 0x1F20, - 0x1F28, - 0x1F60, - 0x1F68, - 0x1F70, - 0x1F71, - 0x1F74, - 0x1F75, - 0x1F7C, - 0x1F7D -}; - -# endif /* EBCDIC 1047 */ - -# if 'A' == 193 /* EBCDIC 037 */ \ - && '\\' == 224 && '[' == 186 && ']' == 187 && '{' == 192 && '}' == 208 \ - && '^' == 176 && '~' == 161 && '!' 
== 90 && '#' == 123 && '|' == 79 \ - && '$' == 91 && '@' == 124 && '`' == 121 - -static const UV _Perl_Is_In_Multi_Char_Fold_invlist[] = { /* for EBCDIC 037 */ - 81, /* Number of elements */ - 148565664, /* Version and data structure type */ - 1, /* 0 if the list starts at 0; - 1 if it starts at the element beyond 0 */ - 0x0, - 0x81, - 0x82, - 0x86, - 0x87, - 0x88, - 0x8A, - 0x91, - 0x92, - 0x93, - 0x94, - 0x95, - 0x96, - 0xA2, - 0xA4, - 0xA6, - 0xA7, - 0xA8, - 0xA9, - 0x2BC, - 0x2BD, - 0x2BE, - 0x2BF, - 0x300, - 0x302, - 0x307, - 0x309, - 0x30A, - 0x30B, - 0x30C, - 0x30D, - 0x313, - 0x314, - 0x331, - 0x332, - 0x342, - 0x343, - 0x3AC, - 0x3AD, - 0x3AE, - 0x3AF, - 0x3B1, - 0x3B2, - 0x3B7, - 0x3B8, - 0x3B9, - 0x3BA, - 0x3C1, - 0x3C2, - 0x3C5, - 0x3C6, - 0x3C9, - 0x3CA, - 0x3CE, - 0x3CF, - 0x565, - 0x566, - 0x56B, - 0x56C, - 0x56D, - 0x56E, - 0x574, - 0x575, - 0x576, - 0x577, - 0x57E, - 0x57F, - 0x582, - 0x583, - 0x1F00, - 0x1F08, - 0x1F20, - 0x1F28, - 0x1F60, - 0x1F68, - 0x1F70, - 0x1F71, - 0x1F74, - 0x1F75, - 0x1F7C, - 0x1F7D -}; - -# endif /* EBCDIC 037 */ - -# if 'A' == 65 /* ASCII/Latin1 */ - static const UV _Perl_IVCF_invlist[] = { /* for ASCII/Latin1 */ 1297, /* Number of elements */ 148565664, /* Version and data structure type */ @@ -181849,6 +181395,442 @@ static const UV UNI__PERL_IDSTART_invlist[] = { /* for EBCDIC 037 */ # endif /* EBCDIC 037 */ +# if 'A' == 65 /* ASCII/Latin1 */ + +static const UV UNI__PERL_IS_IN_MULTI_CHAR_FOLD_invlist[] = { /* for ASCII/Latin1 */ + 79, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x61, + 0x62, + 0x66, + 0x67, + 0x68, + 0x6B, + 0x6C, + 0x6D, + 0x6E, + 0x6F, + 0x73, + 0x75, + 0x77, + 0x78, + 0x79, + 0x7A, + 0x2BC, + 0x2BD, + 0x2BE, + 0x2BF, + 0x300, + 0x302, + 0x307, + 0x309, + 0x30A, + 0x30B, + 0x30C, + 0x30D, + 0x313, + 0x314, + 0x331, + 0x332, + 0x342, + 0x343, + 0x3AC, + 0x3AD, + 0x3AE, + 0x3AF, + 
0x3B1, + 0x3B2, + 0x3B7, + 0x3B8, + 0x3B9, + 0x3BA, + 0x3C1, + 0x3C2, + 0x3C5, + 0x3C6, + 0x3C9, + 0x3CA, + 0x3CE, + 0x3CF, + 0x565, + 0x566, + 0x56B, + 0x56C, + 0x56D, + 0x56E, + 0x574, + 0x575, + 0x576, + 0x577, + 0x57E, + 0x57F, + 0x582, + 0x583, + 0x1F00, + 0x1F08, + 0x1F20, + 0x1F28, + 0x1F60, + 0x1F68, + 0x1F70, + 0x1F71, + 0x1F74, + 0x1F75, + 0x1F7C, + 0x1F7D +}; + +# endif /* ASCII/Latin1 */ + +# if 'A' == 193 /* EBCDIC 1047 */ \ + && '\\' == 224 && '[' == 173 && ']' == 189 && '{' == 192 && '}' == 208 \ + && '^' == 95 && '~' == 161 && '!' == 90 && '#' == 123 && '|' == 79 \ + && '$' == 91 && '@' == 124 && '`' == 121 + +static const UV UNI__PERL_IS_IN_MULTI_CHAR_FOLD_invlist[] = { /* for EBCDIC 1047 */ + 81, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x81, + 0x82, + 0x86, + 0x87, + 0x88, + 0x8A, + 0x91, + 0x92, + 0x93, + 0x94, + 0x95, + 0x96, + 0xA2, + 0xA4, + 0xA6, + 0xA7, + 0xA8, + 0xA9, + 0x2BC, + 0x2BD, + 0x2BE, + 0x2BF, + 0x300, + 0x302, + 0x307, + 0x309, + 0x30A, + 0x30B, + 0x30C, + 0x30D, + 0x313, + 0x314, + 0x331, + 0x332, + 0x342, + 0x343, + 0x3AC, + 0x3AD, + 0x3AE, + 0x3AF, + 0x3B1, + 0x3B2, + 0x3B7, + 0x3B8, + 0x3B9, + 0x3BA, + 0x3C1, + 0x3C2, + 0x3C5, + 0x3C6, + 0x3C9, + 0x3CA, + 0x3CE, + 0x3CF, + 0x565, + 0x566, + 0x56B, + 0x56C, + 0x56D, + 0x56E, + 0x574, + 0x575, + 0x576, + 0x577, + 0x57E, + 0x57F, + 0x582, + 0x583, + 0x1F00, + 0x1F08, + 0x1F20, + 0x1F28, + 0x1F60, + 0x1F68, + 0x1F70, + 0x1F71, + 0x1F74, + 0x1F75, + 0x1F7C, + 0x1F7D +}; + +# endif /* EBCDIC 1047 */ + +# if 'A' == 193 /* EBCDIC 037 */ \ + && '\\' == 224 && '[' == 186 && ']' == 187 && '{' == 192 && '}' == 208 \ + && '^' == 176 && '~' == 161 && '!' 
== 90 && '#' == 123 && '|' == 79 \ + && '$' == 91 && '@' == 124 && '`' == 121 + +static const UV UNI__PERL_IS_IN_MULTI_CHAR_FOLD_invlist[] = { /* for EBCDIC 037 */ + 81, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x81, + 0x82, + 0x86, + 0x87, + 0x88, + 0x8A, + 0x91, + 0x92, + 0x93, + 0x94, + 0x95, + 0x96, + 0xA2, + 0xA4, + 0xA6, + 0xA7, + 0xA8, + 0xA9, + 0x2BC, + 0x2BD, + 0x2BE, + 0x2BF, + 0x300, + 0x302, + 0x307, + 0x309, + 0x30A, + 0x30B, + 0x30C, + 0x30D, + 0x313, + 0x314, + 0x331, + 0x332, + 0x342, + 0x343, + 0x3AC, + 0x3AD, + 0x3AE, + 0x3AF, + 0x3B1, + 0x3B2, + 0x3B7, + 0x3B8, + 0x3B9, + 0x3BA, + 0x3C1, + 0x3C2, + 0x3C5, + 0x3C6, + 0x3C9, + 0x3CA, + 0x3CE, + 0x3CF, + 0x565, + 0x566, + 0x56B, + 0x56C, + 0x56D, + 0x56E, + 0x574, + 0x575, + 0x576, + 0x577, + 0x57E, + 0x57F, + 0x582, + 0x583, + 0x1F00, + 0x1F08, + 0x1F20, + 0x1F28, + 0x1F60, + 0x1F68, + 0x1F70, + 0x1F71, + 0x1F74, + 0x1F75, + 0x1F7C, + 0x1F7D +}; + +# endif /* EBCDIC 037 */ + +# if 'A' == 65 /* ASCII/Latin1 */ + +static const UV UNI__PERL_NON_FINAL_FOLDS_invlist[] = { /* for ASCII/Latin1 */ + 39, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x66, + 0x67, + 0x69, + 0x6A, + 0x6C, + 0x6D, + 0x6E, + 0x6F, + 0x73, + 0x75, + 0x2BE, + 0x2BF, + 0x300, + 0x302, + 0x307, + 0x309, + 0x30A, + 0x30B, + 0x30C, + 0x30D, + 0x313, + 0x314, + 0x331, + 0x332, + 0x342, + 0x343, + 0x3B9, + 0x3BA, + 0x565, + 0x566, + 0x56B, + 0x56C, + 0x56D, + 0x56E, + 0x576, + 0x577, + 0x582, + 0x583 +}; + +# endif /* ASCII/Latin1 */ + +# if 'A' == 193 /* EBCDIC 1047 */ \ + && '\\' == 224 && '[' == 173 && ']' == 189 && '{' == 192 && '}' == 208 \ + && '^' == 95 && '~' == 161 && '!' 
== 90 && '#' == 123 && '|' == 79 \ + && '$' == 91 && '@' == 124 && '`' == 121 + +static const UV UNI__PERL_NON_FINAL_FOLDS_invlist[] = { /* for EBCDIC 1047 */ + 39, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x86, + 0x87, + 0x89, + 0x8A, + 0x93, + 0x94, + 0x95, + 0x96, + 0xA2, + 0xA4, + 0x2BE, + 0x2BF, + 0x300, + 0x302, + 0x307, + 0x309, + 0x30A, + 0x30B, + 0x30C, + 0x30D, + 0x313, + 0x314, + 0x331, + 0x332, + 0x342, + 0x343, + 0x3B9, + 0x3BA, + 0x565, + 0x566, + 0x56B, + 0x56C, + 0x56D, + 0x56E, + 0x576, + 0x577, + 0x582, + 0x583 +}; + +# endif /* EBCDIC 1047 */ + +# if 'A' == 193 /* EBCDIC 037 */ \ + && '\\' == 224 && '[' == 186 && ']' == 187 && '{' == 192 && '}' == 208 \ + && '^' == 176 && '~' == 161 && '!' == 90 && '#' == 123 && '|' == 79 \ + && '$' == 91 && '@' == 124 && '`' == 121 + +static const UV UNI__PERL_NON_FINAL_FOLDS_invlist[] = { /* for EBCDIC 037 */ + 39, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x86, + 0x87, + 0x89, + 0x8A, + 0x93, + 0x94, + 0x95, + 0x96, + 0xA2, + 0xA4, + 0x2BE, + 0x2BF, + 0x300, + 0x302, + 0x307, + 0x309, + 0x30A, + 0x30B, + 0x30C, + 0x30D, + 0x313, + 0x314, + 0x331, + 0x332, + 0x342, + 0x343, + 0x3B9, + 0x3BA, + 0x565, + 0x566, + 0x56B, + 0x56C, + 0x56D, + 0x56E, + 0x576, + 0x577, + 0x582, + 0x583 +}; + +# endif /* EBCDIC 037 */ + static const UV UNI__PERL_PROBLEMATIC_LOCALE_FOLDEDS_START_invlist[] = { /* for all charsets */ 26, /* Number of elements */ 148565664, /* Version and data structure type */ @@ -382319,7 +382301,9 @@ typedef enum { UNI__PERL_FOLDS_TO_MULTI_CHAR, UNI__PERL_IDCONT, UNI__PERL_IDSTART, + UNI__PERL_IS_IN_MULTI_CHAR_FOLD, UNI__PERL_NCHAR, + UNI__PERL_NON_FINAL_FOLDS, UNI__PERL_PATWS, UNI__PERL_PROBLEMATIC_LOCALE_FOLDEDS_START, UNI__PERL_PROBLEMATIC_LOCALE_FOLDS, @@ 
-383465,7 +383449,9 @@ static const UV * const uni_prop_ptrs[] = { UNI__PERL_FOLDS_TO_MULTI_CHAR_invlist, UNI__PERL_IDCONT_invlist, UNI__PERL_IDSTART_invlist, + UNI__PERL_IS_IN_MULTI_CHAR_FOLD_invlist, UNI__PERL_NCHAR_invlist, + UNI__PERL_NON_FINAL_FOLDS_invlist, UNI__PERL_PATWS_invlist, UNI__PERL_PROBLEMATIC_LOCALE_FOLDEDS_START_invlist, UNI__PERL_PROBLEMATIC_LOCALE_FOLDS_invlist, @@ -383474,46 +383460,85 @@ static const UV * const uni_prop_ptrs[] = { }; -/* Synonyms for perl properties */ +/* Synonyms for perl properties, and their tables */ #define UNI_AHEX UNI_POSIXXDIGIT +#define UNI_AHEX_invlist UNI_POSIXXDIGIT_invlist #define UNI_ALNUM UNI_XPOSIXALNUM +#define UNI_ALNUM_invlist UNI_XPOSIXALNUM_invlist #define UNI_ALPHA UNI_XPOSIXALPHA #define UNI_ALPHABETIC UNI_XPOSIXALPHA +#define UNI_ALPHABETIC_invlist UNI_XPOSIXALPHA_invlist +#define UNI_ALPHA_invlist UNI_XPOSIXALPHA_invlist #define UNI_ASCIIHEXDIGIT UNI_POSIXXDIGIT +#define UNI_ASCIIHEXDIGIT_invlist UNI_POSIXXDIGIT_invlist #define UNI_BASICLATIN UNI_ASCII +#define UNI_BASICLATIN_invlist UNI_ASCII_invlist #define UNI_BLANK UNI_XPOSIXBLANK +#define UNI_BLANK_invlist UNI_XPOSIXBLANK_invlist #define UNI_CC UNI_XPOSIXCNTRL +#define UNI_CC_invlist UNI_XPOSIXCNTRL_invlist #define UNI_CNTRL UNI_XPOSIXCNTRL +#define UNI_CNTRL_invlist UNI_XPOSIXCNTRL_invlist #define UNI_CONTROL UNI_XPOSIXCNTRL +#define UNI_CONTROL_invlist UNI_XPOSIXCNTRL_invlist #define UNI_DECIMALNUMBER UNI_XPOSIXDIGIT +#define UNI_DECIMALNUMBER_invlist UNI_XPOSIXDIGIT_invlist #define UNI_DIGIT UNI_XPOSIXDIGIT +#define UNI_DIGIT_invlist UNI_XPOSIXDIGIT_invlist #define UNI_GRAPH UNI_XPOSIXGRAPH +#define UNI_GRAPH_invlist UNI_XPOSIXGRAPH_invlist #define UNI_HEX UNI_XPOSIXXDIGIT #define UNI_HEXDIGIT UNI_XPOSIXXDIGIT +#define UNI_HEXDIGIT_invlist UNI_XPOSIXXDIGIT_invlist +#define UNI_HEX_invlist UNI_XPOSIXXDIGIT_invlist #define UNI_HORIZSPACE UNI_XPOSIXBLANK +#define UNI_HORIZSPACE_invlist UNI_XPOSIXBLANK_invlist #define UNI_LC UNI_CASEDLETTER 
+#define UNI_LC_invlist UNI_CASEDLETTER_invlist #define UNI_LL UNI_LOWERCASELETTER +#define UNI_LL_invlist UNI_LOWERCASELETTER_invlist #define UNI_LOWER UNI_XPOSIXLOWER #define UNI_LOWERCASE UNI_XPOSIXLOWER +#define UNI_LOWERCASE_invlist UNI_XPOSIXLOWER_invlist +#define UNI_LOWER_invlist UNI_XPOSIXLOWER_invlist #define UNI_LT UNI_TITLE +#define UNI_LT_invlist UNI_TITLE_invlist #define UNI_LU UNI_UPPERCASELETTER +#define UNI_LU_invlist UNI_UPPERCASELETTER_invlist #define UNI_L_ UNI_CASEDLETTER #define UNI_L_AMP_ UNI_CASEDLETTER +#define UNI_L_AMP__invlist UNI_CASEDLETTER_invlist +#define UNI_L__invlist UNI_CASEDLETTER_invlist #define UNI_ND UNI_XPOSIXDIGIT +#define UNI_ND_invlist UNI_XPOSIXDIGIT_invlist #define UNI_PERLSPACE UNI_POSIXSPACE +#define UNI_PERLSPACE_invlist UNI_POSIXSPACE_invlist #define UNI_PERLWORD UNI_POSIXWORD +#define UNI_PERLWORD_invlist UNI_POSIXWORD_invlist #define UNI_PRINT UNI_XPOSIXPRINT +#define UNI_PRINT_invlist UNI_XPOSIXPRINT_invlist #define UNI_SPACE UNI_XPOSIXSPACE #define UNI_SPACEPERL UNI_XPOSIXSPACE +#define UNI_SPACEPERL_invlist UNI_XPOSIXSPACE_invlist +#define UNI_SPACE_invlist UNI_XPOSIXSPACE_invlist #define UNI_TITLECASE UNI_TITLE #define UNI_TITLECASELETTER UNI_TITLE +#define UNI_TITLECASELETTER_invlist UNI_TITLE_invlist +#define UNI_TITLECASE_invlist UNI_TITLE_invlist #define UNI_UPPER UNI_XPOSIXUPPER #define UNI_UPPERCASE UNI_XPOSIXUPPER +#define UNI_UPPERCASE_invlist UNI_XPOSIXUPPER_invlist +#define UNI_UPPER_invlist UNI_XPOSIXUPPER_invlist #define UNI_WHITESPACE UNI_XPOSIXSPACE +#define UNI_WHITESPACE_invlist UNI_XPOSIXSPACE_invlist #define UNI_WORD UNI_XPOSIXWORD +#define UNI_WORD_invlist UNI_XPOSIXWORD_invlist #define UNI_WSPACE UNI_XPOSIXSPACE +#define UNI_WSPACE_invlist UNI_XPOSIXSPACE_invlist #define UNI_XDIGIT UNI_XPOSIXXDIGIT +#define UNI_XDIGIT_invlist UNI_XPOSIXXDIGIT_invlist #define UNI_XPERLSPACE UNI_XPOSIXSPACE +#define UNI_XPERLSPACE_invlist UNI_XPOSIXSPACE_invlist #endif /* (defined(PERL_IN_REGCOMP_C) && ! 
defined(PERL_IN_XSUB_RE)) */ @@ -383704,9 +383729,9 @@ static const U8 WB_table[23][23] = { * 018e20fa2f469667cc7ccd8a3d4a4a8cce8ad9bdf5fce5b2f61137660ea1065f lib/unicore/extracted/DLineBreak.txt * 88c30a794011f5e6dc62154342e8bab1bd4ce2d0c0ab06fb69ba47134dc75b23 lib/unicore/extracted/DNumType.txt * dab1e84f48990e30635a4f489d33212b25d0e35d80839c08e33a8afe5736346c lib/unicore/extracted/DNumValues.txt - * aa29d36570237ec49c1fbd59d9cfafefcce690a286d3273ff4da49da1dfc88f2 lib/unicore/mktables + * b4d304a8173a57d5baca855ba3465c0fbec45a723983445c03bb80cf2dff9f46 lib/unicore/mktables * 7bd6bcbe3813e0cd55e0998053d182b7bc8c97dcfd0b85028e9f7f55af4ad61b lib/unicore/version * 4bb677187a1a64e39d48f2e341b5ecb6c99857e49d7a79cf503bd8a3c709999b regen/charset_translations.pl * 03e51b0f07beebd5da62ab943899aa4934eee1f792fa27c1fb638c33bf4ac6ea regen/mk_PL_charclass.pl - * 35eecb67dfc9b89a150036e4dcd76de5d46f20d6ddd6976188e1df94a4055b7b regen/mk_invlists.pl + * 915f594b9c5a2d11ee0e0e8fbb758939fd53601f9020d20c34e495bc13b65d96 regen/mk_invlists.pl * ex: set ro: */ diff --git a/embedvar.h b/embedvar.h index 8743da7778..539fc5a32e 100644 --- a/embedvar.h +++ b/embedvar.h @@ -365,8 +365,8 @@ #define PL_GLB_invlist (my_vars->GLB_invlist) #define PL_Latin1 (my_vars->GLatin1) #define PL_GLatin1 (my_vars->GLatin1) -#define PL_NonL1NonFinalFold (my_vars->GNonL1NonFinalFold) -#define PL_GNonL1NonFinalFold (my_vars->GNonL1NonFinalFold) +#define PL_NonFinalFold (my_vars->GNonFinalFold) +#define PL_GNonFinalFold (my_vars->GNonFinalFold) #define PL_Posix_ptrs (my_vars->GPosix_ptrs) #define PL_GPosix_ptrs (my_vars->GPosix_ptrs) #define PL_SB_invlist (my_vars->GSB_invlist) @@ -405,6 +405,8 @@ #define PL_Ghash_state (my_vars->Ghash_state) #define PL_hints_mutex (my_vars->Ghints_mutex) #define PL_Ghints_mutex (my_vars->Ghints_mutex) +#define PL_in_some_fold (my_vars->Gin_some_fold) +#define PL_Gin_some_fold (my_vars->Gin_some_fold) #define PL_keyword_plugin (my_vars->Gkeyword_plugin) #define 
PL_Gkeyword_plugin (my_vars->Gkeyword_plugin) #define PL_keyword_plugin_mutex (my_vars->Gkeyword_plugin_mutex) @@ -467,8 +469,6 @@ #define PL_Gutf8_charname_begin (my_vars->Gutf8_charname_begin) #define PL_utf8_charname_continue (my_vars->Gutf8_charname_continue) #define PL_Gutf8_charname_continue (my_vars->Gutf8_charname_continue) -#define PL_utf8_foldable (my_vars->Gutf8_foldable) -#define PL_Gutf8_foldable (my_vars->Gutf8_foldable) #define PL_utf8_foldclosures (my_vars->Gutf8_foldclosures) #define PL_Gutf8_foldclosures (my_vars->Gutf8_foldclosures) #define PL_utf8_idcont (my_vars->Gutf8_idcont) diff --git a/lib/unicore/mktables b/lib/unicore/mktables index da5a919f8e..75dad35cbf 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -14980,8 +14980,19 @@ sub compile_perl() { Description => "Code points whose fold is a string of more than one character", ); + my $in_multi_fold = $perl->add_match_table( + "_Perl_Is_In_Multi_Char_Fold", + Description => + "Code points that are in some multiple character fold", + ); + my $non_final_fold = $perl->add_match_table( + "_Perl_Non_Final_Folds", + Description => "Code points that are in some multiple character fold, but not in the final position", + ); if ($v_version lt v3.0.1) { - push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char'; + push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char', + '_Perl_Is_In_Multi_Char_Fold', + '_Perl_Non_Final_Folds'; } # Look through all the known folds to populate these tables. 
@@ -15009,6 +15020,12 @@ sub compile_perl() { $loc_problem_folds->add_range($start, $end); $found_locale_problematic = 1; } + + if (@hex_folds > 1) { + $in_multi_fold->add_range($cp, $cp); + next if $i < @hex_folds - 1; + $non_final_fold->add_range($cp, $cp); + } } # If this is a problematic fold, add to the start chars the diff --git a/perlapi.h b/perlapi.h index bd1d4348d0..cb77694651 100644 --- a/perlapi.h +++ b/perlapi.h @@ -117,8 +117,8 @@ END_EXTERN_C #define PL_LB_invlist (*Perl_GLB_invlist_ptr(NULL)) #undef PL_Latin1 #define PL_Latin1 (*Perl_GLatin1_ptr(NULL)) -#undef PL_NonL1NonFinalFold -#define PL_NonL1NonFinalFold (*Perl_GNonL1NonFinalFold_ptr(NULL)) +#undef PL_NonFinalFold +#define PL_NonFinalFold (*Perl_GNonFinalFold_ptr(NULL)) #undef PL_Posix_ptrs #define PL_Posix_ptrs (*Perl_GPosix_ptrs_ptr(NULL)) #undef PL_SB_invlist @@ -157,6 +157,8 @@ END_EXTERN_C #define PL_hash_state (*Perl_Ghash_state_ptr(NULL)) #undef PL_hints_mutex #define PL_hints_mutex (*Perl_Ghints_mutex_ptr(NULL)) +#undef PL_in_some_fold +#define PL_in_some_fold (*Perl_Gin_some_fold_ptr(NULL)) #undef PL_keyword_plugin #define PL_keyword_plugin (*Perl_Gkeyword_plugin_ptr(NULL)) #undef PL_keyword_plugin_mutex @@ -215,8 +217,6 @@ END_EXTERN_C #define PL_utf8_charname_begin (*Perl_Gutf8_charname_begin_ptr(NULL)) #undef PL_utf8_charname_continue #define PL_utf8_charname_continue (*Perl_Gutf8_charname_continue_ptr(NULL)) -#undef PL_utf8_foldable -#define PL_utf8_foldable (*Perl_Gutf8_foldable_ptr(NULL)) #undef PL_utf8_foldclosures #define PL_utf8_foldclosures (*Perl_Gutf8_foldclosures_ptr(NULL)) #undef PL_utf8_idcont diff --git a/perlvars.h b/perlvars.h index 4f0b6c07d8..2a4696bbcb 100644 --- a/perlvars.h +++ b/perlvars.h @@ -279,14 +279,13 @@ PERLVAR(G, HasMultiCharFold, SV *) PERLVAR(G, InMultiCharFold, SV *) PERLVAR(G, Latin1, SV *) PERLVAR(G, LB_invlist, SV *) -PERLVAR(G, NonL1NonFinalFold, SV *) +PERLVAR(G, NonFinalFold, SV *) PERLVAR(G, SB_invlist, SV *) PERLVAR(G, SCX_invlist, SV *) 
PERLVAR(G, UpperLatin1, SV *) /* Code points 128 - 255 */ -/* List of characters that participate in folds (except marks, etc in - * multi-char folds) */ -PERLVARI(G, utf8_foldable, SV *, NULL) +/* List of characters that participate in any fold defined by Unicode */ +PERLVARI(G, in_some_fold, SV *, NULL) PERLVAR(G, utf8_idcont, SV *) PERLVAR(G, utf8_idstart, SV *) diff --git a/regcharclass.h b/regcharclass.h index 179f2b3bf8..3a78d24066 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -1909,7 +1909,7 @@ * 018e20fa2f469667cc7ccd8a3d4a4a8cce8ad9bdf5fce5b2f61137660ea1065f lib/unicore/extracted/DLineBreak.txt * 88c30a794011f5e6dc62154342e8bab1bd4ce2d0c0ab06fb69ba47134dc75b23 lib/unicore/extracted/DNumType.txt * dab1e84f48990e30635a4f489d33212b25d0e35d80839c08e33a8afe5736346c lib/unicore/extracted/DNumValues.txt - * aa29d36570237ec49c1fbd59d9cfafefcce690a286d3273ff4da49da1dfc88f2 lib/unicore/mktables + * b4d304a8173a57d5baca855ba3465c0fbec45a723983445c03bb80cf2dff9f46 lib/unicore/mktables * 7bd6bcbe3813e0cd55e0998053d182b7bc8c97dcfd0b85028e9f7f55af4ad61b lib/unicore/version * 4bb677187a1a64e39d48f2e341b5ecb6c99857e49d7a79cf503bd8a3c709999b regen/charset_translations.pl * 6b1c1caf1004a96c15218dba6cec482f36a036ed91bb545315ba280319133a80 regen/regcharclass.pl diff --git a/regcomp.c b/regcomp.c index 83e7029d50..6da2983617 100644 --- a/regcomp.c +++ b/regcomp.c @@ -3802,7 +3802,7 @@ S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *sour * require special handling. The joining is only done if: * 1) there is room in the current conglomerated node to entirely contain the * next one. - * 2) they are the exact same node type + * 2) they are compatible node types * * The adjacent nodes actually may be separated by NOTHING-kind nodes, and * these get optimized out @@ -10535,13 +10535,30 @@ Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) * identical. 
The final argument, if TRUE, says to take the complement of * the second inversion list before doing the comparison */ - const UV* array_a = invlist_array(a); - const UV* array_b = invlist_array(b); - UV len_a = _invlist_len(a); + const UV len_a = _invlist_len(a); UV len_b = _invlist_len(b); + const UV* array_a = NULL; + const UV* array_b = NULL; + PERL_ARGS_ASSERT__INVLISTEQ; + /* This code avoids accessing the arrays unless it knows the length is + * non-zero */ + + if (len_a == 0) { + if (len_b == 0) { + return ! complement_b; + } + } + else { + array_a = invlist_array(a); + } + + if (len_b != 0) { + array_b = invlist_array(b); + } + /* If are to compare 'a' with the complement of b, set it * up so are looking at b's complement. */ if (complement_b) { @@ -10551,7 +10568,7 @@ Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) if (len_b == 0) { return (len_a == 1 && array_a[0] == 0); } - else if (array_b[0] == 0) { + if (array_b[0] == 0) { /* Otherwise, to complement, we invert. Here, the first element is * 0, just remove it. To do this, we just pretend the array starts @@ -10692,19 +10709,19 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) } else { /* Single char fold */ unsigned int k; - unsigned int first_folds_to; - const unsigned int * remaining_folds_to_list; - Size_t folds_to_count; + unsigned int first_fold; + const unsigned int * remaining_folds; + Size_t folds_count; /* It matches itself */ invlist = add_cp_to_invlist(invlist, fc); /* ... plus all the things that fold to it, which are found in * PL_utf8_foldclosures */ - folds_to_count = _inverse_folds(fc, &first_folds_to, - &remaining_folds_to_list); - for (k = 0; k < folds_to_count; k++) { - UV c = (k == 0) ? first_folds_to : remaining_folds_to_list[k-1]; + folds_count = _inverse_folds(fc, &first_fold, + &remaining_folds); + for (k = 0; k < folds_count; k++) { + UV c = (k == 0) ? 
first_fold : remaining_folds[k-1]; /* /aa doesn't allow folds between ASCII and non- */ if ( (OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE) @@ -13049,7 +13066,7 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, cases, avoiding the _invlist_contains_cp() overhead for those. */ - && ! _invlist_contains_cp(PL_utf8_foldable, code_point)) + && ! _invlist_contains_cp(PL_in_some_fold, code_point)) { OP(REGNODE_p(node)) = (LOC) ? EXACTL @@ -14345,7 +14362,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) else /* regular fold; see if actually is in a fold */ if ( (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender)) || (ender > 255 - && ! _invlist_contains_cp(PL_utf8_foldable, ender))) + && ! _invlist_contains_cp(PL_in_some_fold, ender))) { /* Here, folding, but the character isn't in a fold. * @@ -14546,7 +14563,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } } else if (! _invlist_contains_cp( - PL_NonL1NonFinalFold, + PL_NonFinalFold, valid_utf8_to_uvchr((U8 *) s, NULL))) { break; @@ -16456,9 +16473,9 @@ S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invl default: /* Other code points are checked against the data for the current Unicode version */ { - Size_t folds_to_count; - unsigned int first_folds_to; - const unsigned int * remaining_folds_to_list; + Size_t folds_count; + unsigned int first_fold; + const unsigned int * remaining_folds; UV folded_cp; if (isASCII(cp)) { @@ -16474,9 +16491,9 @@ S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invl *invlist = add_cp_to_invlist(*invlist, folded_cp); } - folds_to_count = _inverse_folds(folded_cp, &first_folds_to, - &remaining_folds_to_list); - if (folds_to_count == 0) { + folds_count = _inverse_folds(folded_cp, &first_fold, + &remaining_folds); + if (folds_count == 0) { /* Use deprecated warning to increase the chances of this being * output */ @@ -16487,13 +16504,13 @@ S_add_above_Latin1_folds(pTHX_ 
RExC_state_t *pRExC_state, const U8 cp, SV** invl else { unsigned int i; - if (first_folds_to > 255) { - *invlist = add_cp_to_invlist(*invlist, first_folds_to); + if (first_fold > 255) { + *invlist = add_cp_to_invlist(*invlist, first_fold); } - for (i = 0; i < folds_to_count - 1; i++) { - if (remaining_folds_to_list[i] > 255) { + for (i = 0; i < folds_count - 1; i++) { + if (remaining_folds[i] > 255) { *invlist = add_cp_to_invlist(*invlist, - remaining_folds_to_list[i]); + remaining_folds[i]); } } } @@ -16696,7 +16713,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* inversion list of code points this node matches only when the target * string is in UTF-8. These are all non-ASCII, < 256. (Because is under * /d) */ - SV* has_upper_latin1_only_utf8_matches = NULL; + SV* upper_latin1_only_utf8_matches = NULL; /* Inversion list of code points this node matches regardless of things * like locale, folding, utf8ness of the target string */ @@ -17979,7 +17996,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * be checked. Get the intersection of this class and all the * possible characters that are foldable. 
This can quickly narrow * down a large class */ - _invlist_intersection(PL_utf8_foldable, cp_foldable_list, + _invlist_intersection(PL_in_some_fold, cp_foldable_list, &fold_intersection); /* Now look at the foldable characters in this class individually */ @@ -17993,9 +18010,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, U8 foldbuf[UTF8_MAXBYTES_CASE+1]; STRLEN foldlen; unsigned int k; - Size_t folds_to_count; - unsigned int first_folds_to; - const unsigned int * remaining_folds_to_list; + Size_t folds_count; + unsigned int first_fold; + const unsigned int * remaining_folds; if (j < 256) { @@ -18008,11 +18025,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, *use_list = add_cp_to_invlist(*use_list, PL_fold_latin1[j]); } - else { - has_upper_latin1_only_utf8_matches - = add_cp_to_invlist( - has_upper_latin1_only_utf8_matches, - PL_fold_latin1[j]); + else if (j != PL_fold_latin1[j]) { + upper_latin1_only_utf8_matches + = add_cp_to_invlist( + upper_latin1_only_utf8_matches, + PL_fold_latin1[j]); } } @@ -18038,16 +18055,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* Single character fold of above Latin1. Add everything * in its fold closure to the list that this node should * match. */ - folds_to_count = _inverse_folds(folded, &first_folds_to, - &remaining_folds_to_list); - for (k = 0; k <= folds_to_count; k++) { + folds_count = _inverse_folds(folded, &first_fold, + &remaining_folds); + for (k = 0; k <= folds_count; k++) { UV c = (k == 0) /* First time through use itself */ ? folded : (k == 1) /* 2nd time use, the first fold */ - ? first_folds_to + ? 
first_fold /* Then the remaining ones */ - : remaining_folds_to_list[k-2]; + : remaining_folds[k-2]; /* /aa doesn't allow folds between ASCII and non- */ if (( ASCII_FOLD_RESTRICTED @@ -18071,10 +18088,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, else { /* Similarly folds involving non-ascii Latin1 * characters under /d are added to their list */ - has_upper_latin1_only_utf8_matches - = add_cp_to_invlist( - has_upper_latin1_only_utf8_matches, - c); + upper_latin1_only_utf8_matches + = add_cp_to_invlist( + upper_latin1_only_utf8_matches, + c); } } } @@ -18154,12 +18171,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* Likewise for anything else in the range that matched only * under UTF-8 */ - if (has_upper_latin1_only_utf8_matches) { + if (upper_latin1_only_utf8_matches) { _invlist_union(cp_list, - has_upper_latin1_only_utf8_matches, + upper_latin1_only_utf8_matches, &cp_list); - SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches); - has_upper_latin1_only_utf8_matches = NULL; + SvREFCNT_dec_NN(upper_latin1_only_utf8_matches); + upper_latin1_only_utf8_matches = NULL; } /* If we don't match all the upper Latin1 characters regardless @@ -18185,9 +18202,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, &nonascii_but_latin1_properties); /* And add them to the final list of such characters. 
*/ - _invlist_union(has_upper_latin1_only_utf8_matches, + _invlist_union(upper_latin1_only_utf8_matches, nonascii_but_latin1_properties, - &has_upper_latin1_only_utf8_matches); + &upper_latin1_only_utf8_matches); /* Remove them from what now becomes the unconditional list */ _invlist_subtract(posixes, nonascii_but_latin1_properties, @@ -18208,12 +18225,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* Get rid of any characters that we now know are matched * unconditionally from the conditional list, which may make * that list empty */ - _invlist_subtract(has_upper_latin1_only_utf8_matches, + _invlist_subtract(upper_latin1_only_utf8_matches, cp_list, - &has_upper_latin1_only_utf8_matches); - if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) { - SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches); - has_upper_latin1_only_utf8_matches = NULL; + &upper_latin1_only_utf8_matches); + if (_invlist_len(upper_latin1_only_utf8_matches) == 0) { + SvREFCNT_dec_NN(upper_latin1_only_utf8_matches); + upper_latin1_only_utf8_matches = NULL; } } } @@ -18228,7 +18245,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * class that isn't a Unicode property, and which matches above Unicode, \W * or [\x{110000}] for example. 
* (Note that in this case, unlike the Posix one above, there is no - * <has_upper_latin1_only_utf8_matches>, because having a Unicode property + * <upper_latin1_only_utf8_matches>, because having a Unicode property * forces Unicode semantics */ if (properties) { if (cp_list) { @@ -18305,7 +18322,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } } else if ( DEPENDS_SEMANTICS - && ( has_upper_latin1_only_utf8_matches + && ( upper_latin1_only_utf8_matches || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))) { use_anyofd = TRUE; @@ -18408,7 +18425,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } } else { - if (_invlist_contains_cp(PL_utf8_foldable, value)) { + if (_invlist_contains_cp(PL_in_some_fold, value)) { op = EXACT; } } @@ -18669,17 +18686,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* Here, the bitmap has been populated with all the Latin1 code points that * always match. Can now add to the overall list those that match only - * when the target string is UTF-8 (<has_upper_latin1_only_utf8_matches>). + * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>). 
* */ - if (has_upper_latin1_only_utf8_matches) { + if (upper_latin1_only_utf8_matches) { if (cp_list) { _invlist_union(cp_list, - has_upper_latin1_only_utf8_matches, + upper_latin1_only_utf8_matches, &cp_list); - SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches); + SvREFCNT_dec_NN(upper_latin1_only_utf8_matches); } else { - cp_list = has_upper_latin1_only_utf8_matches; + cp_list = upper_latin1_only_utf8_matches; } ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP; } @@ -21618,12 +21635,11 @@ Perl_init_uniprops(pTHX) PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]); PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]); - PL_utf8_foldable = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]); + PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]); PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[ UNI__PERL_FOLDS_TO_MULTI_CHAR]); - PL_InMultiCharFold = _new_invlist_C_array(_Perl_Is_In_Multi_Char_Fold_invlist); - PL_NonL1NonFinalFold = _new_invlist_C_array( - NonL1_Perl_Non_Final_Folds_invlist); + PL_InMultiCharFold = _new_invlist_C_array(UNI__PERL_IS_IN_MULTI_CHAR_FOLD_invlist); + PL_NonFinalFold = _new_invlist_C_array(UNI__PERL_NON_FINAL_FOLDS_invlist); PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist); PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist); @@ -21633,10 +21649,12 @@ Perl_init_uniprops(pTHX) PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist); PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]); +#ifdef UNI_XIDC /* The below are used only by deprecated functions. 
They could be removed */ PL_utf8_xidcont = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]); PL_utf8_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]); PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]); +#endif } SV * diff --git a/regen/mk_invlists.pl b/regen/mk_invlists.pl index 980b90cafe..2587f0e0d4 100644 --- a/regen/mk_invlists.pl +++ b/regen/mk_invlists.pl @@ -137,6 +137,8 @@ my %keep_together = ( posixxdigit => 1, _perl_any_folds => 1, _perl_folds_to_multi_char => 1, + _perl_is_in_multi_char_fold => 1, + _perl_non_final_folds => 1, _perl_idstart => 1, _perl_idcont => 1, _perl_charname_begin => 1, @@ -887,36 +889,6 @@ die "Could not find inversion map for Case_Folding" unless defined $format; die "Incorrect format '$format' for Case_Folding inversion map" unless $format eq 'al' || $format eq 'a'; -my @is_in_multi_char_fold; -my @is_non_final_fold; - -for my $i (0 .. @$folds_ref - 1) { - next unless ref $folds_ref->[$i]; # Skip single-char folds - - # Add to the is_in_multis ls list each code point that is in a - # multi-character fold, and to the non-finals list each code point that is - # in a non-final position - for my $j (0 .. @{$folds_ref->[$i]} - 1) { - push @is_in_multi_char_fold, $folds_ref->[$i][$j]; - last if $j == @{$folds_ref->[$i]} - 1; - push @is_non_final_fold, $folds_ref->[$i][$j]; - } - @is_non_final_fold = uniques @is_non_final_fold; - @is_in_multi_char_fold = uniques @is_in_multi_char_fold; -} - -sub _Perl_Is_In_Multi_Char_Fold { - @is_in_multi_char_fold = sort { $a <=> $b } @is_in_multi_char_fold; - my @return = mk_invlist_from_sorted_cp_list(\@is_in_multi_char_fold); - return \@return; -} - -sub _Perl_Non_Final_Folds { - @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold; - my @return = mk_invlist_from_sorted_cp_list(\@is_non_final_fold); - return \@return; -} - sub _Perl_IVCF { # This creates a map of the inversion of case folding. 
i.e., given a @@ -2348,8 +2320,6 @@ no warnings 'qw'; # Ignore non-alpha in sort my @props; push @props, sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw( - &NonL1_Perl_Non_Final_Folds - &_Perl_Is_In_Multi_Char_Fold &UpperLatin1 _Perl_GCB,EDGE,E_Base,E_Base_GAZ,E_Modifier,Glue_After_Zwj,LV,Prepend,Regional_Indicator,SpacingMark,ZWJ,XPG_XX _Perl_LB,EDGE,Close_Parenthesis,Hebrew_Letter,Next_Line,Regional_Indicator,ZWJ,Contingent_Break,E_Base,E_Modifier,H2,H3,JL,JT,JV,Word_Joiner @@ -2508,10 +2478,12 @@ foreach my $property (sort # And a #define for all simple names equivalent to a perl property, # except those that begin with 'is' or 'in'; if (exists $perl_tags{$tag} && $property !~ / ^ i[ns] | = /x) { - push @perl_prop_synonyms, "#define " - . $table_name_prefix - . uc(sanitize_name($define)) + my $name = $table_name_prefix . uc(sanitize_name($define)); + push @perl_prop_synonyms, "#define $name" . " $defined_to"; + push @perl_prop_synonyms, "#define " + . "${name}_invlist" + . " ${defined_to}_invlist"; } } } @@ -2975,7 +2947,7 @@ output_table_trailer(); print $out_fh join "\n", "\n", #'# ifdef DOINIT', #"\n", - "/* Synonyms for perl properties */", + "/* Synonyms for perl properties, and their tables */", @perl_prop_synonyms, #"\n", #"# endif /* DOINIT */", diff --git a/regexec.c b/regexec.c index dc5131b0da..f7da37251f 100644 --- a/regexec.c +++ b/regexec.c @@ -2182,7 +2182,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, STRLEN lnc; U8 c1; U8 c2; - char *e; + char *e = NULL; /* In some cases we accept only the first occurence of 'x' in a sequence of * them. 
This variable points to just beyond the end of the previous @@ -4529,7 +4529,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, } } - if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end)) + if ( ( is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end)) || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end))) { /* Multi-character folds require more context to sort out. Also @@ -4540,25 +4540,24 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, else { /* an EXACTFish node which doesn't begin with a multi-char fold */ c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat; if (c1 > 255) { - const unsigned int * remaining_folds_to_list; - unsigned int first_folds_to; + const unsigned int * remaining_folds; + unsigned int first_fold; /* Look up what code points (besides c1) fold to c1; e.g., * [ 'K', KELVIN_SIGN ] both fold to 'k'. */ - Size_t folds_to_count = _inverse_folds(c1, - &first_folds_to, - &remaining_folds_to_list); - if (folds_to_count == 0) { + Size_t folds_count = _inverse_folds(c1, &first_fold, + &remaining_folds); + if (folds_count == 0) { c2 = c1; /* there is only a single character that could match */ } - else if (folds_to_count != 1) { + else if (folds_count != 1) { /* If there aren't exactly two folds to this (itself and * another), it is outside the scope of this function */ use_chrtest_void = TRUE; } else { /* There are two. We already have one, get the other */ - c2 = first_folds_to; + c2 = first_fold; /* Folds that cross the 255/256 boundary are forbidden if * EXACTFL (and isnt a UTF8 locale), or EXACTFAA and one is @@ -4578,12 +4577,12 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, } } else /* Here, c1 is <= 255 */ - if (utf8_target + if ( utf8_target && HAS_NONLATIN1_FOLD_CLOSURE(c1) && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE)) - && ((OP(text_node) != EXACTFAA - && OP(text_node) != EXACTFAA_NO_TRIE) - || ! 
isASCII(c1))) + && ( ( OP(text_node) != EXACTFAA + && OP(text_node) != EXACTFAA_NO_TRIE) + || ! isASCII(c1))) { /* Here, there could be something above Latin1 in the target * which folds to this character in the pattern. All such @@ -6345,7 +6344,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) break; } - case EXACTFL: { /* /abc/il */ + case EXACTFL: /* /abc/il */ + { re_fold_t folder; const U8 * fold_array; const char * s; @@ -6363,8 +6363,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if (! utf8_target) { sayNO; } - fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED - | FOLDEQ_S1_FOLDS_SANE; + fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED + | FOLDEQ_S2_FOLDS_SANE; folder = foldEQ_latin1; fold_array = PL_fold_latin1; goto do_exactf; @@ -6374,14 +6374,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) sayNO; } assert(is_utf8_pat); - fold_utf8_flags = FOLDEQ_S1_ALREADY_FOLDED; + fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED; goto do_exactf; case EXACTFU_SS: /* /\x{df}/iu */ case EXACTFU: /* /abc/iu */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; - fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0; + fold_utf8_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; goto do_exactf; case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 @@ -6405,7 +6405,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s = STRING(scan); ln = STR_LEN(scan); - if (utf8_target + if ( utf8_target || is_utf8_pat || state_num == EXACTFU_SS || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE)) @@ -6415,8 +6415,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const char * const l = locinput; char *e = reginfo->strend; - if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat, - l, &e, 0, utf8_target, fold_utf8_flags)) + if (! 
foldEQ_utf8_flags(l, &e, 0, utf8_target, + s, 0, ln, is_utf8_pat,fold_utf8_flags)) { sayNO; } @@ -6433,7 +6433,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } if (reginfo->strend - locinput < ln) sayNO; - if (ln > 1 && ! folder(s, locinput, ln)) + if (ln > 1 && ! folder(locinput, s, ln)) sayNO; locinput += ln; break; @@ -7129,7 +7129,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) sayNO; if (ln > 1 && (type == REF ? memNE(s, locinput, ln) - : ! folder(s, locinput, ln))) + : ! folder(locinput, s, ln))) sayNO; locinput += ln; break; @@ -8237,15 +8237,6 @@ NULL regnode *text_node = ST.B; if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node); - /* this used to be - - (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT) - - But the former is redundant in light of the latter. - - if this changes back then the macro for - IS_TEXT and friends need to change. - */ if (PL_regkind[OP(text_node)] == EXACT) { if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8, @@ -8413,13 +8404,6 @@ NULL ST.c1 = ST.c2 = CHRTEST_VOID; } ... 13955 lines suppressed ... -- Perl5 Master Repository