In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/5dca92787911972e6827cbb3173c9b1f44ea8613?hp=40f914fd7fc2115d5df1c2b1ecc1d960d5f0a210>
- Log ----------------------------------------------------------------- commit 5dca92787911972e6827cbb3173c9b1f44ea8613 Author: Karl Williamson <[email protected]> Date: Mon May 5 22:17:33 2014 -0600 utf8.h: Use new macro type from previous commit This allows for an efficient isUTF8_CHAR macro, which does its own length checking, and uses the UTF8_INVARIANT macro for the first byte. On EBCDIC systems this macro which does a table lookup is quite a bit more efficient than all the branches that would normally have to be done. ----------------------------------------------------------------------- Summary of changes: regcharclass.h | 21 ++++++++---------- regen/regcharclass.pl | 12 +++++++---- utf8.h | 60 +++++++++++++++++++++------------------------------ 3 files changed, 42 insertions(+), 51 deletions(-) diff --git a/regcharclass.h b/regcharclass.h index f4a7e08..ebda2f7 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -1009,14 +1009,13 @@ : ( ( 0x72 == ((U8*)s)[2] ) && ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x47 ) ) ? 4 : 0 ) : 0 ) /* - UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 1 through 3 bytes + UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 2 through 3 bytes - 0x0 - 0x3FFF + 0xA0 - 0x3FFF */ /*** GENERATED CODE ***/ #define is_UTF8_CHAR_utf8_no_length_checks(s) \ -( ( ( ( ((U8*)s)[0] & 0xC0 ) == 0x00 ) || ( ( ((U8*)s)[0] & 0xEF ) == 0x40 ) || ( ( ((U8*)s)[0] & 0xDF ) == 0x4B ) || ( ( ((U8*)s)[0] & 0xCC ) == 0x4C ) || ( ( ((U8*)s)[0] & 0xDE ) == 0x5A ) || ( ( ( ... [546 chars truncated] -: ( 0x80 == ((U8*)s)[0] || ( 0x8A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x90 ) || ( 0x9A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xA0 ) || ( 0xAA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xAC ) || ( 0xAE <= ((U8*)s)[0] ... [29 chars truncated] +( ( 0x80 == ((U8*)s)[0] || ( 0x8A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x90 ) || ( 0x9A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xA0 ) || ( 0xAA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xAC ) || ( 0xAE <= ((U8*)s)[0] ... [29 chars truncated] ( ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( ((U8*)s)[1] & 0xFC ) == 0x70 ) ? 2 : 0 ... [2 chars truncated] : ( ( ( ( ( ((U8*)s)[0] & 0xFC ) == 0xB8 ) || ((U8*)s)[0] == 0xBC || ( ( ((U8*)s)[0] & 0xFE ) == 0xBE ) || ( ( ((U8*)s)[0] & 0xEE ) == 0xCA ) || ( ( ((U8*)s)[0] & 0xFC ) == 0xCC ) ) && ( ( 0x41 <= (( ... [372 chars truncated] @@ -1727,14 +1726,13 @@ : ( ( 0x71 == ((U8*)s)[2] ) && ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x47 ) ) ? 4 : 0 ) : 0 ) /* - UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 1 through 3 bytes + UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 2 through 3 bytes - 0x0 - 0x3FFF + 0xA0 - 0x3FFF */ /*** GENERATED CODE ***/ #define is_UTF8_CHAR_utf8_no_length_checks(s) \ -( ( ( ( ((U8*)s)[0] & 0xC0 ) == 0x00 ) || ( ( ((U8*)s)[0] & 0xEF ) == 0x40 ) || ( ( ((U8*)s)[0] & 0xDF ) == 0x4B ) || ( ( ((U8*)s)[0] & 0xFC ) == 0x4C ) || ( ( ((U8*)s)[0] & 0xDE ) == 0x5A ) || ( ( ( ... [668 chars truncated] -: ( 0x78 == ((U8*)s)[0] || 0x80 == ((U8*)s)[0] || ( 0x8A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x90 ) || ( 0x9A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xA0 ) || ( 0xAA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xAF ) || ... [52 chars truncated] +( ( 0x78 == ((U8*)s)[0] || 0x80 == ((U8*)s)[0] || ( 0x8A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x90 ) || ( 0x9A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xA0 ) || ( 0xAA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xAF ) || ... [52 chars truncated] ( ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || 0x5F == ((U8*)s)[1] || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( 0x70 <= ((U8*)s) ... [41 chars truncated] : ( ( ( ((U8*)s)[0] == 0xB7 || ( ( ((U8*)s)[0] & 0xFE ) == 0xB8 ) || ( ( ((U8*)s)[0] & 0xFC ) == 0xBC ) || ( ( ((U8*)s)[0] & 0xEE ) == 0xCA ) || ( ( ((U8*)s)[0] & 0xFC ) == 0xCC ) ) && ( ( 0x41 <= (( ... [450 chars truncated] @@ -2453,14 +2451,13 @@ : ( ( 0x74 == ((U8*)s)[2] ) && ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x47 ) ) ? 4 : 0 ) : 0 ) /* - UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 1 through 3 bytes + UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 2 through 3 bytes - 0x0 - 0x3FFF + 0xA0 - 0x3FFF */ /*** GENERATED CODE ***/ #define is_UTF8_CHAR_utf8_no_length_checks(s) \ -( ( ( ( ((U8*)s)[0] & 0xC0 ) == 0x00 ) || ( ( ((U8*)s)[0] & 0xEF ) == 0x40 ) || ( ( ((U8*)s)[0] & 0xCE ) == 0x4A ) || ( ( ((U8*)s)[0] & 0xCC ) == 0x4C ) || ( ( ((U8*)s)[0] & 0xFE ) == 0x60 ) || ( ( ( ... [530 chars truncated] -: ( ( 0x8A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x90 ) || ( 0x9A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xA1 ) || ( 0xAA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xB5 ) ) ?\ +( ( ( 0x8A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x90 ) || ( 0x9A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xA1 ) || ( 0xAA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xB5 ) ) ?\ ( ( ( ( ((U8*)s)[1] & 0xEF ) == 0x41 ) || ( ( ((U8*)s)[1] & 0xCE ) == 0x42 ) || ( ( ((U8*)s)[1] & 0xEC ) == 0x44 ) || ( ( ((U8*)s)[1] & 0xEE ) == 0x48 ) || ( ( ((U8*)s)[1] & 0xFC ) == 0x64 ) || ( ... [84 chars truncated] : ( ( ( ( 0xB7 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xBA ) || ( 0xBE <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xC0 ) || ( 0xCA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xD0 ) || 0xDA == ((U8*)s)[0] ) && ( ( ( ((U8*)s)[1 ... [534 chars truncated] diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index d37b863..1f453e8 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -1646,12 +1646,16 @@ GCB_V: Grapheme_Cluster_Break=V # wants a maximum other than 4 bytes, or this program creates better # optimizations. Trying with 5 bytes used too much memory to calculate. # +# We don't generate code for invariants here because the EBCDIC form is too +# complicated and would slow things down; instead the user should test for +# invariants first. +# # NOTE: The number of bytes generated here must match the value in # IS_UTF8_CHAR_FAST in utf8.h # -#UTF8_CHAR: Matches legal UTF-8 encoded characters from 1 through 4 bytes +#UTF8_CHAR: Matches legal UTF-8 encoded characters from 2 through 4 bytes #=> UTF8 :no_length_checks only_ascii_platform -#0x0 - 0x1FFFFF +#0x80 - 0x1FFFFF # This hasn't been commented out, but the number of bytes it works on has been # cut down to 3, so it doesn't cover the full legal Unicode range. Making it @@ -1662,9 +1666,9 @@ GCB_V: Grapheme_Cluster_Break=V # NOTE: The number of bytes generated here must match the value in # IS_UTF8_CHAR_FAST in utf8.h # -UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 1 through 3 bytes +UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 2 through 3 bytes => UTF8 :no_length_checks only_ebcdic_platform -0x0 - 0x3FFF +0xA0 - 0x3FFF QUOTEMETA: Meta-characters that \Q should quote => high :fast diff --git a/utf8.h b/utf8.h index a18faa2..924380d 100644 --- a/utf8.h +++ b/utf8.h @@ -606,48 +606,38 @@ Perl's extended UTF-8 means we can have start bytes up to FF. * don't take too long to generate, and there is a separate one for each code * page, so they are in regcharclass.h instead of here */ /* - UTF8_CHAR: Matches utf8 from 1 to 4 bytes + UTF8_CHAR: Matches legal UTF-8 encoded characters from 2 through 4 bytes - 0x0 - 0x1FFFFF + 0x80 - 0x1FFFFF */ /*** GENERATED CODE ***/ -#define is_UTF8_CHAR_utf8_safe(s,e) \ -( ((e)-(s) > 3) ? \ - ( ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) ? 1 \ - : ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ? \ - ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \ - : ( 0xE0 == ((U8*)s)[0] ) ? \ - ( ( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\ - : ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) ? \ - ( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\ - : ( 0xF0 == ((U8*)s)[0] ) ? \ - ( ( ( ( 0x90 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\ - : ( ( ( ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF7 ) && ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\ -: ((e)-(s) > 2) ? \ - ( ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) ? 1 \ - : ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ? \ - ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \ - : ( 0xE0 == ((U8*)s)[0] ) ? \ - ( ( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\ - : ( ( ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) && ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\ -: ((e)-(s) > 1) ? \ - ( ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) ? 1 \ - : ( ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) && ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ) ? 2 : 0 )\ -: ((e)-(s) > 0) ? \ - ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) \ -: 0 ) +#define is_UTF8_CHAR_utf8_no_length_checks(s) \ +( ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ? \ + ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \ +: ( 0xE0 == ((U8*)s)[0] ) ? \ + ( ( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\ +: ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) ? \ + ( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\ +: ( 0xF0 == ((U8*)s)[0] ) ? \ + ( ( ( ( 0x90 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\ +: ( ( ( ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF7 ) && ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 ) #endif /* - * =for apidoc isUTF8_CHAR - * - * Returns the number of bytes beginning at C<s> which form a legal UTF-8 (or - * UTF-EBCDIC) encoded character, looking no further than C<e - s> bytes into - * C<s>. Returns 0 if the sequence starting at C<s> through C<e - 1> is not - * well-formed UTF-8 +=head1 Unicode Support + +=for apidoc Am|STRLEN|isUTF8_CHAR|const U8 *s|const U8 *e + +Returns the number of bytes beginning at C<s> which form a legal UTF-8 (or +UTF-EBCDIC) encoded character, looking no further than C<e - s> bytes into +C<s>. Returns 0 if the sequence starting at C<s> through C<e - 1> is not +well-formed UTF-8 Note that an INVARIANT character (i.e. ASCII on non-EBCDIC -machines) is a valid UTF-8 character. */ +machines) is a valid UTF-8 character. + +=cut +*/ #define isUTF8_CHAR(s, e) (((e) <= (s)) \ ? 0 \ @@ -656,7 +646,7 @@ machines) is a valid UTF-8 character. */ : (((e) - (s)) < UTF8SKIP(s)) \ ? 0 \ : (IS_UTF8_CHAR_FAST(UTF8SKIP(s))) \ - ? is_UTF8_CHAR_utf8_safe(s,e) \ + ? is_UTF8_CHAR_utf8_no_length_checks(s) \ : _is_utf8_char_slow(s, e)) /* Do not use; should be deprecated. Use isUTF8_CHAR() instead; this is -- Perl5 Master Repository
