In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/0ed2b00b2bd6a650391433ad8733983692df43d5?hp=f2645549e6a4bfca055b5fd1932da462df424921>
- Log ----------------------------------------------------------------- commit 0ed2b00b2bd6a650391433ad8733983692df43d5 Author: Karl Williamson <[email protected]> Date: Mon May 12 18:29:41 2014 -0600 Fix definition of toCTRL() for EBCDIC The definition was incorrect. When going from control to printable name, we need to go from Latin1 -> Native, so that e.g., a 65 gets turned into the native 'A' M handy.h M utf8.h M utfebcdic.h commit dd9bc2b0af8e838ed989897601a0ee36eeed092f Author: Karl Williamson <[email protected]> Date: Tue May 6 13:18:28 2014 -0600 Add some (UN)?LIKELY() to UTF8 handling It's very rare actually for code to be presented with malformed UTF-8, so give the compiler a hint about the likely branches. M regcharclass.h M regen/regcharclass.pl M utf8.h ----------------------------------------------------------------------- Summary of changes: handy.h | 12 +++++++----- regcharclass.h | 16 ++++++++-------- regen/regcharclass.pl | 7 ++++--- utf8.h | 10 +++++++--- utfebcdic.h | 4 ++++ 5 files changed, 30 insertions(+), 19 deletions(-) diff --git a/handy.h b/handy.h index 13f8d22..740ebe5 100644 --- a/handy.h +++ b/handy.h @@ -1653,11 +1653,13 @@ EXTCONST U32 PL_charclass[]; #ifndef EBCDIC # define toCTRL(c) (toUPPER(c) ^ 64) #else -# define toCTRL(c) ((c) == '?' \ - ? LATIN1_TO_NATIVE(0x9F) \ - : (c) == LATIN1_TO_NATIVE(0x9F) \ - ? '?' \ - : (NATIVE_TO_LATIN1(toUPPER(c)) ^ 64)) +# define toCTRL(c) ((isPRINT_A(c)) \ + ? UNLIKELY((c) == '?') \ + ? QUESTION_MARK_CTRL \ + : (NATIVE_TO_LATIN1(toUPPER(c)) ^ 64) \ + : UNLIKELY((c) == QUESTION_MARK_CTRL) \ + ? ((c) == '?') \ + : (LATIN1_TO_NATIVE((c) ^ 64))) #endif /* Line numbers are unsigned, 32 bits. */ diff --git a/regcharclass.h b/regcharclass.h index ebda2f7..7de537b 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -708,11 +708,11 @@ */ /*** GENERATED CODE ***/ #define is_PATWS_safe(s,e,is_utf8) \ -( ((e) > (s)) ? \ +( ( LIKELY((e) > (s)) ) ? \ ( ( ( 0x09 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x0D ) || 0x20 == ((U8*)s)[0] ) ? 1\ : (! is_utf8 ) ? \ ( 0x85 == ((U8*)s)[0] ) \ - : (((e) - (s)) >= UTF8SKIP(s)) ? \ + : ( LIKELY(((e) - (s)) >= UTF8SKIP(s)) ) ? \ ( ( 0xC2 == ((U8*)s)[0] ) ? \ ( ( 0x85 == ((U8*)s)[1] ) ? 2 : 0 ) \ : ( ( ( 0xE2 == ((U8*)s)[0] ) && ( 0x80 == ((U8*)s)[1] ) ) && ( ( ((U8*)s)[2] & 0xFE ) == 0x8E || ( ((U8*)s)[2] & 0xFE ) == 0xA8 ) ) ? 3 : 0 )\ @@ -1427,9 +1427,9 @@ */ /*** GENERATED CODE ***/ #define is_PATWS_safe(s,e,is_utf8) \ -( ((e) > (s)) ? \ +( ( LIKELY((e) > (s)) ) ? \ ( ( ( ( ((U8*)s)[0] & 0xEF ) == 0x05 ) || ((U8*)s)[0] == 0x0B || ( ( ((U8*)s)[0] & 0xFE ) == 0x0C ) || ((U8*)s)[0] == 0x25 || ((U8*)s)[0] == 0x40 ) ? 1\ - : ( ( is_utf8 && (((e) - (s)) >= UTF8SKIP(s)) ) && ( 0xCA == ((U8*)s)[0] ) ) ? ( ( 0x41 == ((U8*)s)[1] ) ?\ + : ( ( is_utf8 && LIKELY(((e) - (s)) >= UTF8SKIP(s)) ) && ( 0xCA == ((U8*)s)[0] ) ) ? ( ( 0x41 == ((U8*)s)[1] ) ?\ ( ( 0x55 == ((U8*)s)[2] || 0x56 == ((U8*)s)[2] ) ? 3 : 0 )\ : ( ( 0x42 == ((U8*)s)[1] ) && ( 0x49 == ((U8*)s)[2] || 0x4A == ((U8*)s)[2] ) ) ? 3 : 0 ) : 0 )\ : 0 ) @@ -2152,9 +2152,9 @@ */ /*** GENERATED CODE ***/ #define is_PATWS_safe(s,e,is_utf8) \ -( ((e) > (s)) ? \ +( ( LIKELY((e) > (s)) ) ? \ ( ( ( ( ((U8*)s)[0] & 0xEF ) == 0x05 ) || ((U8*)s)[0] == 0x0B || ( ( ((U8*)s)[0] & 0xFE ) == 0x0C ) || ((U8*)s)[0] == 0x25 || ((U8*)s)[0] == 0x40 ) ? 1\ - : ( ( is_utf8 && (((e) - (s)) >= UTF8SKIP(s)) ) && ( 0xCA == ((U8*)s)[0] ) ) ? ( ( 0x41 == ((U8*)s)[1] ) ?\ + : ( ( is_utf8 && LIKELY(((e) - (s)) >= UTF8SKIP(s)) ) && ( 0xCA == ((U8*)s)[0] ) ) ? ( ( 0x41 == ((U8*)s)[1] ) ?\ ( ( 0x55 == ((U8*)s)[2] || 0x56 == ((U8*)s)[2] ) ? 3 : 0 )\ : ( ( 0x42 == ((U8*)s)[1] ) && ( 0x49 == ((U8*)s)[2] || 0x4A == ((U8*)s)[2] ) ) ? 3 : 0 ) : 0 )\ : 0 ) @@ -2877,9 +2877,9 @@ */ /*** GENERATED CODE ***/ #define is_PATWS_safe(s,e,is_utf8) \ -( ((e) > (s)) ? \ +( ( LIKELY((e) > (s)) ) ? \ ( ( ( ( ((U8*)s)[0] & 0xEF ) == 0x05 ) || ((U8*)s)[0] == 0x0B || ( ( ((U8*)s)[0] & 0xFE ) == 0x0C ) || ((U8*)s)[0] == 0x25 || ((U8*)s)[0] == 0x40 ) ? 1\ - : ( ( is_utf8 && (((e) - (s)) >= UTF8SKIP(s)) ) && ( 0xCA == ((U8*)s)[0] ) ) ? ( ( 0x41 == ((U8*)s)[1] ) ?\ + : ( ( is_utf8 && LIKELY(((e) - (s)) >= UTF8SKIP(s)) ) && ( 0xCA == ((U8*)s)[0] ) ) ? ( ( 0x41 == ((U8*)s)[1] ) ?\ ( ( ( ((U8*)s)[2] & 0xFE ) == 0x56 ) ? 3 : 0 ) \ : ( ( 0x42 == ((U8*)s)[1] ) && ( 0x49 == ((U8*)s)[2] || 0x51 == ((U8*)s)[2] ) ) ? 3 : 0 ) : 0 )\ : 0 ) diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index 1f453e8..7919041 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -671,8 +671,9 @@ sub length_optree { # have only a few things that can match past this, so I (khw) # don't think it is worth it. (Even better would be to use # calculate_mask(keys %$utf8) instead of UTF8_IS_START, and use it - # if it saves a bunch. - my $cond = "(((e) - (s)) >= UTF8SKIP(s))"; + # if it saves a bunch. We assume that input text likely to be + # well-formed . + my $cond = "LIKELY(((e) - (s)) >= UTF8SKIP(s))"; $else = __cond_join($cond, $utf8, $else); # For 'generic', we also will want the latin1 UTF-8 variants for @@ -715,7 +716,7 @@ sub length_optree { } # We need at least one byte available to start off the tests - $else = __cond_join("((e) > (s))", $else, 0); + $else = __cond_join("LIKELY((e) > (s))", $else, 0); } else { # Here, we don't want or there aren't any variants. A single # byte available is enough. diff --git a/utf8.h b/utf8.h index 74e7d48..8945663 100644 --- a/utf8.h +++ b/utf8.h @@ -270,6 +270,10 @@ Perl's extended UTF-8 means we can have start bytes up to FF. #error UTF8_MAXBYTES must be at least 12 #endif +/* ^? is defined to be DEL on ASCII systems. See the definition of toCTRL() + * for more */ +#define QUESTION_MARK_CTRL DEL_NATIVE + #define MAX_UTF8_TWO_BYTE 0x7FF #define UTF8_MAXBYTES_CASE UTF8_MAXBYTES @@ -639,13 +643,13 @@ machines) is a valid UTF-8 character. =cut */ -#define isUTF8_CHAR(s, e) (((e) <= (s)) \ +#define isUTF8_CHAR(s, e) (UNLIKELY((e) <= (s)) \ ? 0 \ : (UTF8_IS_INVARIANT(*s)) \ ? 1 \ - : (((e) - (s)) < UTF8SKIP(s)) \ + : UNLIKELY(((e) - (s)) < UTF8SKIP(s)) \ ? 0 \ - : (IS_UTF8_CHAR_FAST(UTF8SKIP(s))) \ + : LIKELY(IS_UTF8_CHAR_FAST(UTF8SKIP(s))) \ ? is_UTF8_CHAR_utf8_no_length_checks(s) \ : _is_utf8_char_slow(s, e)) diff --git a/utfebcdic.h b/utfebcdic.h index 1211c9f..7eec66f 100644 --- a/utfebcdic.h +++ b/utfebcdic.h @@ -194,6 +194,10 @@ END_EXTERN_C * character occupies 5 bytes, therefore this number is 15 */ #define UTF8_MAXBYTES_CASE 15 +/* ^? is defined to be APC on EBCDIC systems. See the definition of toCTRL() + * for more */ +#define QUESTION_MARK_CTRL LATIN1_TO_NATIVE(0x9F) + #define MAX_UTF8_TWO_BYTE 0x3FF /* -- Perl5 Master Repository
