In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/40f914fd7fc2115d5df1c2b1ecc1d960d5f0a210?hp=6302f837102d66f532a1c151f7299abbef3a15dd>
- Log ----------------------------------------------------------------- commit 40f914fd7fc2115d5df1c2b1ecc1d960d5f0a210 Author: Karl Williamson <[email protected]> Date: Mon May 5 21:09:36 2014 -0600 regen/regcharclass.pl: Add new macro type with intermed checking This adds a new macro generation option for inputs that are checked elsewhere for buffer overflow, but otherwise needs validity checks. ----------------------------------------------------------------------- Summary of changes: regcharclass.h | 48 ++++++++++++++++++------------------------------ regen/regcharclass.pl | 48 ++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 56 insertions(+), 40 deletions(-) diff --git a/regcharclass.h b/regcharclass.h index c2a0ac7..f4a7e08 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -1009,20 +1009,16 @@ : ( ( 0x72 == ((U8*)s)[2] ) && ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x47 ) ) ? 4 : 0 ) : 0 ) /* - UTF8_CHAR: Matches utf8 from 1 to 3 bytes + UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 1 through 3 bytes 0x0 - 0x3FFF */ /*** GENERATED CODE ***/ -#define is_UTF8_CHAR_utf8_safe(s,e) \ -( ((e) > (s)) ? \ - ( ( ( ( ((U8*)s)[0] & 0xC0 ) == 0x00 ) || ( ( ((U8*)s)[0] & 0xEF ) == 0x40 ) || ( ( ((U8*)s)[0] & 0xDF ) == 0x4B ) || ( ( ((U8*)s)[0] & 0xCC ) == 0x4C ) || ( ( ((U8*)s)[0] & 0xDE ) == 0x5A ) || ( ... [550 chars truncated] - : (((e) - (s)) >= UTF8SKIP(s)) ? \ - ( ( 0x80 == ((U8*)s)[0] || ( 0x8A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x90 ) || ( 0x9A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xA0 ) || ( 0xAA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xAC ) || ( 0xAE <= ((U8*)s ... [34 chars truncated] - ( ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( ((U8*)s)[1] & 0xFC ) == 0x70 ) ? 
2 : 0 )\ - : ( ( ( ( ( ((U8*)s)[0] & 0xFC ) == 0xB8 ) || ((U8*)s)[0] == 0xBC || ( ( ((U8*)s)[0] & 0xFE ) == 0xBE ) || ( ( ((U8*)s)[0] & 0xEE ) == 0xCA ) || ( ( ((U8*)s)[0] & 0xFC ) == 0xCC ) ) && ( ( 0x41 ... [378 chars truncated] - : 0 ) \ -: 0 ) +#define is_UTF8_CHAR_utf8_no_length_checks(s) \ +( ( ( ( ((U8*)s)[0] & 0xC0 ) == 0x00 ) || ( ( ((U8*)s)[0] & 0xEF ) == 0x40 ) || ( ( ((U8*)s)[0] & 0xDF ) == 0x4B ) || ( ( ((U8*)s)[0] & 0xCC ) == 0x4C ) || ( ( ((U8*)s)[0] & 0xDE ) == 0x5A ) || ( ( ( ... [546 chars truncated] +: ( 0x80 == ((U8*)s)[0] || ( 0x8A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x90 ) || ( 0x9A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xA0 ) || ( 0xAA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xAC ) || ( 0xAE <= ((U8*)s)[0] ... [29 chars truncated] + ( ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( ((U8*)s)[1] & 0xFC ) == 0x70 ) ? 2 : 0 ... [2 chars truncated] +: ( ( ( ( ( ((U8*)s)[0] & 0xFC ) == 0xB8 ) || ((U8*)s)[0] == 0xBC || ( ( ((U8*)s)[0] & 0xFE ) == 0xBE ) || ( ( ((U8*)s)[0] & 0xEE ) == 0xCA ) || ( ( ((U8*)s)[0] & 0xFC ) == 0xCC ) ) && ( ( 0x41 <= (( ... [372 chars truncated] /* QUOTEMETA: Meta-characters that \Q should quote @@ -1731,20 +1727,16 @@ : ( ( 0x71 == ((U8*)s)[2] ) && ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x47 ) ) ? 4 : 0 ) : 0 ) /* - UTF8_CHAR: Matches utf8 from 1 to 3 bytes + UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 1 through 3 bytes 0x0 - 0x3FFF */ /*** GENERATED CODE ***/ -#define is_UTF8_CHAR_utf8_safe(s,e) \ -( ((e) > (s)) ? \ - ( ( ( ( ((U8*)s)[0] & 0xC0 ) == 0x00 ) || ( ( ((U8*)s)[0] & 0xEF ) == 0x40 ) || ( ( ((U8*)s)[0] & 0xDF ) == 0x4B ) || ( ( ((U8*)s)[0] & 0xFC ) == 0x4C ) || ( ( ((U8*)s)[0] & 0xDE ) == 0x5A ) || ( ... [672 chars truncated] - : (((e) - (s)) >= UTF8SKIP(s)) ? 
\ - ( ( 0x78 == ((U8*)s)[0] || 0x80 == ((U8*)s)[0] || ( 0x8A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x90 ) || ( 0x9A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xA0 ) || ( 0xAA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xAF ... [57 chars truncated] - ( ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || 0x5F == ((U8*)s)[1] || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( 0x70 <= ((U8*)s)[1 ... [39 chars truncated] - : ( ( ( ((U8*)s)[0] == 0xB7 || ( ( ((U8*)s)[0] & 0xFE ) == 0xB8 ) || ( ( ((U8*)s)[0] & 0xFC ) == 0xBC ) || ( ( ((U8*)s)[0] & 0xEE ) == 0xCA ) || ( ( ((U8*)s)[0] & 0xFC ) == 0xCC ) ) && ( ( 0x41 ... [456 chars truncated] - : 0 ) \ -: 0 ) +#define is_UTF8_CHAR_utf8_no_length_checks(s) \ +( ( ( ( ((U8*)s)[0] & 0xC0 ) == 0x00 ) || ( ( ((U8*)s)[0] & 0xEF ) == 0x40 ) || ( ( ((U8*)s)[0] & 0xDF ) == 0x4B ) || ( ( ((U8*)s)[0] & 0xFC ) == 0x4C ) || ( ( ((U8*)s)[0] & 0xDE ) == 0x5A ) || ( ( ( ... [668 chars truncated] +: ( 0x78 == ((U8*)s)[0] || 0x80 == ((U8*)s)[0] || ( 0x8A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x90 ) || ( 0x9A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xA0 ) || ( 0xAA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xAF ) || ... [52 chars truncated] + ( ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || 0x5F == ((U8*)s)[1] || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( 0x70 <= ((U8*)s) ... [41 chars truncated] +: ( ( ( ((U8*)s)[0] == 0xB7 || ( ( ((U8*)s)[0] & 0xFE ) == 0xB8 ) || ( ( ((U8*)s)[0] & 0xFC ) == 0xBC ) || ( ( ((U8*)s)[0] & 0xEE ) == 0xCA ) || ( ( ((U8*)s)[0] & 0xFC ) == 0xCC ) ) && ( ( 0x41 <= (( ... [450 chars truncated] /* QUOTEMETA: Meta-characters that \Q should quote @@ -2461,20 +2453,16 @@ : ( ( 0x74 == ((U8*)s)[2] ) && ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x47 ) ) ? 
4 : 0 ) : 0 ) /* - UTF8_CHAR: Matches utf8 from 1 to 3 bytes + UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 1 through 3 bytes 0x0 - 0x3FFF */ /*** GENERATED CODE ***/ -#define is_UTF8_CHAR_utf8_safe(s,e) \ -( ((e) > (s)) ? \ - ( ( ( ( ((U8*)s)[0] & 0xC0 ) == 0x00 ) || ( ( ((U8*)s)[0] & 0xEF ) == 0x40 ) || ( ( ((U8*)s)[0] & 0xCE ) == 0x4A ) || ( ( ((U8*)s)[0] & 0xCC ) == 0x4C ) || ( ( ((U8*)s)[0] & 0xFE ) == 0x60 ) || ( ... [534 chars truncated] - : (((e) - (s)) >= UTF8SKIP(s)) ? \ - ( ( ( 0x8A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x90 ) || ( 0x9A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xA1 ) || ( 0xAA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xB5 ) ) ?\ - ( ( ( ( ((U8*)s)[1] & 0xEF ) == 0x41 ) || ( ( ((U8*)s)[1] & 0xCE ) == 0x42 ) || ( ( ((U8*)s)[1] & 0xEC ) == 0x44 ) || ( ( ((U8*)s)[1] & 0xEE ) == 0x48 ) || ( ( ((U8*)s)[1] & 0xFC ) == 0x64 ) || ( ( ... [82 chars truncated] - : ( ( ( ( 0xB7 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xBA ) || ( 0xBE <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xC0 ) || ( 0xCA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xD0 ) || 0xDA == ((U8*)s)[0] ) && ( ( ( ((U8* ... [540 chars truncated] - : 0 ) \ -: 0 ) +#define is_UTF8_CHAR_utf8_no_length_checks(s) \ +( ( ( ( ((U8*)s)[0] & 0xC0 ) == 0x00 ) || ( ( ((U8*)s)[0] & 0xEF ) == 0x40 ) || ( ( ((U8*)s)[0] & 0xCE ) == 0x4A ) || ( ( ((U8*)s)[0] & 0xCC ) == 0x4C ) || ( ( ((U8*)s)[0] & 0xFE ) == 0x60 ) || ( ( ( ... [530 chars truncated] +: ( ( 0x8A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x90 ) || ( 0x9A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xA1 ) || ( 0xAA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xB5 ) ) ?\ + ( ( ( ( ((U8*)s)[1] & 0xEF ) == 0x41 ) || ( ( ((U8*)s)[1] & 0xCE ) == 0x42 ) || ( ( ((U8*)s)[1] & 0xEC ) == 0x44 ) || ( ( ((U8*)s)[1] & 0xEE ) == 0x48 ) || ( ( ((U8*)s)[1] & 0xFC ) == 0x64 ) || ( ... [84 chars truncated] +: ( ( ( ( 0xB7 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xBA ) || ( 0xBE <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xC0 ) || ( 0xCA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xD0 ) || 0xDA == ((U8*)s)[0] ) && ( ( ( ((U8*)s)[1 ... 
[534 chars truncated] /* QUOTEMETA: Meta-characters that \Q should quote diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index 187b91d..d37b863 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -110,6 +110,13 @@ include it, and it is a NULL. =back +The above isn't quite complete, as for specialized purposes one can get a +macro like C<is_WHATEVER_utf8_no_length_checks(s)>, which assumes that it is +already known that there is enough space to hold the character starting at +C<s>, but otherwise checks that it is well-formed. In other words, this is +intermediate in strictness between C<is_WHATEVER_utf8(s)> and +C<is_WHATEVER_utf8_safe(s,e)>. + =head2 CODE FORMAT perltidy -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f" @@ -1275,9 +1282,11 @@ sub render { # make a macro of a given type. # calls into make_trie and (generic_|length_)optree as needed # Opts are: -# type : 'cp','cp_high', 'generic','high','low','latin1','utf8','LATIN1','UTF8' -# ret_type : 'cp' or 'len' -# safe : add length guards to macro +# type : 'cp','cp_high', 'generic','high','low','latin1','utf8','LATIN1','UTF8' +# ret_type : 'cp' or 'len' +# safe : don't assume the input is well-formed UTF-8, so don't skip any range +# checks, and add length guards to macro +# no_length_checks : like safe, but don't add length guards. # # type defaults to 'generic', and ret_type to 'len' unless type is 'cp' # in which case it defaults to 'cp' as well. @@ -1324,6 +1333,7 @@ sub make_macro { my $ext= $type =~ /generic/ ? '' : '_' . 
lc( $type ); $ext .= '_non_low' if $type eq 'generic_non_low'; $ext .= "_safe" if $opts{safe}; + $ext .= "_no_length_checks" if $opts{no_length_checks}; my $argstr= join ",", @args; my $def_fmt="$pfx$self->{op}$ext%s($argstr)"; my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type ); @@ -1372,6 +1382,7 @@ EOF my @mods; push @mods, 'safe' if delete $mods{safe}; + push @mods, 'no_length_checks' if delete $mods{no_length_checks}; unshift @mods, 'fast' if delete $mods{fast} || ! @mods; # Default to 'fast' # do this one # first, as @@ -1390,14 +1401,15 @@ EOF # way a cp macro will get generated. Below we convert 'safe' # to 'fast' in this instance next if $type =~ /^cp/ - && $mod eq 'safe' - && grep { 'fast' eq $_ } @mods; + && ($mod eq 'safe' || $mod eq 'no_length_checks') + && grep { 'fast' =~ $_ } @mods; delete $mods{$mod}; my $macro= $obj->make_macro( type => $type, ret_type => $ret, safe => $mod eq 'safe' && $type !~ /^cp/, charset => $charset, + no_length_checks => $mod eq 'no_length_checks' && $type !~ /^cp/, ); print $out_fh $macro, "\n"; } @@ -1534,6 +1546,9 @@ EOF # string. In the case of non-UTF8, it makes sure that the # string has at least one byte in it. The macro name has # '_safe' appended to it. +# no_length_checks The input string is not necessarily valid UTF-8, but it +# is to be assumed that the length has already been checked and +# found to be valid # fast The input string is valid UTF-8. No bounds checking is done, # and the macro can make assumptions that lead to faster # execution. @@ -1629,13 +1644,26 @@ GCB_V: Grapheme_Cluster_Break=V # then this was commented out because it takes so long to figure out these 2 # million code points. The results would not change unless utf8.h decides it # wants a maximum other than 4 bytes, or this program creates better -# optimizations -#UTF8_CHAR: Matches utf8 from 1 to 4 bytes -#=> UTF8 :safe only_ascii_platform +# optimizations. Trying with 5 bytes used too much memory to calculate. 
+# +# NOTE: The number of bytes generated here must match the value in +# IS_UTF8_CHAR_FAST in utf8.h +# +#UTF8_CHAR: Matches legal UTF-8 encoded characters from 1 through 4 bytes +#=> UTF8 :no_length_checks only_ascii_platform #0x0 - 0x1FFFFF -UTF8_CHAR: Matches utf8 from 1 to 3 bytes -=> UTF8 :safe only_ebcdic_platform +# This hasn't been commented out, but the number of bytes it works on has been +# cut down to 3, so it doesn't cover the full legal Unicode range. Making it +# 5 bytes would cover beyond the full range, but takes quite a bit of time and +# memory to calculate. The generated table varies depending on the EBCDIC +# code page. + +# NOTE: The number of bytes generated here must match the value in +# IS_UTF8_CHAR_FAST in utf8.h +# +UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 1 through 3 bytes +=> UTF8 :no_length_checks only_ebcdic_platform 0x0 - 0x3FFF QUOTEMETA: Meta-characters that \Q should quote -- Perl5 Master Repository
