In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/050166318aca495d181c53df119efab77ddad029?hp=4a408539bf705f19d7b7e4702dbada468fb9655f>
- Log ----------------------------------------------------------------- commit 050166318aca495d181c53df119efab77ddad029 Author: Karl Williamson <[email protected]> Date: Sat Mar 2 20:53:04 2013 -0700 regen/unicode_constants.pl: Change #define name This was added in the 5.17 series so there's no code relying on its current name. I think that the abbreviation is clearer. M regen/unicode_constants.pl M unicode_constants.h M x2p/a2py.c commit 1dfa4f529a4cb0bb101513bad440ca3ef7b553d8 Author: Karl Williamson <[email protected]> Date: Sat Mar 2 20:43:56 2013 -0700 regen/unicode_constants.pl: Make portable to non-ASCII This now uses the U+ notation to indicate code points, which is unambiguous no matter what the platform's character set is. (charnames accepts the U+ notation) M regen/unicode_constants.pl M unicode_constants.h commit 5e250d2c6a5f45c36f5dfcf70e02931161e97695 Author: Karl Williamson <[email protected]> Date: Sat Mar 2 20:29:33 2013 -0700 regen/unicode_constants.pl: Remove unused constant This was added in the 5.17 series, so can't yet be in the field; and isn't needed. M regen/unicode_constants.pl M unicode_constants.h commit 5a731a17989fbfa436b52f1a6df9da10b9f9411f Author: Karl Williamson <[email protected]> Date: Sat Mar 2 19:28:43 2013 -0700 regen/unicode_constants.pl: Pass through input comments The data can now have comments, which are converted to C-style comments and passed through M regen/unicode_constants.pl commit e9cddfae7ec5891690e4f3255cb1e68612ed33a9 Author: Karl Williamson <[email protected]> Date: Sat Mar 2 19:19:02 2013 -0700 regen/unicode_constants.pl: Convert '-' in names to '_' Unicode character names can have dashes in them. These aren't accepted in C macro names. Change so both blanks and the hyphen-minus are converted to underscores. 
M regen/unicode_constants.pl ----------------------------------------------------------------------- Summary of changes: regen/unicode_constants.pl | 91 ++++++++++++++++++++++++-------------------- unicode_constants.h | 28 +++++++------- x2p/a2py.c | 2 +- 3 files changed, 65 insertions(+), 56 deletions(-) diff --git a/regen/unicode_constants.pl b/regen/unicode_constants.pl index 48b43f4..1977fbd 100644 --- a/regen/unicode_constants.pl +++ b/regen/unicode_constants.pl @@ -28,12 +28,14 @@ print $out_fh <<END; END # The data are at the end of this file. A blank line is output as-is. -# Otherwise, each line represents one #define, and begins with either a -# Unicode character name with the blanks in it squeezed out or replaced by -# underscores; or it may be a hexadecimal Unicode code point. In the latter +# Comments (lines whose first non-blank is a '#') are converted to C-style, +# though empty comments are converted to blank lines. Otherwise, each line +# represents one #define, and begins with either a Unicode character name with +# the blanks and dashes in it squeezed out or replaced by underscores; or it +# may be a hexadecimal Unicode code point of the form U+xxxx. In the latter # case, the name will be looked-up to use as the name of the macro. In either -# case, the macro name will have suffixes as listed above, and all blanks will -# be replaced by underscores. +# case, the macro name will have suffixes as listed above, and all blanks and +# dashes will be replaced by underscores. # # Each line may optionally have one of the following flags on it, separated by # white space from the initial token. @@ -55,12 +57,21 @@ END # having to figure things out. while ( <DATA> ) { - if ($_ !~ /\S/) { - print $out_fh "\n"; + chomp; + + # Convert any '#' comments to /* ... */; empty lines and comments are + # output as blank lines + if ($_ =~ m/ ^ \s* (?: \# ( .* ) )? 
$ /x) { + my $comment_body = $1 // ""; + if ($comment_body ne "") { + print $out_fh "/* $comment_body */\n"; + } + else { + print $out_fh "\n"; + } next; } - chomp; unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token (?: [\ ]+ ( [^ ]* ) )? # optional flag (?: [\ ]+ ( .* ) )? # name if unnamed; flag is required @@ -75,31 +86,30 @@ while ( <DATA> ) { my $name; my $cp; + my $U_cp; # code point in Unicode (not-native) terms my $undef_ok = $desired_name || $flag =~ /skip_if_undef/; - if ($name_or_cp =~ /[^[:xdigit:]]/) { - - # Anything that isn't a hex value must be a name. - $name = $name_or_cp; - $cp = charnames::vianame($name =~ s/_/ /gr); - die "Unknown name '$name' at line $.: $_\n" unless defined $name; - } - else { - $cp = $name_or_cp; - $name = charnames::viacode("0$cp"); # viacode requires a leading zero - # to be sure that the argument is - # hex + if ($name_or_cp =~ /^U\+(.*)/) { + $U_cp = hex $1; + $name = charnames::viacode($name_or_cp); if (! defined $name) { - die "Unknown code point '$cp' at line $.: $_\n" unless $undef_ok; + die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $undef_ok; $name = ""; } + $cp = utf8::unicode_to_native($U_cp); + } + else { + $name = $name_or_cp; + $cp = charnames::vianame($name =~ s/_/ /gr); + $U_cp = utf8::native_to_unicode($cp); + die "Unknown name '$name' at line $.: $_\n" unless defined $name; } $name = $desired_name if $name eq "" && $desired_name; - $name =~ s/ /_/g; # The macro name can have no blanks in it + $name =~ s/[- ]/_/g; # The macro name can have no blanks nor dashes my $str = join "", map { sprintf "\\x%02X", $_ } - unpack("U0C*", pack("U", hex $cp)); + unpack("U0C*", pack("U", $cp)); my $suffix = '_UTF8'; if (! defined $flag || $flag =~ /^ string (_skip_if_undef)? $/x) { @@ -115,15 +125,14 @@ while ( <DATA> ) { $str = "0x$str"; # Is a numeric constant } elsif ($flag eq 'native') { - die "Are you sure you want to run this on an above-Latin1 code point?" 
if hex $cp > 0xff; + die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff; $suffix = '_NATIVE'; - $str = utf8::unicode_to_native(hex $cp); - $str = "0x$cp"; # Is a numeric constant + $str = sprintf "0x%02X", $cp; # Is a numeric constant } else { die "Unknown flag at line $.: $_\n"; } - print $out_fh "#define ${name}$suffix $str /* U+$cp */\n"; + printf $out_fh "#define %s%s %s /* U+%04X */\n", $name, $suffix, $str, $U_cp; } print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n"; @@ -131,21 +140,21 @@ print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n"; read_only_bottom_close_and_rename($out_fh); __DATA__ -0300 string -0301 string -0308 string -03B9 string +U+0300 string +U+0301 string +U+0308 string + +U+03B9 string -03C5 string +U+03C5 string -2010 string -D800 first FIRST_SURROGATE +U+2010 string +U+D800 first FIRST_SURROGATE -007F native -00DF native -00E5 native -00C5 native -00FF native -00B5 native -0085 native +DEL native +U+00DF native +U+00E5 native +U+00C5 native +U+00FF native +U+00B5 native diff --git a/unicode_constants.h b/unicode_constants.h index f4d3172..19f3acd 100644 --- a/unicode_constants.h +++ b/unicode_constants.h @@ -20,24 +20,24 @@ * "_TAIL" if instead it represents all but the first byte. 
This, and * with no additional suffix are both string constants */ -#define COMBINING_GRAVE_ACCENT_UTF8 "\xCC\x80" /* U+0300 */ -#define COMBINING_ACUTE_ACCENT_UTF8 "\xCC\x81" /* U+0301 */ -#define COMBINING_DIAERESIS_UTF8 "\xCC\x88" /* U+0308 */ -#define GREEK_SMALL_LETTER_IOTA_UTF8 "\xCE\xB9" /* U+03B9 */ +#define COMBINING_GRAVE_ACCENT_UTF8 "\xCC\x80" /* U+0300 */ +#define COMBINING_ACUTE_ACCENT_UTF8 "\xCC\x81" /* U+0301 */ +#define COMBINING_DIAERESIS_UTF8 "\xCC\x88" /* U+0308 */ -#define GREEK_SMALL_LETTER_UPSILON_UTF8 "\xCF\x85" /* U+03C5 */ +#define GREEK_SMALL_LETTER_IOTA_UTF8 "\xCE\xB9" /* U+03B9 */ -#define HYPHEN_UTF8 "\xE2\x80\x90" /* U+2010 */ -#define FIRST_SURROGATE_UTF8_FIRST_BYTE 0xED /* U+D800 */ +#define GREEK_SMALL_LETTER_UPSILON_UTF8 "\xCF\x85" /* U+03C5 */ -#define DELETE_NATIVE 0x007F /* U+007F */ -#define LATIN_SMALL_LETTER_SHARP_S_NATIVE 0x00DF /* U+00DF */ -#define LATIN_SMALL_LETTER_A_WITH_RING_ABOVE_NATIVE 0x00E5 /* U+00E5 */ -#define LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE_NATIVE 0x00C5 /* U+00C5 */ -#define LATIN_SMALL_LETTER_Y_WITH_DIAERESIS_NATIVE 0x00FF /* U+00FF */ -#define MICRO_SIGN_NATIVE 0x00B5 /* U+00B5 */ -#define NEXT_LINE_NATIVE 0x0085 /* U+0085 */ +#define HYPHEN_UTF8 "\xE2\x80\x90" /* U+2010 */ +#define FIRST_SURROGATE_UTF8_FIRST_BYTE 0xED /* U+D800 */ + +#define DEL_NATIVE 0x7F /* U+007F */ +#define LATIN_SMALL_LETTER_SHARP_S_NATIVE 0xDF /* U+00DF */ +#define LATIN_SMALL_LETTER_A_WITH_RING_ABOVE_NATIVE 0xE5 /* U+00E5 */ +#define LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE_NATIVE 0xC5 /* U+00C5 */ +#define LATIN_SMALL_LETTER_Y_WITH_DIAERESIS_NATIVE 0xFF /* U+00FF */ +#define MICRO_SIGN_NATIVE 0xB5 /* U+00B5 */ #endif /* H_UNICODE_CONSTANTS */ diff --git a/x2p/a2py.c b/x2p/a2py.c index ca5958b..aec2a0e 100644 --- a/x2p/a2py.c +++ b/x2p/a2py.c @@ -18,7 +18,7 @@ #endif #include "util.h" #include "../unicode_constants.h" -#define DELETE_CHAR DELETE_NATIVE +#define DELETE_CHAR DEL_NATIVE const char *filename; const char *myname; -- 
Perl5 Master Repository
