In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/ef3a1d8a913a6ffccc2517a79a516651aa599054?hp=088bd7a611dad28d5343d929e157e423aed4f5dc>
- Log ----------------------------------------------------------------- commit ef3a1d8a913a6ffccc2517a79a516651aa599054 Merge: 088bd7a efb8961 Author: Ricardo Signes <[email protected]> Date: Sun Dec 21 18:52:15 2014 -0500 Merge branch 'perlunicook' into blead commit efb8961587a30e930d4581a96246b02e20d2b1f4 Author: Ricardo Signes <[email protected]> Date: Sun Dec 21 18:48:22 2014 -0500 perlunicook: add trusted-to-exist links for perlunicook M t/porting/known_pod_issues.dat commit 32a9c674a9cef14c4bd3666bff64da2f2ff2b9a0 Author: Ricardo Signes <[email protected]> Date: Sun Dec 21 18:41:35 2014 -0500 perlunicook: remove empty list item M pod/perlunicook.pod commit 5dd47fd476d31d98fcd75101d115aca2ddcdbe49 Author: Ricardo Signes <[email protected]> Date: Sun Dec 21 18:37:06 2014 -0500 perlunicook: add it to build files and TOC M plan9/mkfile M pod/perl.pod M win32/pod.mak commit d4d0a10ef54ca990c3e90457f90f03ea4bc9e10f Author: Ricardo Signes <[email protected]> Date: Sun Dec 21 18:35:10 2014 -0500 perlunicook: add perlunicook to MANIFEST M MANIFEST commit ddeccf1ffc8f43b11b4f333d8387265c4b449b3c Author: Rafael Garcia-Suarez <[email protected]> Date: Thu Mar 8 19:31:47 2012 +0100 Fixes for a few trivial typos M pod/perlunicook.pod commit 2561daa455347e2b981e853cf24c33f12c57c58b Author: Ricardo Signes <[email protected]> Date: Tue Mar 6 10:32:36 2012 -0500 Tom Christiansen's Perl Unicode Cookbook 1st draft originally posted to perl5-porters as message-id 18479.1330388058@chthon A pod/perlunicook.pod ----------------------------------------------------------------------- Summary of changes: MANIFEST | 1 + plan9/mkfile | 2 +- pod/perl.pod | 1 + pod/perlunicook.pod | 856 +++++++++++++++++++++++++++++++++++++++++ t/porting/known_pod_issues.dat | 6 + win32/pod.mak | 4 + 6 files changed, 869 insertions(+), 1 deletion(-) create mode 100644 pod/perlunicook.pod diff --git a/MANIFEST b/MANIFEST index b8c151e..e781d3a 100644 --- a/MANIFEST +++ b/MANIFEST @@ -4698,6 +4698,7 @@ 
pod/perltooc.pod pod/perltoot.pod pod/perltrap.pod Perl traps for the unwary pod/perlunicode.pod Perl Unicode support +pod/perlunicook.pod Perl Unicode cookbook pod/perlunifaq.pod Perl Unicode FAQ pod/perluniintro.pod Perl Unicode introduction pod/perlunitut.pod Perl Unicode tutorial diff --git a/plan9/mkfile b/plan9/mkfile index 28d290b..cf2153e 100644 --- a/plan9/mkfile +++ b/plan9/mkfile @@ -22,7 +22,7 @@ installman3dir = /sys/man/2 podnames = perl perlbook perldata perldebtut perldiag perldsc perlform perlfunc perlipc perllexwarn perllol perlmod perlmodlib perlmodinstall perlnewmod perlop perlootut perlopentut perlpacktut perlp ... [164 chars truncated] faqpodnames = perlfaq perlfaq1 perlfaq2 perlfaq3 perlfaq4 perlfaq5 perlfaq6 perlfaq7 perlfaq8 perlfaq9 -advpodnames = perlapi perlapio perlcall perlclib perlcompile perldebguts perldbmfilter perldebug perldelta perldiag perlebcdic perlembed perlfilter perlfork perlguts perlhack perlintern perliol perll ... [99 chars truncated] +advpodnames = perlapi perlapio perlcall perlclib perlcompile perldebguts perldbmfilter perldebug perldelta perldiag perlebcdic perlembed perlfilter perlfork perlguts perlhack perlintern perliol perll ... [111 chars truncated] archpodnames = perlaix perlamiga perlbeos perlbs2000 perlce perlcygwin perldgux perldos perlfreebsd perlhpux perlhurd perlirix perlmacos perlmpeix perlnetware perlos2 perlos390 perlos400 perlplan9 pe ... 
[53 chars truncated] histpods = perl5004delta perl5005delta perl561delta perl56delta perl570delta perl571delta perl572delta perl573delta perl58delta perlhist diff --git a/pod/perl.pod b/pod/perl.pod index 1ad467c..64a58d4 100644 --- a/pod/perl.pod +++ b/pod/perl.pod @@ -121,6 +121,7 @@ aux a2p c2ph h2ph h2xs perlbug pl2pm pod2html pod2man s2p splain xsubpp perllocale Perl locale support perluniintro Perl Unicode introduction perlunicode Perl Unicode support + perlunicook Perl Unicode cookbook perlunifaq Perl Unicode FAQ perluniprops Index of Unicode properties in Perl perlunitut Perl Unicode tutorial diff --git a/pod/perlunicook.pod b/pod/perlunicook.pod new file mode 100644 index 0000000..5192e54 --- /dev/null +++ b/pod/perlunicook.pod @@ -0,0 +1,856 @@ + +=encoding utf8 + +=head1 NAME + +perlunicook - cookbookish examples of handling Unicode in Perl + +=head1 DESCRIPTION + +This manpage contains short recipes demonstrating how to handle common Unicode +operations in Perl, plus one complete program at the end. Any undeclared +variables in individual recipes are assumed to have a previous appropriate +value in them. + +=head1 EXAMPLES + +=head2 ℞ 0: Standard preamble + +Unless otherwise noted, all examples below require this standard preamble +to work correctly, with the C<#!> adjusted to work on your system: + + #!/usr/bin/env perl + + use utf8; # so literals and identifiers can be in UTF-8 + use v5.12; # or later to get "unicode_strings" feature + use strict; # quote strings, declare variables + use warnings; # on by default + use warnings qw(FATAL utf8); # fatalize encoding glitches + use open qw(:std :utf8); # undeclared streams in UTF-8 + use charnames qw(:full :short); # unneeded in v5.16 + +This I<does> make even Unix programmers C<binmode> your binary streams, +or open them with C<:raw>, but that's the only way to get at them +portably anyway. + +B<WARNING>: C<use autodie> and C<use open> do not get along with each other. 
+ +=head2 â 1: Generic Unicode-savvy filter + +Always decompose on the way in, then recompose on the way out. + + use Unicode::Normalize; + + while (<>) { + $_ = NFD($_); # decompose + reorder canonically + ... + } continue { + print NFC($_); # recompose (where possible) + reorder canonically + } + +=head2 â 2: Fine-tuning Unicode warnings + +As of v5.14, Perl distinguishes three subclasses of UTFâ8 warnings. + + use v5.14; # subwarnings unavailable any earlier + no warnings "nonchar"; # the 66 forbidden non-characters + no warnings "surrogate"; # UTF-16/CESU-8 nonsense + no warnings "non_unicode"; # for codepoints over 0x10_FFFF + +=head2 â 3: Declare source in utf8 for identifiers and literals + +Without the all-critical C<use utf8> declaration, putting UTFâ8 in your +literals and identifiers wonât work right. If you used the standard +preamble just given above, this already happened. If you did, you can +do things like this: + + use utf8; + + my $measure = "à ngström"; + my @μsoft = qw( cp852 cp1251 cp1252 ); + my @á½ÏÎÏÎ¼ÎµÎ³Î±Ï = qw( á½ÏÎÏ Î¼ÎµÎ³Î±Ï ); + my @é¯ = qw( koi8-f koi8-u koi8-r ); + my $motto = "ðª ð ðª"; # FAMILY, GROWING HEART, DROMEDARY CAMEL + +If you forget C<use utf8>, high bytes will be misunderstood as +separate characters, and nothing will work right. + +=head2 â 4: Characters and their numbers + +The C<ord> and C<chr> functions work transparently on all codepoints, +not just on ASCII alone â nor in fact, not even just on Unicode alone. + + # ASCII characters + ord("A") + chr(65) + + # characters from the Basic Multilingual Plane + ord("Σ") + chr(0x3A3) + + # beyond the BMP + ord("ð") # MATHEMATICAL ITALIC SMALL N + chr(0x1D45B) + + # beyond Unicode! (up to MAXINT) + ord("\x{20_0000}") + chr(0x20_0000) + +=head2 â 5: Unicode literals by character number + +In an interpolated literal, whether a double-quoted string or a +regex, you may specify a character by its number using the +C<\x{I<HHHHHH>}> escape. 
+ + String: "\x{3a3}" + Regex: /\x{3a3}/ + + String: "\x{1d45b}" + Regex: /\x{1d45b}/ + + # even non-BMP ranges in regex work fine + /[\x{1D434}-\x{1D467}]/ + +=head2 â 6: Get character name by number + + use charnames (); + my $name = charnames::viacode(0x03A3); + +=head2 â 7: Get character number by name + + use charnames (); + my $number = charnames::vianame("GREEK CAPITAL LETTER SIGMA"); + +=head2 â 8: Unicode named characters + +Use the C<< \N{I<charname>} >> notation to get the character +by that name for use in interpolated literals (double-quoted +strings and regexes). In v5.16, there is an implicit + + use charnames qw(:full :short); + +But prior to v5.16, you must be explicit about which set of charnames you +want. The C<:full> names are the official Unicode character name, alias, or +sequence, which all share a namespace. + + use charnames qw(:full :short latin greek); + + "\N{MATHEMATICAL ITALIC SMALL N}" # :full + "\N{GREEK CAPITAL LETTER SIGMA}" # :full + +Anything else is a Perl-specific convenience abbreviation. Specify one or +more scripts by names if you want short names that are script-specific. + + "\N{Greek:Sigma}" # :short + "\N{ae}" # latin + "\N{epsilon}" # greek + +The v5.16 release also supports a C<:loose> import for loose matching of +character names, which works just like loose matching of property names: +that is, it disregards case, whitespace, and underscores: + + "\N{euro sign}" # :loose (from v5.16) + +=head2 â 9: Unicode named sequences + +These look just like character names but return multiple codepoints. +Notice the C<%vx> vector-print functionality in C<printf>. + + use charnames qw(:full); + my $seq = "\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}"; + printf "U+%v04X\n", $seq; + U+0100.0300 + +=head2 â 10: Custom named characters + +Use C<:alias> to give your own lexically scoped nicknames to existing +characters, or even to give unnamed private-use characters useful names. 
+ + use charnames ":full", ":alias" => { + ecute => "LATIN SMALL LETTER E WITH ACUTE", + "APPLE LOGO" => 0xF8FF, # private use character + }; + + "\N{ecute}" + "\N{APPLE LOGO}" + +=head2 â 11: Names of CJK codepoints + +Sinograms like âæ±äº¬â come back with character names of +C<CJK UNIFIED IDEOGRAPH-6771> and C<CJK UNIFIED IDEOGRAPH-4EAC>, +because their ânamesâ vary. The CPAN C<Unicode::Unihan> module +has a large database for decoding these (and a whole lot more), provided you +know how to understand its output. + + # cpan -i Unicode::Unihan + use Unicode::Unihan; + my $str = "æ±äº¬"; + my $unhan = new Unicode::Unihan; + for my $lang (qw(Mandarin Cantonese Korean JapaneseOn JapaneseKun)) { + printf "CJK $str in %-12s is ", $lang; + say $unhan->$lang($str); + } + +prints: + + CJK æ±äº¬ in Mandarin is DONG1JING1 + CJK æ±äº¬ in Cantonese is dung1ging1 + CJK æ±äº¬ in Korean is TONGKYENG + CJK æ±äº¬ in JapaneseOn is TOUKYOU KEI KIN + CJK æ±äº¬ in JapaneseKun is HIGASHI AZUMAMIYAKO + +If you have a specific romanization scheme in mind, +use the specific module: + + # cpan -i Lingua::JA::Romanize::Japanese + use Lingua::JA::Romanize::Japanese; + my $k2r = new Lingua::JA::Romanize::Japanese; + my $str = "æ±äº¬"; + say "Japanese for $str is ", $k2r->chars($str); + +prints + + Japanese for æ±äº¬ is toukyou + +=head2 â 12: Explicit encode/decode + +On rare occasion, such as a database read, you may be +given encoded text you need to decode. + + use Encode qw(encode decode); + + my $chars = decode("shiftjis", $bytes, 1); + # OR + my $bytes = encode("MIME-Header-ISO_2022_JP", $chars, 1); + +For streams all in the same encoding, don't use encode/decode; instead +set the file encoding when you open the file or immediately after with +C<binmode> as described later below. + +=head2 â 13: Decode program arguments as utf8 + + $ perl -CA ... 
+ or + $ export PERL_UNICODE=A + or + use Encode qw(decode_utf8); + @ARGV = map { decode_utf8($_, 1) } @ARGV; + +=head2 â 14: Decode program arguments as locale encoding + + # cpan -i Encode::Locale + use Encode qw(locale); + use Encode::Locale; + + # use "locale" as an arg to encode/decode + @ARGV = map { decode(locale => $_, 1) } @ARGV; + +=head2 â 15: Declare STD{IN,OUT,ERR} to be utf8 + +Use a command-line option, an environment variable, or else +call C<binmode> explicitly: + + $ perl -CS ... + or + $ export PERL_UNICODE=S + or + use open qw(:std :utf8); + or + binmode(STDIN, ":utf8"); + binmode(STDOUT, ":utf8"); + binmode(STDERR, ":utf8"); + +=head2 â 16: Declare STD{IN,OUT,ERR} to be in locale encoding + + # cpan -i Encode::Locale + use Encode; + use Encode::Locale; + + # or as a stream for binmode or open + binmode STDIN, ":encoding(console_in)" if -t STDIN; + binmode STDOUT, ":encoding(console_out)" if -t STDOUT; + binmode STDERR, ":encoding(console_out)" if -t STDERR; + +=head2 â 17: Make file I/O default to utf8 + +Files opened without an encoding argument will be in UTF-8: + + $ perl -CD ... + or + $ export PERL_UNICODE=D + or + use open qw(:utf8); + +=head2 â 18: Make all I/O and args default to utf8 + + $ perl -CSDA ... + or + $ export PERL_UNICODE=SDA + or + use open qw(:std :utf8); + use Encode qw(decode_utf8); + @ARGV = map { decode_utf8($_, 1) } @ARGV; + +=head2 â 19: Open file with specific encoding + +Specify stream encoding. This is the normal way +to deal with encoded text, not by calling low-level +functions. 
+ + # input file + open(my $in_file, "< :encoding(UTF-16)", "wintext"); + OR + open(my $in_file, "<", "wintext"); + binmode($in_file, ":encoding(UTF-16)"); + THEN + my $line = <$in_file>; + + # output file + open(my $out_file, "> :encoding(cp1252)", "wintext"); + OR + open(my $out_file, ">", "wintext"); + binmode($out_file, ":encoding(cp1252)"); + THEN + print $out_file "some text\n"; + +More layers than just the encoding can be specified here. For example, +the incantation C<":raw :encoding(UTF-16LE) :crlf"> includes implicit +CRLF handling. + +=head2 ℞ 20: Unicode casing + +Unicode casing is very different from ASCII casing. + + uc("henry ⅷ") # "HENRY Ⅷ" + uc("tschüß") # "TSCHÜSS" notice ß => SS + + # both are true: + "tschüß" =~ /TSCHÜSS/i # notice ß => SS + "Σίσυφος" =~ /ΣΊΣΥΦΟΣ/i # notice Σ,σ,ς sameness + +=head2 ℞ 21: Unicode case-insensitive comparisons + +Also available in the CPAN L<Unicode::CaseFold> module, +the new C<fc> “foldcase” function from v5.16 grants +access to the same Unicode casefolding as the C</i> +pattern modifier has always used: + + use feature "fc"; # fc() function is from v5.16 + + # sort case-insensitively + my @sorted = sort { fc($a) cmp fc($b) } @list; + + # both are true: + fc("tschüß") eq fc("TSCHÜSS") + fc("Σίσυφος") eq fc("ΣΊΣΥΦΟΣ") + +=head2 ℞ 22: Match Unicode linebreak sequence in regex + +A Unicode linebreak matches the two-character CRLF +grapheme or any of seven vertical whitespace characters. +Good for dealing with textfiles coming from different +operating systems. + + \R + + s/\R/\n/g; # normalize all linebreaks to \n + +=head2 ℞ 23: Get character category + +Find the general category of a numeric codepoint. + + use Unicode::UCD qw(charinfo); + my $cat = charinfo(0x3A3)->{category}; # "Lu" + +=head2 ℞ 24: Disabling Unicode-awareness in builtin charclasses + +Disable C<\w>, C<\b>, C<\s>, C<\d>, and the POSIX +classes from working correctly on Unicode either in this +scope, or in just one regex. 
+ + use v5.14; + use re "/a"; + + # OR + + my($num) = $str =~ /(\d+)/a; + +Or use specific un-Unicode properties, like C<\p{ahex}> +and C<\p{POSIX_Digit}>. Properties still work normally +no matter what charset modifiers (C</d /u /l /a /aa>) +should be in effect. + +=head2 ℞ 25: Match Unicode properties in regex with \p, \P + +These all match a single codepoint with the given +property. Use C<\P> in place of C<\p> to match +one codepoint lacking that property. + + \pL, \pN, \pS, \pP, \pM, \pZ, \pC + \p{Sk}, \p{Ps}, \p{Lt} + \p{alpha}, \p{upper}, \p{lower} + \p{Latin}, \p{Greek} + \p{script=Latin}, \p{script=Greek} + \p{East_Asian_Width=Wide}, \p{EA=W} + \p{Line_Break=Hyphen}, \p{LB=HY} + \p{Numeric_Value=4}, \p{NV=4} + +=head2 ℞ 26: Custom character properties + +Define at compile-time your own custom character +properties for use in regexes. + + # using private-use characters + sub In_Tengwar { "E000\tE07F\n" } + + if (/\p{In_Tengwar}/) { ... } + + # blending existing properties + sub Is_GraecoRoman_Title {<<'END_OF_SET'} + +utf8::IsLatin + +utf8::IsGreek + &utf8::IsTitle + END_OF_SET + + if (/\p{Is_GraecoRoman_Title}/) { ... } + +=head2 ℞ 27: Unicode normalization + +Typically render into NFD on input and NFC on output. Using NFKC or NFKD +functions improves recall on searches, assuming you've already done so to the +same text to be searched. Note that this is about much more than just pre- +combined compatibility glyphs; it also reorders marks according to their +canonical combining classes and weeds out singletons. + + use Unicode::Normalize; + my $nfd = NFD($orig); + my $nfc = NFC($orig); + my $nfkd = NFKD($orig); + my $nfkc = NFKC($orig); + +=head2 ℞ 28: Convert non-ASCII Unicode numerics + +Unless you’ve used C</a> or C</aa>, C<\d> matches more than +ASCII digits only, but Perl’s implicit string-to-number +conversion does not currently recognize these. Here’s how to +convert such strings manually. 
+ + use v5.14; # needed for num() function + use Unicode::UCD qw(num); + my $str = "got Ⅻ and ४५६७ and ⅞ and here"; + my @nums = (); + while ($str =~ /(\d+|\N)/g) { # not just ASCII! + push @nums, num($1); + } + say "@nums"; # 12 4567 0.875 + + use charnames qw(:full); + my $nv = num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}"); + +=head2 ℞ 29: Match Unicode grapheme cluster in regex + +Programmer-visible “characters” are codepoints matched by C</./s>, +but user-visible “characters” are graphemes matched by C</\X/>. + + # Find vowel *plus* any combining diacritics,underlining,etc. + my $nfd = NFD($orig); + $nfd =~ / (?=[aeiou]) \X /xi + +=head2 ℞ 30: Extract by grapheme instead of by codepoint (regex) + + # match and grab five first graphemes + my($first_five) = $str =~ /^ ( \X{5} ) /x; + +=head2 ℞ 31: Extract by grapheme instead of by codepoint (substr) + + # cpan -i Unicode::GCString + use Unicode::GCString; + my $gcs = Unicode::GCString->new($str); + my $first_five = $gcs->substr(0, 5); + +=head2 ℞ 32: Reverse string by grapheme + +Reversing by codepoint messes up diacritics, mistakenly converting +C<crème brûlée> into C<éel̂urb em̀erc> instead of into C<eélûrb emèrc>; +so reverse by grapheme instead. Both these approaches work +right no matter what normalization the string is in: + + $str = join("", reverse $str =~ /\X/g); + + # OR: cpan -i Unicode::GCString + use Unicode::GCString; + $str = reverse Unicode::GCString->new($str); + +=head2 ℞ 33: String length in graphemes + +The string C<brûlée> has six graphemes but up to eight codepoints. +This counts by grapheme, not by codepoint: + + my $str = "brûlée"; + my $count = 0; + while ($str =~ /\X/g) { $count++ } + + # OR: cpan -i Unicode::GCString + use Unicode::GCString; + my $gcs = Unicode::GCString->new($str); + my $count = $gcs->length; + +=head2 ℞ 34: Unicode column-width for printing + +Perl’s C<printf>, C<sprintf>, and C<format> think all +codepoints take up 1 print column, but many take 0 or 2. 
+Here to show that normalization makes no difference, +we print out both forms: + + use Unicode::GCString; + use Unicode::Normalize; + + my @words = qw/crème brûlée/; + @words = map { NFC($_), NFD($_) } @words; + + for my $str (@words) { + my $gcs = Unicode::GCString->new($str); + my $cols = $gcs->columns; + my $pad = " " x (10 - $cols); + say $str, $pad, " |"; + } + +generates this to show that it pads correctly no matter +the normalization: + + crème | + crème | + brûlée | + brûlée | + +=head2 ℞ 35: Unicode collation + +Text sorted by numeric codepoint follows no reasonable alphabetic order; +use the UCA for sorting text. + + use Unicode::Collate; + my $col = Unicode::Collate->new(); + my @list = $col->sort(@old_list); + +See the I<ucsort> program from the L<Unicode::Tussle> CPAN module +for a convenient command-line interface to this module. + +=head2 ℞ 36: Case- I<and> accent-insensitive Unicode sort + +Specify a collation strength of level 1 to ignore case and +diacritics, only looking at the basic character. + + use Unicode::Collate; + my $col = Unicode::Collate->new(level => 1); + my @list = $col->sort(@old_list); + +=head2 ℞ 37: Unicode locale collation + +Some locales have special sorting rules. + + # either use v5.12, OR: cpan -i Unicode::Collate::Locale + use Unicode::Collate::Locale; + my $col = Unicode::Collate::Locale->new(locale => "de__phonebook"); + my @list = $col->sort(@old_list); + +The I<ucsort> program mentioned above accepts a C<--locale> parameter. 
+ +=head2 â 38: Making C<cmp> work on text instead of codepoints + +Instead of this: + + @srecs = sort { + $b->{AGE} <=> $a->{AGE} + || + $a->{NAME} cmp $b->{NAME} + } @recs; + +Use this: + + my $coll = Unicode::Collate->new(); + for my $rec (@recs) { + $rec->{NAME_key} = $coll->getSortKey( $rec->{NAME} ); + } + @srecs = sort { + $b->{AGE} <=> $a->{AGE} + || + $a->{NAME_key} cmp $b->{NAME_key} + } @recs; + +=head2 â 39: Case- I<and> accent-insensitive comparisons + +Use a collator object to compare Unicode text by character +instead of by codepoint. + + use Unicode::Collate; + my $es = Unicode::Collate->new( + level => 1, + normalization => undef + ); + + # now both are true: + $es->eq("GarcÃa", "GARCIA" ); + $es->eq("Márquez", "MARQUEZ"); + +=head2 â 40: Case- I<and> accent-insensitive locale comparisons + +Same, but in a specific locale. + + my $de = Unicode::Collate::Locale->new( + locale => "de__phonebook", + ); + + # now this is true: + $de->eq("tschüÃ", "TSCHUESS"); # notice ü => UE, à => SS + +=head2 â 41: Unicode linebreaking + +Break up text into lines according to Unicode rules. + + # cpan -i Unicode::LineBreak + use Unicode::LineBreak; + use charnames qw(:full); + + my $para = "This is a super\N{HYPHEN}long string. " x 20; + my $fmt = new Unicode::LineBreak; + print $fmt->break($para), "\n"; + +=head2 â 42: Unicode text in DBM hashes, the tedious way + +Using a regular Perl string as a key or value for a DBM +hash will trigger a wide character exception if any codepoints +wonât fit into a byte. 
Here’s how to manually manage the translation: + + use DB_File; + use Encode qw(encode decode); + tie %dbhash, "DB_File", "pathname"; + + # STORE + + # assume $uni_key and $uni_value are abstract Unicode strings + my $enc_key = encode("UTF-8", $uni_key, 1); + my $enc_value = encode("UTF-8", $uni_value, 1); + $dbhash{$enc_key} = $enc_value; + + # FETCH + + # assume $uni_key holds a normal Perl string (abstract Unicode) + my $enc_key = encode("UTF-8", $uni_key, 1); + my $enc_value = $dbhash{$enc_key}; + my $uni_value = decode("UTF-8", $enc_value, 1); + +=head2 ℞ 43: Unicode text in DBM hashes, the easy way + +Here’s how to implicitly manage the translation; all encoding +and decoding is done automatically, just as with streams that +have a particular encoding attached to them: + + use DB_File; + use DBM_Filter; + + my $dbobj = tie %dbhash, "DB_File", "pathname"; + $dbobj->Filter_Value("utf8"); # this is the magic bit + + # STORE + + # assume $uni_key and $uni_value are abstract Unicode strings + $dbhash{$uni_key} = $uni_value; + + # FETCH + + # $uni_key holds a normal Perl string (abstract Unicode) + my $uni_value = $dbhash{$uni_key}; + +=head2 ℞ 44: PROGRAM: Demo of Unicode collation and printing + +Here’s a full program showing how to make use of locale-sensitive +sorting, Unicode casing, and managing print widths when some of the +characters take up zero or two columns, not just one column each time. +When run, the following program produces this nicely aligned output: + + Crème Brûlée....... €2.00 + Éclair............. €1.60 + Fideuà............. €4.20 + Hamburger.......... €6.00 + Jamón Serrano...... €4.45 + Linguiça........... €7.00 + Pâté............... €4.15 + Pears.............. €2.00 + Pêches............. €2.25 + Smørbrød........... €5.75 + Spätzle............ €5.50 + Xoriço............. €3.00 + Γύρος.............. €6.50 + 막걸리............. €4.00 + おもち............. €2.65 + お好み焼き......... €8.00 + シュークリーム..... 
â¬1.85 + 寿å¸............... â¬9.99 + å å............... â¬7.50 + +Here's that program; tested on v5.14. + + #!/usr/bin/env perl + # umenu - demo sorting and printing of Unicode food + # + # (obligatory and increasingly long preamble) + # + use utf8; + use v5.14; # for locale sorting + use strict; + use warnings; + use warnings qw(FATAL utf8); # fatalize encoding faults + use open qw(:std :utf8); # undeclared streams in UTF-8 + use charnames qw(:full :short); # unneeded in v5.16 + + # std modules + use Unicode::Normalize; # std perl distro as of v5.8 + use List::Util qw(max); # std perl distro as of v5.10 + use Unicode::Collate::Locale; # std perl distro as of v5.14 + + # cpan modules + use Unicode::GCString; # from CPAN + + # forward defs + sub pad($$$); + sub colwidth(_); + sub entitle(_); + + my %price = ( + "γÏÏοÏ" => 6.50, # gyros + "pears" => 2.00, # like um, pears + "linguiça" => 7.00, # spicy sausage, Portuguese + "xoriço" => 3.00, # chorizo sausage, Catalan + "hamburger" => 6.00, # burgermeister meisterburger + "éclair" => 1.60, # dessert, French + "smørbrød" => 5.75, # sandwiches, Norwegian + "spätzle" => 5.50, # Bayerisch noodles, little sparrows + "å å" => 7.50, # bao1 zi5, steamed pork buns, Mandarin + "jamón serrano" => 4.45, # country ham, Spanish + "pêches" => 2.25, # peaches, French + "ã·ã¥ã¼ã¯ãªã¼ã " => 1.85, # cream-filled pastry like eclair + "ë§ê±¸ë¦¬" => 4.00, # makgeolli, Korean rice wine + "寿å¸" => 9.99, # sushi, Japanese + "ããã¡" => 2.65, # omochi, rice cakes, Japanese + "crème brûlée" => 2.00, # crema catalana + "fideuà " => 4.20, # more noodles, Valencian (Catalan=fideuada) + "pâté" => 4.15, # gooseliver paste, French + "ã好ã¿ç¼ã" => 8.00, # okonomiyaki, Japanese + ); + + my $width = 5 + max map { colwidth } keys %price; + + # So the Asian stuff comes out in an order that someone + # who reads those scripts won't freak out over; the + # CJK stuff will be in JIS X 0208 order that way. 
+ my $coll = new Unicode::Collate::Locale locale => "ja"; + + for my $item ($coll->sort(keys %price)) { + print pad(entitle($item), $width, "."); + printf " â¬%.2f\n", $price{$item}; + } + + sub pad($$$) { + my($str, $width, $padchar) = @_; + return $str . ($padchar x ($width - colwidth($str))); + } + + sub colwidth(_) { + my($str) = @_; + return Unicode::GCString->new($str)->columns; + } + + sub entitle(_) { + my($str) = @_; + $str =~ s{ (?=\pL)(\S) (\S*) } + { ucfirst($1) . lc($2) }xge; + return $str; + } + +=head1 SEE ALSO + +See these manpages, some of which are CPAN modules: +L<perlunicode>, L<perluniprops>, +L<perlre>, L<perlrecharclass>, +L<perluniintro>, L<perlunitut>, L<perlunifaq>, +L<PerlIO>, L<DB_File>, L<DBM_Filter>, L<DBM_Filter::utf8>, +L<Encode>, L<Encode::Locale>, +L<Unicode::UCD>, +L<Unicode::Normalize>, +L<Unicode::GCString>, L<Unicode::LineBreak>, +L<Unicode::Collate>, L<Unicode::Collate::Locale>, +L<Unicode::Unihan>, +L<Unicode::CaseFold>, +L<Unicode::Tussle>, +L<Lingua::JA::Romanize::Japanese>, +L<Lingua::ZH::Romanize::Pinyin>, +L<Lingua::KO::Romanize::Hangul>. + +The L<Unicode::Tussle> CPAN module includes many programs +to help with working with Unicode, including +these programs to fully or partly replace standard utilities: +I<tcgrep> instead of I<egrep>, +I<uniquote> instead of I<cat -v> or I<hexdump>, +I<uniwc> instead of I<wc>, +I<unilook> instead of I<look>, +I<unifmt> instead of I<fmt>, +and +I<ucsort> instead of I<sort>. +For exploring Unicode character names and character properties, +see its I<uniprops>, I<unichars>, and I<uninames> programs. +It also supplies these programs, all of which are general filters that do Unicode-y things: +I<unititle> and I<unicaps>; +I<uniwide> and I<uninarrow>; +I<unisupers> and I<unisubs>; +I<nfd>, I<nfc>, I<nfkd>, and I<nfkc>; +and I<uc>, I<lc>, and I<tc>. 
+ +Finally, see the published Unicode Standard (page numbers are from version +6.0.0), including these specific annexes and technical reports: + +=over + +=item §3.13 Default Case Algorithms, page 113; +§4.2 Case, pages 120–122; +Case Mappings, pages 166–172, especially Caseless Matching starting on page 170. + +=item UAX #44: Unicode Character Database + +=item UTS #18: Unicode Regular Expressions + +=item UAX #15: Unicode Normalization Forms + +=item UTS #10: Unicode Collation Algorithm + +=item UAX #29: Unicode Text Segmentation + +=item UAX #14: Unicode Line Breaking Algorithm + +=item UAX #11: East Asian Width + +=back + +=head1 AUTHOR + +Tom Christiansen E<lt>[email protected]E<gt> wrote this, with occasional +kibitzing from Larry Wall and Jeffrey Friedl in the background. + +=head1 COPYRIGHT AND LICENCE + +Copyright © 2012 Tom Christiansen. + +This program is free software; you may redistribute it and/or modify it +under the same terms as Perl itself. + +Most of these examples are taken from the current edition of the “Camel Book”; +that is, from the 4ᵗʰ Edition of I<Programming Perl>, Copyright © 2012 Tom +Christiansen I<et al.>, 2012-02-13 by O’Reilly Media. The code itself is +freely redistributable, and you are encouraged to transplant, fold, +spindle, and mutilate any of the examples in this manpage however you please +for inclusion into your own programs without any encumbrance whatsoever. +Acknowledgement via code comment is polite but not required. 
+ +=head1 REVISION HISTORY + +v1.0.0 â first public release, 2012-02-27 + diff --git a/t/porting/known_pod_issues.dat b/t/porting/known_pod_issues.dat index 688d033..e1de02b 100644 --- a/t/porting/known_pod_issues.dat +++ b/t/porting/known_pod_issues.dat @@ -94,7 +94,10 @@ kill(3) langinfo(3) LaTeX::Encode Lexical::Var +Lingua::JA::Romanize::Japanese Lingua::KO::Hangul::Util +Lingua::KO::Romanize::Hangul +Lingua::ZH::Romanize::Pinyin List::Gather local::lib Log::Message @@ -205,10 +208,13 @@ Time::Object Tk Tk::Pod tty(1) +Unicode::CaseFold Unicode::Casing +Unicode::GCString Unicode::LineBreak Unicode::Regex::Set Unicode::Semantics +Unicode::Tussle Unicode::Unihan unzip(1) Version::Requirements diff --git a/win32/pod.mak b/win32/pod.mak index c053b37..16980d4 100644 --- a/win32/pod.mak +++ b/win32/pod.mak @@ -144,6 +144,7 @@ POD = perl.pod \ perltoot.pod \ perltrap.pod \ perlunicode.pod \ + perlunicook.pod \ perlunifaq.pod \ perluniintro.pod \ perluniprops.pod \ @@ -284,6 +285,7 @@ MAN = perl.man \ perltoot.man \ perltrap.man \ perlunicode.man \ + perlunicook.man \ perlunifaq.man \ perluniintro.man \ perluniprops.man \ @@ -423,6 +425,7 @@ HTML = perl.html \ perltoot.html \ perltrap.html \ perlunicode.html \ + perlunicook.html \ perlunifaq.html \ perluniintro.html \ perluniprops.html \ @@ -564,6 +567,7 @@ TEX = perl.tex \ perltoot.tex \ perltrap.tex \ perlunicode.tex \ + perlunicook.tex \ perlunifaq.tex \ perluniintro.tex \ perluniprops.tex \ -- Perl5 Master Repository
