[perl.git] branch blead, updated. v5.21.7-44-gef3a1d8

Ricardo Signes Sun, 21 Dec 2014 15:53:14 -0800

In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/ef3a1d8a913a6ffccc2517a79a516651aa599054?hp=088bd7a611dad28d5343d929e157e423aed4f5dc>


- Log -----------------------------------------------------------------
commit ef3a1d8a913a6ffccc2517a79a516651aa599054
Merge: 088bd7a efb8961
Author: Ricardo Signes <[email protected]>
Date:   Sun Dec 21 18:52:15 2014 -0500

    Merge branch 'perlunicook' into blead

commit efb8961587a30e930d4581a96246b02e20d2b1f4
Author: Ricardo Signes <[email protected]>
Date:   Sun Dec 21 18:48:22 2014 -0500

    perlunicook: add trusted-to-exist links for perlunicook

M       t/porting/known_pod_issues.dat

commit 32a9c674a9cef14c4bd3666bff64da2f2ff2b9a0
Author: Ricardo Signes <[email protected]>
Date:   Sun Dec 21 18:41:35 2014 -0500

    perlunicook: remove empty list item

M       pod/perlunicook.pod

commit 5dd47fd476d31d98fcd75101d115aca2ddcdbe49
Author: Ricardo Signes <[email protected]>
Date:   Sun Dec 21 18:37:06 2014 -0500

    perlunicook: add it to build files and TOC

M       plan9/mkfile
M       pod/perl.pod
M       win32/pod.mak

commit d4d0a10ef54ca990c3e90457f90f03ea4bc9e10f
Author: Ricardo Signes <[email protected]>
Date:   Sun Dec 21 18:35:10 2014 -0500

    perlunicook: add perlunicook to MANIFEST

M       MANIFEST

commit ddeccf1ffc8f43b11b4f333d8387265c4b449b3c
Author: Rafael Garcia-Suarez <[email protected]>
Date:   Thu Mar 8 19:31:47 2012 +0100

    Fixes for a few trivial typos

M       pod/perlunicook.pod

commit 2561daa455347e2b981e853cf24c33f12c57c58b
Author: Ricardo Signes <[email protected]>
Date:   Tue Mar 6 10:32:36 2012 -0500

    Tom Christiansen's Perl Unicode Cookbook 1st draft
    
    originally posted to perl5-porters as message-id
    18479.1330388058@chthon

A       pod/perlunicook.pod
-----------------------------------------------------------------------

Summary of changes:
 MANIFEST                       |   1 +
 plan9/mkfile                   |   2 +-
 pod/perl.pod                   |   1 +
 pod/perlunicook.pod            | 856 +++++++++++++++++++++++++++++++++++++++++
 t/porting/known_pod_issues.dat |   6 +
 win32/pod.mak                  |   4 +
 6 files changed, 869 insertions(+), 1 deletion(-)
 create mode 100644 pod/perlunicook.pod

diff --git a/MANIFEST b/MANIFEST
index b8c151e..e781d3a 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -4698,6 +4698,7 @@ pod/perltooc.pod
 pod/perltoot.pod               
 pod/perltrap.pod               Perl traps for the unwary
 pod/perlunicode.pod            Perl Unicode support
+pod/perlunicook.pod            Perl Unicode cookbook
 pod/perlunifaq.pod             Perl Unicode FAQ
 pod/perluniintro.pod           Perl Unicode introduction
 pod/perlunitut.pod             Perl Unicode tutorial
diff --git a/plan9/mkfile b/plan9/mkfile
index 28d290b..cf2153e 100644
--- a/plan9/mkfile
+++ b/plan9/mkfile
@@ -22,7 +22,7 @@ installman3dir = /sys/man/2
 
 podnames = perl perlbook perldata perldebtut perldiag perldsc perlform 
perlfunc perlipc perllexwarn perllol perlmod perlmodlib perlmodinstall 
perlnewmod perlop perlootut perlopentut perlpacktut perlp ... [164 chars 
truncated]
 faqpodnames = perlfaq perlfaq1 perlfaq2 perlfaq3 perlfaq4 perlfaq5 perlfaq6 
perlfaq7 perlfaq8 perlfaq9
-advpodnames = perlapi perlapio perlcall perlclib perlcompile perldebguts 
perldbmfilter perldebug perldelta perldiag perlebcdic perlembed perlfilter 
perlfork perlguts perlhack perlintern perliol perll ... [99 chars truncated]
+advpodnames = perlapi perlapio perlcall perlclib perlcompile perldebguts 
perldbmfilter perldebug perldelta perldiag perlebcdic perlembed perlfilter 
perlfork perlguts perlhack perlintern perliol perll ... [111 chars truncated]
 archpodnames = perlaix perlamiga perlbeos perlbs2000 perlce perlcygwin 
perldgux perldos perlfreebsd perlhpux perlhurd perlirix perlmacos perlmpeix 
perlnetware perlos2 perlos390 perlos400 perlplan9 pe ... [53 chars truncated]
 histpods = perl5004delta perl5005delta perl561delta perl56delta perl570delta 
perl571delta perl572delta perl573delta perl58delta perlhist
 
diff --git a/pod/perl.pod b/pod/perl.pod
index 1ad467c..64a58d4 100644
--- a/pod/perl.pod
+++ b/pod/perl.pod
@@ -121,6 +121,7 @@ aux a2p c2ph h2ph h2xs perlbug pl2pm pod2html pod2man s2p 
splain xsubpp
     perllocale         Perl locale support
     perluniintro       Perl Unicode introduction
     perlunicode        Perl Unicode support
+    perlunicook        Perl Unicode cookbook
     perlunifaq         Perl Unicode FAQ
     perluniprops       Index of Unicode properties in Perl
     perlunitut         Perl Unicode tutorial
diff --git a/pod/perlunicook.pod b/pod/perlunicook.pod
new file mode 100644
index 0000000..5192e54
--- /dev/null
+++ b/pod/perlunicook.pod
@@ -0,0 +1,856 @@
+
+=encoding utf8
+
+=head1 NAME
+
+perlunicook - cookbookish examples of handling Unicode in Perl
+
+=head1 DESCRIPTION
+
+This manpage contains short recipes demonstrating how to handle common Unicode
+operations in Perl, plus one complete program at the end. Any undeclared
+variables in individual recipes are assumed to have a previous appropriate
+value in them.
+
+=head1 EXAMPLES
+
+=head2 ℞ 0: Standard preamble
+
+Unless otherwise noted, all examples below require this standard preamble
+to work correctly, with the C<#!> adjusted to work on your system:
+
+ #!/usr/bin/env perl
+
+ use utf8;      # so literals and identifiers can be in UTF-8
+ use v5.12;     # or later to get "unicode_strings" feature
+ use strict;    # quote strings, declare variables
+ use warnings;  # on by default
+ use warnings  qw(FATAL utf8);    # fatalize encoding glitches
+ use open      qw(:std :utf8);    # undeclared streams in UTF-8
+ use charnames qw(:full :short);  # unneeded in v5.16
+
+This I<does> make even Unix programmers C<binmode> your binary streams,
+or open them with C<:raw>, but that's the only way to get at them
+portably anyway.
+
+B<WARNING>: C<use autodie> and C<use open> do not get along with each other.
+
+=head2 ℞ 1: Generic Unicode-savvy filter
+
+Always decompose on the way in, then recompose on the way out.
+
+ use Unicode::Normalize;
+
+ while (<>) {
+     $_ = NFD($_);   # decompose + reorder canonically
+     ...
+ } continue {
+     print NFC($_);  # recompose (where possible) + reorder canonically
+ }
+
+=head2 ℞ 2: Fine-tuning Unicode warnings
+
+As of v5.14, Perl distinguishes three subclasses of UTF‑8 warnings.
+
+ use v5.14;                  # subwarnings unavailable any earlier
+ no warnings "nonchar";      # the 66 forbidden non-characters
+ no warnings "surrogate";    # UTF-16/CESU-8 nonsense
+ no warnings "non_unicode";  # for codepoints over 0x10_FFFF
+
+=head2 ℞ 3: Declare source in utf8 for identifiers and literals
+
+Without the all-critical C<use utf8> declaration, putting UTF‑8 in your
+literals and identifiers won’t work right.  If you used the standard
+preamble just given above, this already happened.  If you did, you can
+do things like this:
+
+ use utf8;
+
+ my $measure   = "Ångström";
+ my @μsoft     = qw( cp852 cp1251 cp1252 );
+ my @ὑπέρμεγας = qw( ὑπέρ  μεγας );
+ my @鯉        = qw( koi8-f koi8-u koi8-r );
+ my $motto     = "👪 💗 🐪"; # FAMILY, GROWING HEART, DROMEDARY CAMEL
+
+If you forget C<use utf8>, high bytes will be misunderstood as
+separate characters, and nothing will work right.
+
+=head2 ℞ 4: Characters and their numbers
+
+The C<ord> and C<chr> functions work transparently on all codepoints,
+not just on ASCII alone — nor in fact, not even just on Unicode alone.
+
+ # ASCII characters
+ ord("A")
+ chr(65)
+
+ # characters from the Basic Multilingual Plane
+ ord("Σ")
+ chr(0x3A3)
+
+ # beyond the BMP
+ ord("𝑛")               # MATHEMATICAL ITALIC SMALL N
+ chr(0x1D45B)
+
+ # beyond Unicode! (up to MAXINT)
+ ord("\x{20_0000}")
+ chr(0x20_0000)
+
+=head2 ℞ 5: Unicode literals by character number
+
+In an interpolated literal, whether a double-quoted string or a
+regex, you may specify a character by its number using the
+C<\x{I<HHHHHH>}> escape.
+
+ String: "\x{3a3}"
+ Regex:  /\x{3a3}/
+
+ String: "\x{1d45b}"
+ Regex:  /\x{1d45b}/
+
+ # even non-BMP ranges in regex work fine
+ /[\x{1D434}-\x{1D467}]/
+
+=head2 ℞ 6: Get character name by number
+
+ use charnames ();
+ my $name = charnames::viacode(0x03A3);
+
+=head2 ℞ 7: Get character number by name
+
+ use charnames ();
+ my $number = charnames::vianame("GREEK CAPITAL LETTER SIGMA");
+
+=head2 ℞ 8: Unicode named characters
+
+Use the C<< \N{I<charname>} >> notation to get the character
+by that name for use in interpolated literals (double-quoted
+strings and regexes).  In v5.16, there is an implicit
+
+ use charnames qw(:full :short);
+
+But prior to v5.16, you must be explicit about which set of charnames you
+want.  The C<:full> names are the official Unicode character name, alias, or
+sequence, which all share a namespace.
+
+ use charnames qw(:full :short latin greek);
+
+ "\N{MATHEMATICAL ITALIC SMALL N}"      # :full
+ "\N{GREEK CAPITAL LETTER SIGMA}"       # :full
+
+Anything else is a Perl-specific convenience abbreviation.  Specify one or
+more scripts by names if you want short names that are script-specific.
+
+ "\N{Greek:Sigma}"                      # :short
+ "\N{ae}"                               #  latin
+ "\N{epsilon}"                          #  greek
+
+The v5.16 release also supports a C<:loose> import for loose matching of
+character names, which works just like loose matching of property names:
+that is, it disregards case, whitespace, and underscores:
+
+ "\N{euro sign}"                        # :loose (from v5.16)
+
+=head2 ℞ 9: Unicode named sequences
+
+These look just like character names but return multiple codepoints.
+Notice the C<%vx> vector-print functionality in C<printf>.
+
+ use charnames qw(:full);
+ my $seq = "\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}";
+ printf "U+%v04X\n", $seq;
+ U+0100.0300
+
+=head2 ℞ 10: Custom named characters
+
+Use C<:alias> to give your own lexically scoped nicknames to existing
+characters, or even to give unnamed private-use characters useful names.
+
+ use charnames ":full", ":alias" => {
+     ecute => "LATIN SMALL LETTER E WITH ACUTE",
+     "APPLE LOGO" => 0xF8FF, # private use character
+ };
+
+ "\N{ecute}"
+ "\N{APPLE LOGO}"
+
+=head2 ℞ 11: Names of CJK codepoints
+
+Sinograms like “東京” come back with character names of
+C<CJK UNIFIED IDEOGRAPH-6771> and C<CJK UNIFIED IDEOGRAPH-4EAC>,
+because their “names” vary.  The CPAN C<Unicode::Unihan> module
+has a large database for decoding these (and a whole lot more), provided you
+know how to understand its output.
+
+ # cpan -i Unicode::Unihan
+ use Unicode::Unihan;
+ my $str = "東京";
+ my $unhan = new Unicode::Unihan;
+ for my $lang (qw(Mandarin Cantonese Korean JapaneseOn JapaneseKun)) {
+     printf "CJK $str in %-12s is ", $lang;
+     say $unhan->$lang($str);
+ }
+
+prints:
+
+ CJK 東京 in Mandarin     is DONG1JING1
+ CJK 東京 in Cantonese    is dung1ging1
+ CJK 東京 in Korean       is TONGKYENG
+ CJK 東京 in JapaneseOn   is TOUKYOU KEI KIN
+ CJK 東京 in JapaneseKun  is HIGASHI AZUMAMIYAKO
+
+If you have a specific romanization scheme in mind,
+use the specific module:
+
+ # cpan -i Lingua::JA::Romanize::Japanese
+ use Lingua::JA::Romanize::Japanese;
+ my $k2r = new Lingua::JA::Romanize::Japanese;
+ my $str = "東京";
+ say "Japanese for $str is ", $k2r->chars($str);
+
+prints
+
+ Japanese for 東京 is toukyou
+
+=head2 ℞ 12: Explicit encode/decode
+
+On rare occasion, such as a database read, you may be
+given encoded text you need to decode.
+
+  use Encode qw(encode decode);
+
+  my $chars = decode("shiftjis", $bytes, 1);
+ # OR
+  my $bytes = encode("MIME-Header-ISO_2022_JP", $chars, 1);
+
+For streams all in the same encoding, don't use encode/decode; instead
+set the file encoding when you open the file or immediately after with
+C<binmode> as described later below.
+
+=head2 ℞ 13: Decode program arguments as utf8
+
+     $ perl -CA ...
+ or
+     $ export PERL_UNICODE=A
+ or
+    use Encode qw(decode_utf8);
+    @ARGV = map { decode_utf8($_, 1) } @ARGV;
+
+=head2 ℞ 14: Decode program arguments as locale encoding
+
+    # cpan -i Encode::Locale
+    use Encode qw(locale);
+    use Encode::Locale;
+
+    # use "locale" as an arg to encode/decode
+    @ARGV = map { decode(locale => $_, 1) } @ARGV;
+
+=head2 ℞ 15: Declare STD{IN,OUT,ERR} to be utf8
+
+Use a command-line option, an environment variable, or else
+call C<binmode> explicitly:
+
+     $ perl -CS ...
+ or
+     $ export PERL_UNICODE=S
+ or
+     use open qw(:std :utf8);
+ or
+     binmode(STDIN,  ":utf8");
+     binmode(STDOUT, ":utf8");
+     binmode(STDERR, ":utf8");
+
+=head2 ℞ 16: Declare STD{IN,OUT,ERR} to be in locale encoding
+
+    # cpan -i Encode::Locale
+    use Encode;
+    use Encode::Locale;
+
+    # or as a stream for binmode or open
+    binmode STDIN,  ":encoding(console_in)"  if -t STDIN;
+    binmode STDOUT, ":encoding(console_out)" if -t STDOUT;
+    binmode STDERR, ":encoding(console_out)" if -t STDERR;
+
+=head2 ℞ 17: Make file I/O default to utf8
+
+Files opened without an encoding argument will be in UTF-8:
+
+     $ perl -CD ...
+ or
+     $ export PERL_UNICODE=D
+ or
+     use open qw(:utf8);
+
+=head2 ℞ 18: Make all I/O and args default to utf8
+
+     $ perl -CSDA ...
+ or
+     $ export PERL_UNICODE=SDA
+ or
+     use open qw(:std :utf8);
+     use Encode qw(decode_utf8);
+     @ARGV = map { decode_utf8($_, 1) } @ARGV;
+
+=head2 ℞ 19: Open file with specific encoding
+
+Specify stream encoding.  This is the normal way
+to deal with encoded text, not by calling low-level
+functions.
+
+ # input file
+     open(my $in_file, "< :encoding(UTF-16)", "wintext");
+ OR
+     open(my $in_file, "<", "wintext");
+     binmode($in_file, ":encoding(UTF-16)");
+ THEN
+     my $line = <$in_file>;
+
+ # output file
+     open(my $out_file, "> :encoding(cp1252)", "wintext");
+ OR
+     open(my $out_file, ">", "wintext");
+     binmode($out_file, ":encoding(cp1252)");
+ THEN
+     print $out_file "some text\n";
+
+More layers than just the encoding can be specified here. For example,
+the incantation C<":raw :encoding(UTF-16LE) :crlf"> includes implicit
+CRLF handling.
+
+=head2 ℞ 20: Unicode casing
+
+Unicode casing is very different from ASCII casing.
+
+ uc("henry ⅷ")  # "HENRY Ⅷ"
+ uc("tschüß")   # "TSCHÜSS"  notice ß => SS
+
+ # both are true:
+ "tschüß"  =~ /TSCHÜSS/i   # notice ß => SS
+ "Σίσυφος" =~ /ΣΊΣΥΦΟΣ/i   # notice Σ,σ,ς sameness
+
+=head2 ℞ 21: Unicode case-insensitive comparisons
+
+Also available in the CPAN L<Unicode::CaseFold> module,
+the new C<fc> “foldcase” function from v5.16 grants
+access to the same Unicode casefolding as the C</i>
+pattern modifier has always used:
+
+ use feature "fc"; # fc() function is from v5.16
+
+ # sort case-insensitively
+ my @sorted = sort { fc($a) cmp fc($b) } @list;
+
+ # both are true:
+ fc("tschüß")  eq fc("TSCHÜSS")
+ fc("Σίσυφος") eq fc("ΣΊΣΥΦΟΣ")
+
+=head2 ℞ 22: Match Unicode linebreak sequence in regex
+
+A Unicode linebreak matches the two-character CRLF
+grapheme or any of seven vertical whitespace characters.
+Good for dealing with textfiles coming from different
+operating systems.
+
+ \R
+
+ s/\R/\n/g;  # normalize all linebreaks to \n
+
+=head2 ℞ 23: Get character category
+
+Find the general category of a numeric codepoint.
+
+ use Unicode::UCD qw(charinfo);
+ my $cat = charinfo(0x3A3)->{category};  # "Lu"
+
+=head2 ℞ 24: Disabling Unicode-awareness in builtin charclasses
+
+Disable C<\w>, C<\b>, C<\s>, C<\d>, and the POSIX
+classes from working correctly on Unicode either in this
+scope, or in just one regex.
+
+ use v5.14;
+ use re "/a";
+
+ # OR
+
+ my($num) = $str =~ /(\d+)/a;
+
+Or use specific un-Unicode properties, like C<\p{ahex}>
+and C<\p{POSIX_Digit}>.  Properties still work normally
+no matter what charset modifiers (C</d /u /l /a /aa>)
+should be in effect.
+
+=head2 ℞ 25: Match Unicode properties in regex with \p, \P
+
+These all match a single codepoint with the given
+property.  Use C<\P> in place of C<\p> to match
+one codepoint lacking that property.
+
+ \pL, \pN, \pS, \pP, \pM, \pZ, \pC
+ \p{Sk}, \p{Ps}, \p{Lt}
+ \p{alpha}, \p{upper}, \p{lower}
+ \p{Latin}, \p{Greek}
+ \p{script=Latin}, \p{script=Greek}
+ \p{East_Asian_Width=Wide}, \p{EA=W}
+ \p{Line_Break=Hyphen}, \p{LB=HY}
+ \p{Numeric_Value=4}, \p{NV=4}
+
+=head2 ℞ 26: Custom character properties
+
+Define at compile-time your own custom character
+properties for use in regexes.
+
+ # using private-use characters
+ sub In_Tengwar { "E000\tE07F\n" }
+
+ if (/\p{In_Tengwar}/) { ... }
+
+ # blending existing properties
+ sub Is_GraecoRoman_Title {<<'END_OF_SET'}
+ +utf8::IsLatin
+ +utf8::IsGreek
+ &utf8::IsTitle
+ END_OF_SET
+
+ if (/\p{Is_GraecoRoman_Title}/) { ... }
+
+=head2 ℞ 27: Unicode normalization
+
+Typically render into NFD on input and NFC on output. Using NFKC or NFKD
+functions improves recall on searches, assuming you've already done the
+same to the text to be searched. Note that this is about much more than just pre-
+combined compatibility glyphs; it also reorders marks according to their
+canonical combining classes and weeds out singletons.
+
+ use Unicode::Normalize;
+ my $nfd  = NFD($orig);
+ my $nfc  = NFC($orig);
+ my $nfkd = NFKD($orig);
+ my $nfkc = NFKC($orig);
+
+=head2 ℞ 28: Convert non-ASCII Unicode numerics
+
+Unless you’ve used C</a> or C</aa>, C<\d> matches more than
+ASCII digits only, but Perl’s implicit string-to-number
+conversion does not currently recognize these.  Here’s how to
+convert such strings manually.
+
+ use v5.14;  # needed for num() function
+ use Unicode::UCD qw(num);
+ my $str = "got Ⅻ and ४५६७ and ⅞ and here";
+ my @nums = ();
+ while ($str =~ /(\d+|\N)/g) {  # not just ASCII!
+    push @nums, num($1);
+ }
+ say "@nums";   #     12      4567      0.875
+
+ use charnames qw(:full);
+ my $nv = num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
+
+=head2 ℞ 29: Match Unicode grapheme cluster in regex
+
+Programmer-visible “characters” are codepoints matched by C</./s>,
+but user-visible “characters” are graphemes matched by C</\X/>.
+
+ # Find vowel *plus* any combining diacritics,underlining,etc.
+ my $nfd = NFD($orig);
+ $nfd =~ / (?=[aeiou]) \X /xi
+
+=head2 ℞ 30: Extract by grapheme instead of by codepoint (regex)
+
+ # match and grab five first graphemes
+ my($first_five) = $str =~ /^ ( \X{5} ) /x;
+
+=head2 ℞ 31: Extract by grapheme instead of by codepoint (substr)
+
+ # cpan -i Unicode::GCString
+ use Unicode::GCString;
+ my $gcs = Unicode::GCString->new($str);
+ my $first_five = $gcs->substr(0, 5);
+
+=head2 ℞ 32: Reverse string by grapheme
+
+Reversing by codepoint messes up diacritics, mistakenly converting
+C<crème brûlée> into C<éel̂urb em̀erc> instead of into C<eélûrb emèrc>;
+so reverse by grapheme instead.  Both these approaches work
+right no matter what normalization the string is in:
+
+ $str = join("", reverse $str =~ /\X/g);
+
+ # OR: cpan -i Unicode::GCString
+ use Unicode::GCString;
+ $str = reverse Unicode::GCString->new($str);
+
+=head2 ℞ 33: String length in graphemes
+
+The string C<brûlée> has six graphemes but up to eight codepoints.
+This counts by grapheme, not by codepoint:
+
+ my $str = "brûlée";
+ my $count = 0;
+ while ($str =~ /\X/g) { $count++ }
+
+  # OR: cpan -i Unicode::GCString
+ use Unicode::GCString;
+ my $gcs = Unicode::GCString->new($str);
+ my $count = $gcs->length;
+
+=head2 ℞ 34: Unicode column-width for printing
+
+Perl’s C<printf>, C<sprintf>, and C<format> think all
+codepoints take up 1 print column, but many take 0 or 2.
+Here to show that normalization makes no difference,
+we print out both forms:
+
+ use Unicode::GCString;
+ use Unicode::Normalize;
+
+ my @words = qw/crème brûlée/;
+ @words = map { NFC($_), NFD($_) } @words;
+
+ for my $str (@words) {
+     my $gcs = Unicode::GCString->new($str);
+     my $cols = $gcs->columns;
+     my $pad = " " x (10 - $cols);
+     say $str, $pad, " |";
+ }
+
+generates this to show that it pads correctly no matter
+the normalization:
+
+ crème      |
+ crème      |
+ brûlée     |
+ brûlée     |
+
+=head2 ℞ 35: Unicode collation
+
+Text sorted by numeric codepoint follows no reasonable alphabetic order;
+use the UCA for sorting text.
+
+ use Unicode::Collate;
+ my $col = Unicode::Collate->new();
+ my @list = $col->sort(@old_list);
+
+See the I<ucsort> program from the L<Unicode::Tussle> CPAN module
+for a convenient command-line interface to this module.
+
+=head2 ℞ 36: Case- I<and> accent-insensitive Unicode sort
+
+Specify a collation strength of level 1 to ignore case and
+diacritics, only looking at the basic character.
+
+ use Unicode::Collate;
+ my $col = Unicode::Collate->new(level => 1);
+ my @list = $col->sort(@old_list);
+
+=head2 ℞ 37: Unicode locale collation
+
+Some locales have special sorting rules.
+
+ # either use v5.12, OR: cpan -i Unicode::Collate::Locale
+ use Unicode::Collate::Locale;
+ my $col = Unicode::Collate::Locale->new(locale => "de__phonebook");
+ my @list = $col->sort(@old_list);
+
+The I<ucsort> program mentioned above accepts a C<--locale> parameter.
+
+=head2 ℞ 38: Making C<cmp> work on text instead of codepoints
+
+Instead of this:
+
+ @srecs = sort {
+     $b->{AGE}   <=>  $a->{AGE}
+                 ||
+     $a->{NAME}  cmp  $b->{NAME}
+ } @recs;
+
+Use this:
+
+ my $coll = Unicode::Collate->new();
+ for my $rec (@recs) {
+     $rec->{NAME_key} = $coll->getSortKey( $rec->{NAME} );
+ }
+ @srecs = sort {
+     $b->{AGE}       <=>  $a->{AGE}
+                     ||
+     $a->{NAME_key}  cmp  $b->{NAME_key}
+ } @recs;
+
+=head2 ℞ 39: Case- I<and> accent-insensitive comparisons
+
+Use a collator object to compare Unicode text by character
+instead of by codepoint.
+
+ use Unicode::Collate;
+ my $es = Unicode::Collate->new(
+     level => 1,
+     normalization => undef
+ );
+
+  # now both are true:
+ $es->eq("García",  "GARCIA" );
+ $es->eq("Márquez", "MARQUEZ");
+
+=head2 ℞ 40: Case- I<and> accent-insensitive locale comparisons
+
+Same, but in a specific locale.
+
+ my $de = Unicode::Collate::Locale->new(
+            locale => "de__phonebook",
+          );
+
+ # now this is true:
+ $de->eq("tschüß", "TSCHUESS");  # notice ü => UE, ß => SS
+
+=head2 ℞ 41: Unicode linebreaking
+
+Break up text into lines according to Unicode rules.
+
+ # cpan -i Unicode::LineBreak
+ use Unicode::LineBreak;
+ use charnames qw(:full);
+
+ my $para = "This is a super\N{HYPHEN}long string. " x 20;
+ my $fmt = new Unicode::LineBreak;
+ print $fmt->break($para), "\n";
+
+=head2 ℞ 42: Unicode text in DBM hashes, the tedious way
+
+Using a regular Perl string as a key or value for a DBM
+hash will trigger a wide character exception if any codepoints
+won’t fit into a byte.  Here’s how to manually manage the translation:
+
+    use DB_File;
+    use Encode qw(encode decode);
+    tie %dbhash, "DB_File", "pathname";
+
+ # STORE
+
+    # assume $uni_key and $uni_value are abstract Unicode strings
+    my $enc_key   = encode("UTF-8", $uni_key, 1);
+    my $enc_value = encode("UTF-8", $uni_value, 1);
+    $dbhash{$enc_key} = $enc_value;
+
+ # FETCH
+
+    # assume $uni_key holds a normal Perl string (abstract Unicode)
+    my $enc_key   = encode("UTF-8", $uni_key, 1);
+    my $enc_value = $dbhash{$enc_key};
+    my $uni_value = decode("UTF-8", $enc_value, 1);
+
+=head2 ℞ 43: Unicode text in DBM hashes, the easy way
+
+Here’s how to implicitly manage the translation; all encoding
+and decoding is done automatically, just as with streams that
+have a particular encoding attached to them:
+
+    use DB_File;
+    use DBM_Filter;
+
+    my $dbobj = tie %dbhash, "DB_File", "pathname";
+    $dbobj->Filter_Value("utf8");  # this is the magic bit
+
+ # STORE
+
+    # assume $uni_key and $uni_value are abstract Unicode strings
+    $dbhash{$uni_key} = $uni_value;
+
+  # FETCH
+
+    # $uni_key holds a normal Perl string (abstract Unicode)
+    my $uni_value = $dbhash{$uni_key};
+
+=head2 ℞ 44: PROGRAM: Demo of Unicode collation and printing
+
+Here’s a full program showing how to make use of locale-sensitive
+sorting, Unicode casing, and managing print widths when some of the
+characters take up zero or two columns, not just one column each time.
+When run, the following program produces this nicely aligned output:
+
+    Crème Brûlée....... €2.00
+    Éclair............. €1.60
+    Fideuà............. €4.20
+    Hamburger.......... €6.00
+    Jamón Serrano...... €4.45
+    Linguiça........... €7.00
+    Pâté............... €4.15
+    Pears.............. €2.00
+    Pêches............. €2.25
+    Smørbrød........... €5.75
+    Spätzle............ €5.50
+    Xoriço............. €3.00
+    Γύρος.............. €6.50
+    막걸리............. €4.00
+    おもち............. €2.65
+    お好み焼き......... €8.00
+    シュークリーム..... €1.85
+    寿司............... €9.99
+    包子............... €7.50
+
+Here's that program; tested on v5.14.
+
+ #!/usr/bin/env perl
+ # umenu - demo sorting and printing of Unicode food
+ #
+ # (obligatory and increasingly long preamble)
+ #
+ use utf8;
+ use v5.14;                       # for locale sorting
+ use strict;
+ use warnings;
+ use warnings  qw(FATAL utf8);    # fatalize encoding faults
+ use open      qw(:std :utf8);    # undeclared streams in UTF-8
+ use charnames qw(:full :short);  # unneeded in v5.16
+
+ # std modules
+ use Unicode::Normalize;          # std perl distro as of v5.8
+ use List::Util qw(max);          # std perl distro as of v5.10
+ use Unicode::Collate::Locale;    # std perl distro as of v5.14
+
+ # cpan modules
+ use Unicode::GCString;           # from CPAN
+
+ # forward defs
+ sub pad($$$);
+ sub colwidth(_);
+ sub entitle(_);
+
+ my %price = (
+     "γύρος"             => 6.50, # gyros
+     "pears"             => 2.00, # like um, pears
+     "linguiça"          => 7.00, # spicy sausage, Portuguese
+     "xoriço"            => 3.00, # chorizo sausage, Catalan
+     "hamburger"         => 6.00, # burgermeister meisterburger
+     "éclair"            => 1.60, # dessert, French
+     "smørbrød"          => 5.75, # sandwiches, Norwegian
+     "spätzle"           => 5.50, # Bayerisch noodles, little sparrows
+     "包子"              => 7.50, # bao1 zi5, steamed pork buns, Mandarin
+     "jamón serrano"     => 4.45, # country ham, Spanish
+     "pêches"            => 2.25, # peaches, French
+     "シュークリーム"    => 1.85, # cream-filled pastry like eclair
+     "막걸리"            => 4.00, # makgeolli, Korean rice wine
+     "寿司"              => 9.99, # sushi, Japanese
+     "おもち"            => 2.65, # omochi, rice cakes, Japanese
+     "crème brûlée"      => 2.00, # crema catalana
+     "fideuà"            => 4.20, # more noodles, Valencian (Catalan=fideuada)
+     "pâté"              => 4.15, # gooseliver paste, French
+     "お好み焼き"        => 8.00, # okonomiyaki, Japanese
+ );
+
+ my $width = 5 + max map { colwidth } keys %price;
+
+ # So the Asian stuff comes out in an order that someone
+ # who reads those scripts won't freak out over; the
+ # CJK stuff will be in JIS X 0208 order that way.
+ my $coll  = new Unicode::Collate::Locale locale => "ja";
+
+ for my $item ($coll->sort(keys %price)) {
+     print pad(entitle($item), $width, ".");
+     printf " €%.2f\n", $price{$item};
+ }
+
+ sub pad($$$) {
+     my($str, $width, $padchar) = @_;
+     return $str . ($padchar x ($width - colwidth($str)));
+ }
+
+ sub colwidth(_) {
+     my($str) = @_;
+     return Unicode::GCString->new($str)->columns;
+ }
+
+ sub entitle(_) {
+     my($str) = @_;
+     $str =~ s{ (?=\pL)(\S)     (\S*) }
+              { ucfirst($1) . lc($2)  }xge;
+     return $str;
+ }
+
+=head1 SEE ALSO
+
+See these manpages, some of which are CPAN modules:
+L<perlunicode>, L<perluniprops>,
+L<perlre>, L<perlrecharclass>,
+L<perluniintro>, L<perlunitut>, L<perlunifaq>,
+L<PerlIO>, L<DB_File>, L<DBM_Filter>, L<DBM_Filter::utf8>,
+L<Encode>, L<Encode::Locale>,
+L<Unicode::UCD>,
+L<Unicode::Normalize>,
+L<Unicode::GCString>, L<Unicode::LineBreak>,
+L<Unicode::Collate>, L<Unicode::Collate::Locale>,
+L<Unicode::Unihan>,
+L<Unicode::CaseFold>,
+L<Unicode::Tussle>,
+L<Lingua::JA::Romanize::Japanese>,
+L<Lingua::ZH::Romanize::Pinyin>,
+L<Lingua::KO::Romanize::Hangul>.
+
+The L<Unicode::Tussle> CPAN module includes many programs
+to help with working with Unicode, including
+these programs to fully or partly replace standard utilities:
+I<tcgrep> instead of I<egrep>,
+I<uniquote> instead of I<cat -v> or I<hexdump>,
+I<uniwc> instead of I<wc>,
+I<unilook> instead of I<look>,
+I<unifmt> instead of I<fmt>,
+and
+I<ucsort> instead of I<sort>.
+For exploring Unicode character names and character properties,
+see its I<uniprops>, I<unichars>, and I<uninames> programs.
+It also supplies these programs, all of which are general filters that do Unicode-y things:
+I<unititle> and I<unicaps>;
+I<uniwide> and I<uninarrow>;
+I<unisupers> and I<unisubs>;
+I<nfd>, I<nfc>, I<nfkd>, and I<nfkc>;
+and I<uc>, I<lc>, and I<tc>.
+
+Finally, see the published Unicode Standard (page numbers are from version
+6.0.0), including these specific annexes and technical reports:
+
+=over
+
+=item §3.13 Default Case Algorithms, page 113;
+§4.2  Case, pages 120–122;
+Case Mappings, page 166–172, especially Caseless Matching starting on page 170.
+
+=item UAX #44: Unicode Character Database
+
+=item UTS #18: Unicode Regular Expressions
+
+=item UAX #15: Unicode Normalization Forms
+
+=item UTS #10: Unicode Collation Algorithm
+
+=item UAX #29: Unicode Text Segmentation
+
+=item UAX #14: Unicode Line Breaking Algorithm
+
+=item UAX #11: East Asian Width
+
+=back
+
+=head1 AUTHOR
+
+Tom Christiansen E<lt>[email protected]E<gt> wrote this, with occasional
+kibitzing from Larry Wall and Jeffrey Friedl in the background.
+
+=head1 COPYRIGHT AND LICENCE
+
+Copyright © 2012 Tom Christiansen.
+
+This program is free software; you may redistribute it and/or modify it
+under the same terms as Perl itself.
+
+Most of these examples taken from the current edition of the “Camel Book”;
+that is, from the 4ᵗʰ Edition of I<Programming Perl>, Copyright © 2012 Tom
+Christiansen I<et al.>, 2012-02-13 by O’Reilly Media.  The code itself is
+freely redistributable, and you are encouraged to transplant, fold,
+spindle, and mutilate any of the examples in this manpage however you please
+for inclusion into your own programs without any encumbrance whatsoever.
+Acknowledgement via code comment is polite but not required.
+
+=head1 REVISION HISTORY
+
+v1.0.0 – first public release, 2012-02-27
+
diff --git a/t/porting/known_pod_issues.dat b/t/porting/known_pod_issues.dat
index 688d033..e1de02b 100644
--- a/t/porting/known_pod_issues.dat
+++ b/t/porting/known_pod_issues.dat
@@ -94,7 +94,10 @@ kill(3)
 langinfo(3)
 LaTeX::Encode
 Lexical::Var
+Lingua::JA::Romanize::Japanese
 Lingua::KO::Hangul::Util
+Lingua::KO::Romanize::Hangul
+Lingua::ZH::Romanize::Pinyin
 List::Gather
 local::lib
 Log::Message
@@ -205,10 +208,13 @@ Time::Object
 Tk
 Tk::Pod
 tty(1)
+Unicode::CaseFold
 Unicode::Casing
+Unicode::GCString
 Unicode::LineBreak
 Unicode::Regex::Set
 Unicode::Semantics
+Unicode::Tussle
 Unicode::Unihan
 unzip(1)
 Version::Requirements
diff --git a/win32/pod.mak b/win32/pod.mak
index c053b37..16980d4 100644
--- a/win32/pod.mak
+++ b/win32/pod.mak
@@ -144,6 +144,7 @@ POD = perl.pod      \
        perltoot.pod    \
        perltrap.pod    \
        perlunicode.pod \
+       perlunicook.pod \
        perlunifaq.pod  \
        perluniintro.pod        \
        perluniprops.pod        \
@@ -284,6 +285,7 @@ MAN = perl.man      \
        perltoot.man    \
        perltrap.man    \
        perlunicode.man \
+       perlunicook.man \
        perlunifaq.man  \
        perluniintro.man        \
        perluniprops.man        \
@@ -423,6 +425,7 @@ HTML = perl.html    \
        perltoot.html   \
        perltrap.html   \
        perlunicode.html        \
+       perlunicook.html        \
        perlunifaq.html \
        perluniintro.html       \
        perluniprops.html       \
@@ -564,6 +567,7 @@ TEX = perl.tex      \
        perltoot.tex    \
        perltrap.tex    \
        perlunicode.tex \
+       perlunicook.tex \
        perlunifaq.tex  \
        perluniintro.tex        \
        perluniprops.tex        \

--
Perl5 Master Repository

[perl.git] branch blead, updated. v5.21.7-44-gef3a1d8

Reply via email to