[perl.git] branch blead, updated. v5.25.2-68-g48791bf

Karl Williamson Thu, 30 Jun 2016 21:23:41 -0700

In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/48791bf1d9612a84d71edc00af8610da1a6cf34b?hp=7d7345cf4f14a683b78978462e37e75c5bccd5ed>


- Log -----------------------------------------------------------------
commit 48791bf1d9612a84d71edc00af8610da1a6cf34b
Author: Karl Williamson <[email protected]>
Date:   Thu Jun 30 22:05:55 2016 -0600

    Change \p{foo} to mean \p{scx: foo}
    
    when 'foo' is a script.  Also update the pods correspondingly, and to
    encourage scx property use.
    
    See http://nntp.perl.org/group/perl.perl5.porters/237403
-----------------------------------------------------------------------

Summary of changes:
 charclass_invlists.h |  4 ++--
 lib/Unicode/UCD.pm   | 13 ++++++++++---
 lib/unicore/mktables | 13 ++++++++++---
 pod/perldelta.pod    | 12 ++++++++++++
 pod/perlretut.pod    | 18 +++++++++++-------
 pod/perlunicode.pod  | 35 +++++++++++++++++++++++------------
 pod/perlunicook.pod  |  2 +-
 regcharclass.h       |  4 ++--
 t/uni/cache.t        |  4 ++--
 9 files changed, 73 insertions(+), 32 deletions(-)

diff --git a/charclass_invlists.h b/charclass_invlists.h
index 3791381..a748d00 100644
--- a/charclass_invlists.h
+++ b/charclass_invlists.h
@@ -91515,7 +91515,7 @@ static const U8 WB_table[24][24] = {
 #endif /* defined(PERL_IN_REGEXEC_C) */
 
 /* Generated from:
- * de6076d81bc4e85f179377ded4c68f3b257c8f7990227d4302eca442fda558f8 lib/Unicode/UCD.pm
+ * d4d1ae3d05b9b07d066661a93af8896abe10fbf0f5cbe89575fbbc06a2928d60 lib/Unicode/UCD.pm
  * 47cb62a53beea6d0263e2147331c7e751853c9327225d95bbe2d9e1dc3e1aa44 lib/unicore/ArabicShaping.txt
  * 153f0a100c315f9f3945e78f57137611d36c44b3a975919c499fd403413fede8 lib/unicore/BidiBrackets.txt
  * fbe806975c1bf9fc9960bbaa39ff6290c42c7da8315f9cd459109b024cc1c485 lib/unicore/BidiMirroring.txt
@@ -91558,7 +91558,7 @@ static const U8 WB_table[24][24] = {
  * 37f6186253da9824bdb27f4ad867bfe8c25d4dc6bdb2f05585e40a034675a348 lib/unicore/extracted/DLineBreak.txt
  * ef24061b5a5dc93d7e90c2e34530ec757180ee75d872cba65ffc946e52624ae8 lib/unicore/extracted/DNumType.txt
  * a197371fec9a1b517058b440841f60f9378d81682084eef8db22a88cb2f96e90 lib/unicore/extracted/DNumValues.txt
- * 58e546458da91e33d5cdacd9ca1b5a87868701a1e7e4eea6b0c4cf0c62fff431 lib/unicore/mktables
+ * 0cc006e22469cee3db1a55a4df1ac656c9d26a70ba920985883eb77198931c1a lib/unicore/mktables
  * cdecb300baad839a6f62791229f551a4fa33f3cbdca08e378dc976466354e778 lib/unicore/version
  * 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c regen/charset_translations.pl
  * 11011bc761487f5a63c8135e67248394d4cdff6f8f204a41cdfbdc8131e79406 regen/mk_invlists.pl
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index 276e9f5..990e86f 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -338,7 +338,8 @@ See L</Blocks versus Scripts>.
 
 the script I<code> belongs to.
 The L</prop_value_aliases()> function can be used to get all the synonyms
-of the script name.
+of the script name.  Note that this is the older "Script" property value, and
+not the improved "Script_Extensions" value.
 
 See L</Blocks versus Scripts>.
 
@@ -966,6 +967,10 @@ that it doesn't have scripts, this function returns C<"Unknown">.
 The L</prop_value_aliases()> function can be used to get all the synonyms
 of the script name.
 
+Note that the Script_Extensions property is an improved version of the Script
+property, and you should probably be using that instead, with the
+L</charprop()> function.
+
 If supplied with an argument that can't be a code point, charscript() tries
 to do the opposite and interpret the argument as a script name. The
 return value is a I<range set>: an anonymous array of arrays that contain
@@ -1056,7 +1061,9 @@ names as the keys, and the code point ranges (see L</charscript()>) as
 the values.
 
 L<prop_invmap("script")|/prop_invmap()> can be used to get this same data in a
-different type of data structure.
+different type of data structure.  Since the Script_Extensions property is an
+improved version of the Script property, you should instead use
+L<prop_invmap("scx")|/prop_invmap()>.
 
 L<C<prop_values("Script")>|/prop_values()> can be used to get all
 the known script names as a list, without the code point ranges.
@@ -2468,7 +2475,7 @@ resolving the input property's name as is done for regular expressions.  These
 are also specified in L<perluniprops|perluniprops/Properties accessible
 through \p{} and \P{}>.  Examples of using the "property=value" form are:
 
- say join ", ", prop_invlist("Script=Shavian");
+ say join ", ", prop_invlist("Script_Extensions=Shavian");
 
  prints:
  66640, 66688
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index 30ba881..0517938 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -15335,7 +15335,7 @@ sub add_perl_synonyms() {
     # the single-form, \p{name}.  These are:
     #   All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
     #       \p{Is_Name} as synonyms
-    #   \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
+    #   \p{Script_Extensions=Value} gets \p{Value}, \p{Is_Value} as synonyms
     #   \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
     #   \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
     #       conflict, \p{Value} and \p{Is_Value} as well
@@ -15349,8 +15349,15 @@ sub add_perl_synonyms() {
                                                             property_ref('*');
     push @tables, $gc->tables;
 
-    # If the version of Unicode includes the Script property, add its tables
-    push @tables, $script->tables if defined $script;
+    # If the version of Unicode includes the Script Extensions (preferably),
+    # or Script property, add its tables
+    my $scx = property_ref("Script_Extensions");
+    if (defined $scx) {
+        push @tables, $scx->tables;
+    }
+    else {
+        push @tables, $script->tables if defined $script;
+    }
 
     # The Block tables are kept separate because they are treated differently.
     # And the earliest versions of Unicode didn't include them, so add only if
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 53d839a..8bca8c7 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -34,6 +34,18 @@ L<http://www.unicode.org/versions/Unicode9.0.0/>.  Modules that are
 shipped with core Perl but not maintained by p5p do not necessarily
 support Unicode 9.0.  L<Unicode::Normalize> does work on 9.0.
 
+=head2 Use of C<\p{I<script>}> uses the improved Script_Extensions
+property
+
+Unicode 6.0 introduced an improved form of the Script (C<sc>) property,
+and called it Script_Extensions (C<scx>).  As of now, Perl uses this
+improved version when a property is specified as just C<\p{I<script>}>.
+The meaning of compound forms, like C<\p{sc=I<script>}>, is unchanged.
+This should make programs more accurate when determining if a
+character is used in a given script, but there is a slight chance of
+breakage for programs that very specifically needed the old behavior.
+See L<perlunicode/Scripts>.
+
 =head1 Security
 
 XXX Any security-related notices go here.  In particular, any security
diff --git a/pod/perlretut.pod b/pod/perlretut.pod
index efebb11..734ca5c 100644
--- a/pod/perlretut.pod
+++ b/pod/perlretut.pod
@@ -1986,14 +1986,18 @@ also listed there.  Some synonyms are a single character.  For these,
 you can drop the braces.  For instance, C<\pM> is the same thing as
 C<\p{Mark}>, meaning things like accent marks.
 
-The Unicode C<\p{Script}> property is used to categorize every Unicode
-character into the language script it is written in.  For example,
+The Unicode C<\p{Script}> and C<\p{Script_Extensions}> properties are
+used to categorize every Unicode character into the language script it
+is written in.  (C<Script_Extensions> is an improved version of
+C<Script>, which is retained for backward compatibility, and so you
+should generally use C<Script_Extensions>.)
+For example,
 English, French, and a bunch of other European languages are written in
 the Latin script.  But there is also the Greek script, the Thai script,
 the Katakana script, etc.  You can test whether a character is in a
-particular script with, for example C<\p{Latin}>, C<\p{Greek}>,
-or C<\p{Katakana}>.  To test if it isn't in the Balinese script, you
-would use C<\P{Balinese}>.
+particular script (based on C<Script_Extensions>) with, for example
+C<\p{Latin}>, C<\p{Greek}>, or C<\p{Katakana}>.  To test if it isn't in
+the Balinese script, you would use C<\P{Balinese}>.
 
 What we have described so far is the single form of the C<\p{...}> character
 classes.  There is also a compound form which you may run into.  These
@@ -2001,8 +2005,8 @@ look like C<\p{name=value}> or C<\p{name:value}> (the equals sign and colon
 can be used interchangeably).  These are more general than the single form,
 and in fact most of the single forms are just Perl-defined shortcuts for common
 compound forms.  For example, the script examples in the previous paragraph
-could be written equivalently as C<\p{Script=Latin}>, C<\p{Script:Greek}>,
-C<\p{script=katakana}>, and C<\P{script=balinese}> (case is irrelevant
+could be written equivalently as C<\p{Script_Extensions=Latin}>, C<\p{Script_Extensions:Greek}>,
+C<\p{script_extensions=katakana}>, and C<\P{script_extensions=balinese}> (case is irrelevant
 between the C<{}> braces).  You may
 never have to use the compound forms, but sometimes it is necessary, and their
 use can make your code easier to understand.
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
index 959b800..8346b23 100644
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -602,16 +602,19 @@ The world's languages are written in many different scripts.  This sentence
 written in Cyrillic, and Greek is written in, well, Greek; Japanese mainly in
 Hiragana or Katakana.  There are many more.
 
-The Unicode C<Script> and C<Script_Extensions> properties give what script a
-given character is in.  Either property can be specified with the
-compound form like
+The Unicode C<Script> and C<Script_Extensions> properties give what
+script a given character is in.  The C<Script_Extensions> property is an
+improved version of C<Script>, as demonstrated below.  Either property
+can be specified with the compound form like
 C<\p{Script=Hebrew}> (short: C<\p{sc=hebr}>), or
 C<\p{Script_Extensions=Javanese}> (short: C<\p{scx=java}>).
 In addition, Perl furnishes shortcuts for all
-C<Script> property names.  You can omit everything up through the equals
-(or colon), and simply write C<\p{Latin}> or C<\P{Cyrillic}>.
-(This is not true for C<Script_Extensions>, which is required to be
-written in the compound form.)
+C<Script_Extensions> property names.  You can omit everything up through
+the equals (or colon), and simply write C<\p{Latin}> or C<\P{Cyrillic}>.
+(This is not true for C<Script>, which is required to be
+written in the compound form.  Prior to Perl v5.26, the single form
+returned the plain old C<Script> version, but was changed because
+C<Script_Extensions> gives better results.)
 
 The difference between these two properties involves characters that are
 used in multiple scripts.  For example the digits '0' through '9' are
@@ -645,7 +648,11 @@ fewer characters in the C<Common> script, and correspondingly more in
 other scripts.  It is new in Unicode version 6.0, and its data are likely
 to change significantly in later releases, as things get sorted out.
 New code should probably be using C<Script_Extensions> and not plain
-C<Script>.
+C<Script>.  If you compile perl with a Unicode release that doesn't have
+C<Script_Extensions>, the single form Perl extensions will instead refer
+to the plain C<Script> property.  If you compile with a version of
+Unicode that doesn't have the C<Script> property, these extensions will
+not be defined at all.
 
 (Actually, besides C<Common>, the C<Inherited> script contains
 characters that are used in multiple scripts.  These are modifier
@@ -658,10 +665,13 @@ C<Script>, but not in C<Script_Extensions>.)
 It is worth stressing that there are several different sets of digits in
 Unicode that are equivalent to 0-9 and are matchable by C<\d> in a
 regular expression.  If they are used in a single language only, they
-are in that language's C<Script> and C<Script_Extension>.  If they are
+are in that language's C<Script> and C<Script_Extensions>.  If they are
 used in more than one script, they will be in C<sc=Common>, but only
 if they are used in many scripts should they be in C<scx=Common>.
 
+The explanation above has omitted some detail; refer to UAX#24 "Unicode
+Script Property": L<http://www.unicode.org/reports/tr24>.
+
 A complete list of scripts and their shortcuts is in L<perluniprops>.
 
 =head3 B<Use of the C<"Is"> Prefix>
@@ -690,7 +700,7 @@ C<Common> script.
 For more about scripts versus blocks, see UAX#24 "Unicode Script Property":
 L<http://www.unicode.org/reports/tr24>
 
-The C<Script> or C<Script_Extensions> properties are likely to be the
+The C<Script_Extensions> or C<Script> properties are likely to be the
 ones you want to use when processing
 natural language; the C<Block> property may occasionally be useful in working
 with the nuts and bolts of Unicode.
@@ -711,10 +721,11 @@ longer work.  The extensions are mentioned here for completeness:  Take
 the block name and prefix it with one of: C<In> (for example
 C<\p{Blk=Arrows}> can currently be written as C<\p{In_Arrows}>); or
 sometimes C<Is> (like C<\p{Is_Arrows}>); or sometimes no prefix at all
-(C<\p{Arrows}>).  As of this writing (Unicode 8.0) there are no
+(C<\p{Arrows}>).  As of this writing (Unicode 9.0) there are no
 conflicts with using the C<In_> prefix, but there are plenty with the
 other two forms.  For example, C<\p{Is_Hebrew}> and C<\p{Hebrew}> mean
-C<\p{Script=Hebrew}> which is NOT the same thing as C<\p{Blk=Hebrew}>.  Our
+C<\p{Script_Extensions=Hebrew}> which is NOT the same thing as
+C<\p{Blk=Hebrew}>.  Our
 advice used to be to use the C<In_> prefix as a single form way of
 specifying a block.  But Unicode 8.0 added properties whose names begin
 with C<In>, and it's now clear that it's only luck that's so far
diff --git a/pod/perlunicook.pod b/pod/perlunicook.pod
index e1693cd..ac30509 100644
--- a/pod/perlunicook.pod
+++ b/pod/perlunicook.pod
@@ -391,7 +391,7 @@ one codepoint lacking that property.
  \p{Sk}, \p{Ps}, \p{Lt}
  \p{alpha}, \p{upper}, \p{lower}
  \p{Latin}, \p{Greek}
- \p{script=Latin}, \p{script=Greek}
+ \p{script_extensions=Latin}, \p{scx=Greek}
  \p{East_Asian_Width=Wide}, \p{EA=W}
  \p{Line_Break=Hyphen}, \p{LB=HY}
  \p{Numeric_Value=4}, \p{NV=4}
diff --git a/regcharclass.h b/regcharclass.h
index 47c8d8a..d4b483c 100644
--- a/regcharclass.h
+++ b/regcharclass.h
@@ -1852,7 +1852,7 @@
 #endif /* H_REGCHARCLASS */
 
 /* Generated from:
- * de6076d81bc4e85f179377ded4c68f3b257c8f7990227d4302eca442fda558f8 lib/Unicode/UCD.pm
+ * d4d1ae3d05b9b07d066661a93af8896abe10fbf0f5cbe89575fbbc06a2928d60 lib/Unicode/UCD.pm
  * 47cb62a53beea6d0263e2147331c7e751853c9327225d95bbe2d9e1dc3e1aa44 lib/unicore/ArabicShaping.txt
  * 153f0a100c315f9f3945e78f57137611d36c44b3a975919c499fd403413fede8 lib/unicore/BidiBrackets.txt
  * fbe806975c1bf9fc9960bbaa39ff6290c42c7da8315f9cd459109b024cc1c485 lib/unicore/BidiMirroring.txt
@@ -1895,7 +1895,7 @@
  * 37f6186253da9824bdb27f4ad867bfe8c25d4dc6bdb2f05585e40a034675a348 lib/unicore/extracted/DLineBreak.txt
  * ef24061b5a5dc93d7e90c2e34530ec757180ee75d872cba65ffc946e52624ae8 lib/unicore/extracted/DNumType.txt
  * a197371fec9a1b517058b440841f60f9378d81682084eef8db22a88cb2f96e90 lib/unicore/extracted/DNumValues.txt
- * 58e546458da91e33d5cdacd9ca1b5a87868701a1e7e4eea6b0c4cf0c62fff431 lib/unicore/mktables
+ * 0cc006e22469cee3db1a55a4df1ac656c9d26a70ba920985883eb77198931c1a lib/unicore/mktables
  * cdecb300baad839a6f62791229f551a4fa33f3cbdca08e378dc976466354e778 lib/unicore/version
  * 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c regen/charset_translations.pl
  * d9c04ac46bdd81bb3e26519f2b8eb6242cb12337205add3f7cf092b0c58dccc4 regen/regcharclass.pl
diff --git a/t/uni/cache.t b/t/uni/cache.t
index 4cd9a48..41ac361 100644
--- a/t/uni/cache.t
+++ b/t/uni/cache.t
@@ -7,7 +7,7 @@ BEGIN {
 
 plan tests => 1;
 
-# Looks to see if a "do 'unicore/lib/Sc/Hira.pl'" is called more than once, by
+# Looks to see if a "do 'unicore/lib/Scx/Hira.pl'" is called more than once, by
 # putting a compile sub first on the library path;
 # XXX Kludge: requires exact path, which might change, and has deep knowledge
 # of how utf8_heavy.pl works, which might also change.
@@ -15,7 +15,7 @@ plan tests => 1;
 BEGIN { # Make sure catches compile time references
     $::count = 0;
     unshift @INC, sub {
-       $::count++ if $_[1] eq 'unicore/lib/Sc/Hira.pl';
+       $::count++ if $_[1] eq 'unicore/lib/Scx/Hira.pl';
     };
 }
 

--
Perl5 Master Repository

[perl.git] branch blead, updated. v5.25.2-68-g48791bf

Reply via email to