In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/38684baa0525ac95a5bcc5f409d91ff31f9fe242?hp=c8536afa2abc39d901062df19a839a4209513974>

- Log -----------------------------------------------------------------
commit 38684baa0525ac95a5bcc5f409d91ff31f9fe242
Author: Karl Williamson <[email protected]>
Date:   Sun Dec 19 14:40:34 2010 -0700

    utf8.c: add to comment

M       utf8.c

commit 453c8cca713cdc34c3ebf8e55115e4c5a5313379
Author: Karl Williamson <[email protected]>
Date:   Sun Dec 19 11:59:31 2010 -0700

    regexec.c: Remove unnecessary statements
    
    These variables are set to other values just a couple of lines below

M       regexec.c

commit 85c006b64da3a6adb26786871a367c7b75119d2e
Author: Karl Williamson <[email protected]>
Date:   Sun Dec 19 11:37:06 2010 -0700

    perltodo: Revise utf8 todo

M       pod/perltodo.pod

commit ff97e5cf7f9d89732c45b74ff5abc53519433776
Author: Karl Williamson <[email protected]>
Date:   Sun Dec 19 12:24:07 2010 -0700

    utf8.c, .h: Clarify pod and comment

M       utf8.c
M       utf8.h

commit 6426c51b011a78db74b51fc0517bb3f21cef8dc1
Author: Karl Williamson <[email protected]>
Date:   Sun Dec 19 12:24:54 2010 -0700

    mktables: fix typo in comment

M       lib/unicore/mktables

commit c89df6cf6f70d6460ca3fec9d465e5e6e17fb3a7
Author: Karl Williamson <[email protected]>
Date:   Sun Dec 19 11:08:47 2010 -0700

    Change regexes to debug dump non-ASCII as hex.
    
    This uses hex instead of the less familiar octal for larger values.
    Perhaps the actual character should be printed instead, but hex is far
    easier to understand than the previous octal output.

M       perl.h
M       regcomp.c
M       regcomp.h
M       regexec.c

commit 681f01c2a5ff0846090d78599b3d4caeb93fda26
Author: Karl Williamson <[email protected]>
Date:   Sun Dec 19 11:00:49 2010 -0700

    pv_escape: Add option to dump all non-ascii as hex
    
    This patch adds an option to pv_escape() to dump all characters above ASCII
    in hex.  Before, you could get either all chars as hex, or the Latin1
    non-ASCII chars as octal, even though people typically think of these
    values in hex.

M       dump.c
M       perl.h
-----------------------------------------------------------------------

Summary of changes:
 dump.c               |   14 +++++++++-----
 lib/unicore/mktables |    2 +-
 perl.h               |    3 ++-
 pod/perltodo.pod     |   11 +++++++----
 regcomp.c            |    1 +
 regcomp.h            |    6 +++---
 regexec.c            |    6 +-----
 utf8.c               |    8 +++++---
 utf8.h               |    2 +-
 9 files changed, 30 insertions(+), 23 deletions(-)

diff --git a/dump.c b/dump.c
index e7ae8b7..68d3745 100644
--- a/dump.c
+++ b/dump.c
@@ -232,10 +232,11 @@ if PERL_PV_ESCAPE_UNI_DETECT is set then the input string is scanned
 using C<is_utf8_string()> to determine if it is Unicode.
 
 If PERL_PV_ESCAPE_ALL is set then all input chars will be output
-using C<\x01F1> style escapes, otherwise only chars above 255 will be
-escaped using this style, other non printable chars will use octal or
-common escaped patterns like C<\n>. If PERL_PV_ESCAPE_NOBACKSLASH
-then all chars below 255 will be treated as printable and 
+using C<\x01F1> style escapes, otherwise if PERL_PV_ESCAPE_NONASCII is set, only
+chars above 127 will be escaped using this style; otherwise, only chars above
+255 will be so escaped; other non printable chars will use octal or
+common escaped patterns like C<\n>. Otherwise, if PERL_PV_ESCAPE_NOBACKSLASH
+is set, then all chars below 255 will be treated as printable and
 will be output as literals.
 
 If PERL_PV_ESCAPE_FIRSTCHAR is set then only the first char of the
@@ -284,7 +285,10 @@ Perl_pv_escape( pTHX_ SV *dsv, char const * const str,
         const UV u= (isuni) ? utf8_to_uvchr((U8*)pv, &readsize) : (U8)*pv;     
       
         const U8 c = (U8)u & 0xFF;
         
-        if ( ( u > 255 ) || (flags & PERL_PV_ESCAPE_ALL)) {
+        if ( ( u > 255 )
+         || (flags & PERL_PV_ESCAPE_ALL)
+         || (( u > 127 ) && (flags & PERL_PV_ESCAPE_NONASCII)))
+       {
             if (flags & PERL_PV_ESCAPE_FIRSTCHAR) 
                 chsize = my_snprintf( octbuf, PV_ESCAPE_OCTBUFSIZE, 
                                       "%"UVxf, u);
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index 824cdd1..d438d21 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -478,7 +478,7 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
 #
 # Here are some observations about some of the issues in early versions:
 #
-# The number of code points in \p{alpha} halve in 2.1.9.  It turns out that
+# The number of code points in \p{alpha} halved in 2.1.9.  It turns out that
 # the reason is that the CJK block starting at 4E00 was removed from PropList,
 # and was not put back in until 3.1.0
 #
diff --git a/perl.h b/perl.h
index 151e7bd..567150d 100644
--- a/perl.h
+++ b/perl.h
@@ -6159,6 +6159,7 @@ extern void moncontrol(int);
 
 #define PERL_PV_ESCAPE_UNI          0x0100
 #define PERL_PV_ESCAPE_UNI_DETECT   0x0200
+#define PERL_PV_ESCAPE_NONASCII     0x0400
 
 #define PERL_PV_ESCAPE_ALL         0x1000
 #define PERL_PV_ESCAPE_NOBACKSLASH  0x2000
@@ -6169,7 +6170,7 @@ extern void moncontrol(int);
 
 /* used by pv_display in dump.c*/
 #define PERL_PV_PRETTY_DUMP  PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE
-#define PERL_PV_PRETTY_REGPROP PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_LTGT|PERL_PV_ESCAPE_RE
+#define PERL_PV_PRETTY_REGPROP PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_LTGT|PERL_PV_ESCAPE_RE|PERL_PV_ESCAPE_NONASCII
 
 /*
 
diff --git a/pod/perltodo.pod b/pod/perltodo.pod
index 4eda992..3bd0c06 100644
--- a/pod/perltodo.pod
+++ b/pod/perltodo.pod
@@ -966,10 +966,13 @@ years for this discrepancy.
 
 =head2 UTF-8 revamp
 
-The handling of Unicode is unclean in many places. For example, the regexp
-engine matches in Unicode semantics whenever the string or the pattern is
-flagged as UTF-8, but that should not be dependent on an internal storage
-detail of the string.
+The handling of Unicode is unclean in many places.  In the regex engine
+there are especially many problems.  The swash data structure could be
+replaced by something better.  Inversion lists and maps are likely
+candidates.  The whole Unicode database could be placed in-core for a
+huge speed-up.  Only minimal work was done on the optimizer when utf8
+was added, with the result that the synthetic start class often will
+fail to narrow down the possible choices when given non-Latin1 input.
 
 =head2 Properly Unicode safe tokeniser and pads.
 
diff --git a/regcomp.c b/regcomp.c
index fb9c606..122c560 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -9610,6 +9610,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
         * --jhi */
        pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
                  PERL_PV_ESCAPE_UNI_DETECT |
+                 PERL_PV_ESCAPE_NONASCII   |
                  PERL_PV_PRETTY_ELLIPSES   |
                  PERL_PV_PRETTY_LTGT       |
                  PERL_PV_PRETTY_NOCLEAR
diff --git a/regcomp.h b/regcomp.h
index c140089..00fd945 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -822,20 +822,20 @@ re.pm, especially to the documentation.
     const char * const rpv =                          \
         pv_pretty((dsv), (pv), (l), (m), \
             PL_colors[(c1)],PL_colors[(c2)], \
-            PERL_PV_ESCAPE_RE |((isuni) ? PERL_PV_ESCAPE_UNI : 0) );         \
+            PERL_PV_ESCAPE_RE|PERL_PV_ESCAPE_NONASCII |((isuni) ? PERL_PV_ESCAPE_UNI : 0) );         \
     const int rlen = SvCUR(dsv)
 
 #define RE_SV_ESCAPE(rpv,isuni,dsv,sv,m) \
     const char * const rpv =                          \
         pv_pretty((dsv), (SvPV_nolen_const(sv)), (SvCUR(sv)), (m), \
             PL_colors[(c1)],PL_colors[(c2)], \
-            PERL_PV_ESCAPE_RE |((isuni) ? PERL_PV_ESCAPE_UNI : 0) )
+            PERL_PV_ESCAPE_RE|PERL_PV_ESCAPE_NONASCII |((isuni) ? PERL_PV_ESCAPE_UNI : 0) )
 
 #define RE_PV_QUOTED_DECL(rpv,isuni,dsv,pv,l,m)                    \
     const char * const rpv =                                       \
         pv_pretty((dsv), (pv), (l), (m), \
             PL_colors[0], PL_colors[1], \
-            ( PERL_PV_PRETTY_QUOTE | PERL_PV_ESCAPE_RE | PERL_PV_PRETTY_ELLIPSES | \
+            ( PERL_PV_PRETTY_QUOTE | PERL_PV_ESCAPE_RE | PERL_PV_ESCAPE_NONASCII | PERL_PV_PRETTY_ELLIPSES | \
               ((isuni) ? PERL_PV_ESCAPE_UNI : 0))                  \
         )
 
diff --git a/regexec.c b/regexec.c
index 591018a..7778992 100644
--- a/regexec.c
+++ b/regexec.c
@@ -3493,7 +3493,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                    ST.nextword,
                    tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
                            PL_colors[0], PL_colors[1],
-                           (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
+                           (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
                        ) 
                    : "not compiled under -Dr",
                    PL_colors[5] );
@@ -3937,10 +3937,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
            re_fold_t folder;
            const U8 *fold_array;
 
-           folder = NULL;      /* NULL assumes will be NREF, REF: no
-                                  folding */
-           fold_array = NULL;
-
            PL_reg_flags |= RF_tainted;
            folder = foldEQ_locale;
            fold_array = PL_fold_locale;
diff --git a/utf8.c b/utf8.c
index e615d7b..fa30a67 100644
--- a/utf8.c
+++ b/utf8.c
@@ -980,9 +980,10 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 /*
 =for apidoc bytes_to_utf8
 
-Converts a string C<s> of length C<len> from the native encoding into UTF-8.
+Converts a string C<s> of length C<len> bytes from the native encoding into
+UTF-8.
 Returns a pointer to the newly-created string, and sets C<len> to
-reflect the new length.
+reflect the new length in bytes.
 
 A NUL character will be written after the end of the string.
 
@@ -1968,7 +1969,8 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits
  * return several Unicode characters for a single Unicode character
  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
  * the lower-level routine, and it is similarly broken for returning
- * multiple values.  --jhi */
+ * multiple values.  --jhi
+ * For those, you should use to_utf8_case() instead */
 /* Now SWASHGET is recasted into S_swash_get in this file. */
 
 /* Note:
diff --git a/utf8.h b/utf8.h
index be14a94..405b8b4 100644
--- a/utf8.h
+++ b/utf8.h
@@ -234,7 +234,7 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
 #define UTF8_ALLOW_CONTINUATION                0x0002
 #define UTF8_ALLOW_NON_CONTINUATION    0x0004
 #define UTF8_ALLOW_FE_FF               0x0008 /* Allow FE or FF start bytes, \
-                                                 yields above 0x7fffFFFF */
+                                                 yields above 0x7fffFFFF = 31 bits */
 #define UTF8_ALLOW_SHORT               0x0010 /* expecting more bytes */
 #define UTF8_ALLOW_SURROGATE           0x0020
 #define UTF8_ALLOW_FFFF                        0x0040 /* Allow UNICODE_ILLEGAL */

--
Perl5 Master Repository

Reply via email to