In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/38684baa0525ac95a5bcc5f409d91ff31f9fe242?hp=c8536afa2abc39d901062df19a839a4209513974>
- Log ----------------------------------------------------------------- commit 38684baa0525ac95a5bcc5f409d91ff31f9fe242 Author: Karl Williamson <[email protected]> Date: Sun Dec 19 14:40:34 2010 -0700 utf8.c: add to comment M utf8.c commit 453c8cca713cdc34c3ebf8e55115e4c5a5313379 Author: Karl Williamson <[email protected]> Date: Sun Dec 19 11:59:31 2010 -0700 regexec.c: Remove unnecessary statements These variables are set to other values just a couple of lines below M regexec.c commit 85c006b64da3a6adb26786871a367c7b75119d2e Author: Karl Williamson <[email protected]> Date: Sun Dec 19 11:37:06 2010 -0700 perltodo: Revise utf8 todo M pod/perltodo.pod commit ff97e5cf7f9d89732c45b74ff5abc53519433776 Author: Karl Williamson <[email protected]> Date: Sun Dec 19 12:24:07 2010 -0700 utf8.c, .h: Clarify pod and comment M utf8.c M utf8.h commit 6426c51b011a78db74b51fc0517bb3f21cef8dc1 Author: Karl Williamson <[email protected]> Date: Sun Dec 19 12:24:54 2010 -0700 mktables: fix typo in comment M lib/unicore/mktables commit c89df6cf6f70d6460ca3fec9d465e5e6e17fb3a7 Author: Karl Williamson <[email protected]> Date: Sun Dec 19 11:08:47 2010 -0700 Change regexes to debug dump non-ASCII as hex. instead of the less familiar octal for larger values. Perhaps they should actually print the actual character, but this is far easier than the previous to understand. M perl.h M regcomp.c M regcomp.h M regexec.c commit 681f01c2a5ff0846090d78599b3d4caeb93fda26 Author: Karl Williamson <[email protected]> Date: Sun Dec 19 11:00:49 2010 -0700 pv_escape: Add option to dump all non-ascii as hex This patch adds an option to pv_escape() to dump all characters above ASCII in hex. Before, you could get all chars as hex or the Latin1 non-ASCII as octal, whereas the typical values for these that people think in are given in hex. 
M dump.c M perl.h ----------------------------------------------------------------------- Summary of changes: dump.c | 14 +++++++++----- lib/unicore/mktables | 2 +- perl.h | 3 ++- pod/perltodo.pod | 11 +++++++---- regcomp.c | 1 + regcomp.h | 6 +++--- regexec.c | 6 +----- utf8.c | 8 +++++--- utf8.h | 2 +- 9 files changed, 30 insertions(+), 23 deletions(-) diff --git a/dump.c b/dump.c index e7ae8b7..68d3745 100644 --- a/dump.c +++ b/dump.c @@ -232,10 +232,11 @@ if PERL_PV_ESCAPE_UNI_DETECT is set then the input string is scanned using C<is_utf8_string()> to determine if it is Unicode. If PERL_PV_ESCAPE_ALL is set then all input chars will be output -using C<\x01F1> style escapes, otherwise only chars above 255 will be -escaped using this style, other non printable chars will use octal or -common escaped patterns like C<\n>. If PERL_PV_ESCAPE_NOBACKSLASH -then all chars below 255 will be treated as printable and +using C<\x01F1> style escapes, otherwise if PERL_PV_ESCAPE_NONASCII is set, only +chars above 127 will be escaped using this style; otherwise, only chars above +255 will be so escaped; other non printable chars will use octal or +common escaped patterns like C<\n>. Otherwise, if PERL_PV_ESCAPE_NOBACKSLASH +then all chars below 255 will be treated as printable and will be output as literals. If PERL_PV_ESCAPE_FIRSTCHAR is set then only the first char of the @@ -284,7 +285,10 @@ Perl_pv_escape( pTHX_ SV *dsv, char const * const str, const UV u= (isuni) ? 
utf8_to_uvchr((U8*)pv, &readsize) : (U8)*pv; const U8 c = (U8)u & 0xFF; - if ( ( u > 255 ) || (flags & PERL_PV_ESCAPE_ALL)) { + if ( ( u > 255 ) + || (flags & PERL_PV_ESCAPE_ALL) + || (( u > 127 ) && (flags & PERL_PV_ESCAPE_NONASCII))) + { if (flags & PERL_PV_ESCAPE_FIRSTCHAR) chsize = my_snprintf( octbuf, PV_ESCAPE_OCTBUFSIZE, "%"UVxf, u); diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 824cdd1..d438d21 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -478,7 +478,7 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/'; # # Here are some observations about some of the issues in early versions: # -# The number of code points in \p{alpha} halve in 2.1.9. It turns out that +# The number of code points in \p{alpha} halved in 2.1.9. It turns out that # the reason is that the CJK block starting at 4E00 was removed from PropList, # and was not put back in until 3.1.0 # diff --git a/perl.h b/perl.h index 151e7bd..567150d 100644 --- a/perl.h +++ b/perl.h @@ -6159,6 +6159,7 @@ extern void moncontrol(int); #define PERL_PV_ESCAPE_UNI 0x0100 #define PERL_PV_ESCAPE_UNI_DETECT 0x0200 +#define PERL_PV_ESCAPE_NONASCII 0x0400 #define PERL_PV_ESCAPE_ALL 0x1000 #define PERL_PV_ESCAPE_NOBACKSLASH 0x2000 @@ -6169,7 +6170,7 @@ extern void moncontrol(int); /* used by pv_display in dump.c*/ #define PERL_PV_PRETTY_DUMP PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE -#define PERL_PV_PRETTY_REGPROP PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_LTGT|PERL_PV_ESCAPE_RE +#define PERL_PV_PRETTY_REGPROP PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_LTGT|PERL_PV_ESCAPE_RE|PERL_PV_ESCAPE_NONASCII /* diff --git a/pod/perltodo.pod b/pod/perltodo.pod index 4eda992..3bd0c06 100644 --- a/pod/perltodo.pod +++ b/pod/perltodo.pod @@ -966,10 +966,13 @@ years for this discrepancy. =head2 UTF-8 revamp -The handling of Unicode is unclean in many places. 
For example, the regexp -engine matches in Unicode semantics whenever the string or the pattern is -flagged as UTF-8, but that should not be dependent on an internal storage -detail of the string. +The handling of Unicode is unclean in many places. In the regex engine +there are especially many problems. The swash data structure could be +replaced by something better. Inversion lists and maps are likely +candidates. The whole Unicode database could be placed in-core for a +huge speed-up. Only minimal work was done on the optimizer when utf8 +was added, with the result that the synthetic start class often will +fail to narrow down the possible choices when given non-Latin1 input. =head2 Properly Unicode safe tokeniser and pads. diff --git a/regcomp.c b/regcomp.c index fb9c606..122c560 100644 --- a/regcomp.c +++ b/regcomp.c @@ -9610,6 +9610,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) * --jhi */ pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1], PERL_PV_ESCAPE_UNI_DETECT | + PERL_PV_ESCAPE_NONASCII | PERL_PV_PRETTY_ELLIPSES | PERL_PV_PRETTY_LTGT | PERL_PV_PRETTY_NOCLEAR diff --git a/regcomp.h b/regcomp.h index c140089..00fd945 100644 --- a/regcomp.h +++ b/regcomp.h @@ -822,20 +822,20 @@ re.pm, especially to the documentation. const char * const rpv = \ pv_pretty((dsv), (pv), (l), (m), \ PL_colors[(c1)],PL_colors[(c2)], \ - PERL_PV_ESCAPE_RE |((isuni) ? PERL_PV_ESCAPE_UNI : 0) ); \ + PERL_PV_ESCAPE_RE|PERL_PV_ESCAPE_NONASCII |((isuni) ? PERL_PV_ESCAPE_UNI : 0) ); \ const int rlen = SvCUR(dsv) #define RE_SV_ESCAPE(rpv,isuni,dsv,sv,m) \ const char * const rpv = \ pv_pretty((dsv), (SvPV_nolen_const(sv)), (SvCUR(sv)), (m), \ PL_colors[(c1)],PL_colors[(c2)], \ - PERL_PV_ESCAPE_RE |((isuni) ? PERL_PV_ESCAPE_UNI : 0) ) + PERL_PV_ESCAPE_RE|PERL_PV_ESCAPE_NONASCII |((isuni) ? 
PERL_PV_ESCAPE_UNI : 0) ) #define RE_PV_QUOTED_DECL(rpv,isuni,dsv,pv,l,m) \ const char * const rpv = \ pv_pretty((dsv), (pv), (l), (m), \ PL_colors[0], PL_colors[1], \ - ( PERL_PV_PRETTY_QUOTE | PERL_PV_ESCAPE_RE | PERL_PV_PRETTY_ELLIPSES | \ + ( PERL_PV_PRETTY_QUOTE | PERL_PV_ESCAPE_RE | PERL_PV_ESCAPE_NONASCII | PERL_PV_PRETTY_ELLIPSES | \ ((isuni) ? PERL_PV_ESCAPE_UNI : 0)) \ ) diff --git a/regexec.c b/regexec.c index 591018a..7778992 100644 --- a/regexec.c +++ b/regexec.c @@ -3493,7 +3493,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) ST.nextword, tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0, PL_colors[0], PL_colors[1], - (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII ) : "not compiled under -Dr", PL_colors[5] ); @@ -3937,10 +3937,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) re_fold_t folder; const U8 *fold_array; - folder = NULL; /* NULL assumes will be NREF, REF: no - folding */ - fold_array = NULL; - PL_reg_flags |= RF_tainted; folder = foldEQ_locale; fold_array = PL_fold_locale; diff --git a/utf8.c b/utf8.c index e615d7b..fa30a67 100644 --- a/utf8.c +++ b/utf8.c @@ -980,9 +980,10 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8) /* =for apidoc bytes_to_utf8 -Converts a string C<s> of length C<len> from the native encoding into UTF-8. +Converts a string C<s> of length C<len> bytes from the native encoding into +UTF-8. Returns a pointer to the newly-created string, and sets C<len> to -reflect the new length. +reflect the new length in bytes. A NUL character will be written after the end of the string. @@ -1968,7 +1969,8 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits * return several Unicode characters for a single Unicode character * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is * the lower-level routine, and it is similarly broken for returning - * multiple values. 
--jhi */ + * multiple values. --jhi + * For those, you should use to_utf8_case() instead */ /* Now SWASHGET is recasted into S_swash_get in this file. */ /* Note: diff --git a/utf8.h b/utf8.h index be14a94..405b8b4 100644 --- a/utf8.h +++ b/utf8.h @@ -234,7 +234,7 @@ Perl's extended UTF-8 means we can have start bytes up to FF. #define UTF8_ALLOW_CONTINUATION 0x0002 #define UTF8_ALLOW_NON_CONTINUATION 0x0004 #define UTF8_ALLOW_FE_FF 0x0008 /* Allow FE or FF start bytes, \ - yields above 0x7fffFFFF */ + yields above 0x7fffFFFF = 31 bits */ #define UTF8_ALLOW_SHORT 0x0010 /* expecting more bytes */ #define UTF8_ALLOW_SURROGATE 0x0020 #define UTF8_ALLOW_FFFF 0x0040 /* Allow UNICODE_ILLEGAL */ -- Perl5 Master Repository
