[perl.git] branch blead, updated. v5.21.7-445-gacdfc3b

Karl Williamson Tue, 20 Jan 2015 09:38:54 -0800

In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/acdfc3b6cf1374c974570ccc518854db69d7f8a8?hp=8792e55d18c9b4d24303caed70df6f78a7e5eb0a>


- Log -----------------------------------------------------------------
commit acdfc3b6cf1374c974570ccc518854db69d7f8a8
Author: Karl Williamson <[email protected]>
Date:   Tue Jan 20 09:48:44 2015 -0700

    regcomp.c: Add warnings under re 'strict'

M       pod/perldelta.pod
M       pod/perldiag.pod
M       regcomp.c
M       t/re/reg_mesg.t

commit f4ae5a277990f1a5a92dca0362caeaac4fa419b1
Author: Karl Williamson <[email protected]>
Date:   Mon Oct 6 16:17:58 2014 -0600

    regcomp.c: Move #define, make a function always compiled
    
    This is in preparation for the next commit.  The function previously was
    used only in DEBUGGING builds

M       embed.fnc
M       embed.h
M       proto.h
M       regcomp.c

commit c877af1b1d4b8cf208483b79695143d40560a8ee
Author: Karl Williamson <[email protected]>
Date:   Mon Jan 19 23:51:55 2015 -0700

    regcomp.c: Add warnings under re 'strict'

M       pod/perldelta.pod
M       pod/perldiag.pod
M       regcomp.c
M       t/re/reg_mesg.t

commit 21adcf33cfe83d19ce1fc78c9e222a52e661e4f4
Author: Karl Williamson <[email protected]>
Date:   Mon Jan 19 17:49:38 2015 -0700

    t/re/reg_mesg.t: Turn on $|
    
    Otherwise the output gets mixed up

M       t/re/reg_mesg.t

commit b927b7e95b7031e0f55821c537e194ad78fd3a09
Author: Karl Williamson <[email protected]>
Date:   Mon Jan 19 12:47:41 2015 -0700

    Add portablity warning for re 'strict'
    
    When a range in a bracketed character class has one end be specified as
    Unicode, the whole range is viewed as Unicode.  Currently this is not
    warned about, though it is somewhat like mixing apples and oranges.
    This commit adds a warning, but only under "use re 'strict'", and
    it now documents the only one-end behavior.

M       pod/perldelta.pod
M       pod/perldiag.pod
M       pod/perlre.pod
M       pod/perlrecharclass.pod
M       regcomp.c
M       t/re/reg_mesg.t

commit dfa1e3a5359a09e2f8ad21051412248182399696
Author: Karl Williamson <[email protected]>
Date:   Mon Jan 19 12:43:43 2015 -0700

    regcomp.c: Fix typo in comment

M       regcomp.c

commit 63374c78a9d0a9ab46fa90f9a2379f802b7cb195
Author: Karl Williamson <[email protected]>
Date:   Mon Jan 19 12:19:14 2015 -0700

    t/re/reg_mesg.t: Add support for only re 'strict' warnings
    
    This will allow future commits to specify warnings that are raised only
    when under "use re 'strict'"

M       t/re/reg_mesg.t

commit 0ca46d14552d7bc7f04ebedae1f98f8828256a07
Author: Karl Williamson <[email protected]>
Date:   Sat Jan 17 15:57:24 2015 -0700

    regcomp.c: Refactor a calculation
    
    Currently the way we calculate if the endpoints in a range in a
    [bracketed character class] are "literal" (like 'A', 'b') vs non (like
    \x{41}) is to have a count of the literal endpoints.
    
    Future commits will expand the definition of literal to include things
    that are portably-specified, including things like \t, \N{U+xx}, etc.
    It will be easier to specify that we have encountered a non-portable
    name instead of the other way around.  So that is what this commit does.
    The only non-portables are \digit, \o{}, \x{}, and \cX for all X.

M       regcomp.c
-----------------------------------------------------------------------

Summary of changes:
 embed.fnc               |   2 +-
 embed.h                 |   2 +-
 pod/perldelta.pod       |  18 +++++-
 pod/perldiag.pod        |  63 +++++++++++++++++++
 pod/perlre.pod          |  30 ++++++----
 pod/perlrecharclass.pod |  11 ++++
 proto.h                 |   6 +-
 regcomp.c               | 156 ++++++++++++++++++++++++++++++++++++++----------
 t/re/reg_mesg.t         | 103 +++++++++++++++++++++++++++-----
 9 files changed, 325 insertions(+), 66 deletions(-)

diff --git a/embed.fnc b/embed.fnc
index faccf49..6707b2b 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -2213,6 +2213,7 @@ Es        |I32    |make_trie      |NN RExC_state_t 
*pRExC_state \
                                |U32 word_count|U32 flags|U32 depth
 Es     |regnode *|construct_ahocorasick_from_trie|NN RExC_state_t *pRExC_state 
\
                                 |NN regnode *source|U32 depth
+EnPs   |const char *|cntrl_to_mnemonic|const U8 c
 #  ifdef DEBUGGING
 Es        |void        |regdump_intflags|NULLOK const char *lead| const U32 
flags
 Es     |void   |regdump_extflags|NULLOK const char *lead| const U32 flags
@@ -2221,7 +2222,6 @@ Es        |const regnode*|dumpuntil|NN const regexp *r|NN 
const regnode *start \
                                |NULLOK const regnode *last \
                                |NULLOK const regnode *plast \
                                |NN SV* sv|I32 indent|U32 depth
-EnPs   |const char *|cntrl_to_mnemonic|const U8 c
 Es     |void   |put_code_point |NN SV* sv|UV c
 Es     |bool   |put_charclass_bitmap_innards|NN SV* sv     \
                                |NN char* bitmap            \
diff --git a/embed.h b/embed.h
index 47e45c2..f0ebc96 100644
--- a/embed.h
+++ b/embed.h
@@ -928,7 +928,6 @@
 #  endif
 #  if defined(DEBUGGING)
 #    if defined(PERL_IN_REGCOMP_C)
-#define cntrl_to_mnemonic      S_cntrl_to_mnemonic
 #define dump_trie(a,b,c,d)     S_dump_trie(aTHX_ a,b,c,d)
 #define dump_trie_interim_list(a,b,c,d,e)      S_dump_trie_interim_list(aTHX_ 
a,b,c,d,e)
 #define dump_trie_interim_table(a,b,c,d,e)     S_dump_trie_interim_table(aTHX_ 
a,b,c,d,e)
@@ -960,6 +959,7 @@
 #define add_data               S_add_data
 #define add_multi_match(a,b,c) S_add_multi_match(aTHX_ a,b,c)
 #define alloc_maybe_populate_EXACT(a,b,c,d,e,f)        
S_alloc_maybe_populate_EXACT(aTHX_ a,b,c,d,e,f)
+#define cntrl_to_mnemonic      S_cntrl_to_mnemonic
 #define compute_EXACTish       S_compute_EXACTish
 #define construct_ahocorasick_from_trie(a,b,c) 
S_construct_ahocorasick_from_trie(aTHX_ a,b,c)
 #define could_it_be_a_POSIX_class      S_could_it_be_a_POSIX_class
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 09a0dfb..b6e0cdc 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -257,8 +257,6 @@ in future Perl releases in incompatible ways.  This means 
that a pattern
 that compiles today may not in a future Perl release.  This warning is
 to alert you to that risk.
 
-=item *
-
 L<Wide character (U+%X) in %s|perldiag/"Wide character (U+%X) in %s">
 
 (W locale) While in a single-byte locale (I<i.e.>, a non-UTF-8
@@ -273,6 +271,22 @@ You likely need to figure out how this multi-byte 
character got mixed up
 with your single-byte locale (or perhaps you thought you had a UTF-8
 locale, but Perl disagrees).
 
+=item *
+
+L<Both or neither range ends should be Unicode in regex; marked by E<lt>-- 
HERE in mE<sol>%sE<sol>|perldiag/"Both or neither range ends should be Unicode 
in regex; marked by <-- HERE in m/%s/">
+
+=item *
+
+L<Ranges of ASCII printables should be some subset of "0-9", "A-Z", or "a-z" 
in regex; marked by <-- HERE in mE<sol>%sE<sol>|perldiag/"Ranges of ASCII 
printables should be some subset of "0-9", "A-Z" ... [60 chars truncated]
+
+=item *
+
+L<Ranges of digits should be from the same group in regex; marked by <-- HERE 
in mE<sol>%sE<sol>|perldiag/"Ranges of digits should be from the same group in 
regex; marked by <-- HERE in m/%s/">
+
+=item *
+
+L<"%s" is more clearly written simply as "%s" in regex; marked by <-- HERE in 
mE<sol>%sE<sol>|perldiag/"%s" is more clearly written simply as "%s" in regex; 
marked by <-- HERE in mE<sol>%sE<sol>>
+
 =back
 
 =head2 Changes to Existing Diagnostics
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 4bd04cb..6fadf25 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -554,6 +554,22 @@ copiable.
 (P) When starting a new thread or returning values from a thread, Perl
 encountered an invalid data type.
 
+=item Both or neither range ends should be Unicode in regex; marked by
+<-- HERE in m/%s/
+
+(W regexp) (only under C<S<use re 'strict'>> or within C<(?[...])>)
+
+In a bracketed character class in a regular expression pattern, you
+had a range which has exactly one end of it specified using C<\N{}>, and
+the other end is specified using a non-portable mechanism.  Perl treats
+the range as a Unicode range, that is, all the characters in it are
+considered to be the Unicode characters, and which may be different code
+points on some platforms Perl runs on.  For example, C<[\N{U+06}-\x08]>
+is treated as if you had instead said C<[\N{U+06}-\N{U+08}]>, that is it
+matches the characters whose code points in Unicode are 6, 7, and 8.
+But that C<\x08> might indicate that you meant something different, so
+the warning gets raised.
+
 =item Buffer overflow in prime_env_iter: %s
 
 (W internal) A warning peculiar to VMS.  While Perl was preparing to
@@ -2873,6 +2889,14 @@ with 'useperlio'.
 (F) Your machine doesn't implement the sockatmark() functionality,
 neither as a system call nor an ioctl call (SIOCATMARK).
 
+=item "%s" is more clearly written simply as "%s" in regex; marked by <-- HERE 
in m/%s/
+
+(W regexp) (only under C<S<use re 'strict'>> or within C<(?[...])>)
+
+You specified a character that has the given plainer way of writing it,
+and which is also portable to platforms running with different character
+sets.
+
 =item $* is no longer supported
 
 (D deprecated, syntax) The special variable C<$*>, deprecated in older
@@ -3272,6 +3296,45 @@ arguments than were supplied, but might be used in the 
future for
 other cases where we can statically determine that arguments to
 functions are missing, e.g. for the L<perlfunc/pack> function.
 
+=item Ranges of ASCII printables should be some subset of "0-9", "A-Z", or
+"a-z" in regex; marked by <-- HERE in m/%s/
+
+(W regexp) (only under C<S<use re 'strict'>> or within C<(?[...])>)
+
+Stricter rules help to find typos and other errors.  Perhaps you didn't
+even intend a range here, if the C<"-"> was meant to be some other
+character, or should have been escaped (like C<"\-">).  If you did
+intend a range, the one that was used is not portable between ASCII and
+EBCDIC platforms, and doesn't have an obvious meaning to a casual
+reader.
+
+ [3-7]    # OK; Obvious and portable
+ [d-g]    # OK; Obvious and portable
+ [A-Y]    # OK; Obvious and portable
+ [A-z]    # WRONG; Not portable; not clear what is meant
+ [a-Z]    # WRONG; Not portable; not clear what is meant
+ [%-.]    # WRONG; Not portable; not clear what is meant
+ [\x41-Z] # WRONG; Not portable; not obvious to non-geek
+
+(You can force portablity by specifying a Unicode range, which means that
+the endpoints are specified by
+L<C<\N{...}>|perlrecharclass/Character Ranges>, but the meaning may
+still not be obvious.)
+The stricter rules require that ranges that start or stop with an ASCII
+character that is not a control have all their endpoints be the literal
+character, and not some escape sequence (like C<"\x41">), and the ranges
+must be all digits, or all uppercase letters, or all lowercase letters.
+
+=item Ranges of digits should be from the same group in regex; marked by
+<-- HERE in m/%s/
+
+(W regexp) (only under C<S<use re 'strict'>> or within C<(?[...])>)
+
+Stricter rules help to find typos and other errors.  You included a
+range, and at least one of the end points is a decimal digit.  Under the
+stricter rules, when this happens, both end points should be digits in
+the same group of 10 consecutive digits.
+
 =item Missing argument to -%c
 
 (F) The argument to the indicated command line switch must follow
diff --git a/pod/perlre.pod b/pod/perlre.pod
index 21e0f04..247632b 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -2333,17 +2333,25 @@ classes C<\w>, C<\W>, C<\s>, C<\S>, C<\d>, or C<\D> as 
endpoints of
 a range, the "-" is understood literally.
 
 Note also that the whole range idea is rather unportable between
-character sets--and even within character sets they may cause results
-you probably didn't expect.  A sound principle is to use only ranges
-that begin from and end at either alphabetics of equal case ([a-e],
-[A-E]), or digits ([0-9]).  Anything else is unsafe or unclear.  If in
-doubt, spell out the character sets in full.  Specifying the end points
-of the range using the C<\N{...}> syntax, using Unicode names or code
-points makes the range portable, but still likely not easily
-understandable to someone reading the code.  For example,
-C<[\N{U+04}-\N{U+07}]> means to match the Unicode code points
-C<\N{U+04}>, C<\N{U+05}>, C<\N{U+06}>, and C<\N{U+07}>, whatever their
-native values may be on the platform.
+character sets, except for four situations that Perl handles specially.
+Any subset of the ranges C<[A-Z]>, C<[a-z]>, and C<[0-9]> are guaranteed
+to match the expected subset of ASCII characters, no matter what
+character set the platform is running.  The fourth portable way to
+specify ranges is to use the C<\N{...}> syntax to specify either end
+point of the range.  For example, C<[\N{U+04}-\N{U+07}]> means to match
+the Unicode code points C<\N{U+04}>, C<\N{U+05}>, C<\N{U+06}>, and
+C<\N{U+07}>, whatever their native values may be on the platform.  Under
+L<use re 'strict'|re/'strict' mode> or within a L</C<(?[ ])>>, a warning
+is raised, if enabled, and the other end point of a range which has a
+C<\N{...}> endpoint is not portably specified.  For example,
+
+ [\N{U+00}-\x06]    # Warning under "use re 'strict'".
+
+It is hard to understand without digging what exactly matches ranges
+other than subsets of C<[A-Z]>, C<[a-z]>, and C<[0-9]>.  A sound
+principle is to use only ranges that begin from and end at either
+alphabetics of equal case ([a-e], [A-E]), or digits ([0-9]).  Anything
+else is unsafe or unclear.  If in doubt, spell out the range in full.
 
 Characters may be specified using a metacharacter syntax much like that
 used in C: "\n" matches a newline, "\t" a tab, "\r" a carriage return,
diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod
index 440ed90..4421911 100644
--- a/pod/perlrecharclass.pod
+++ b/pod/perlrecharclass.pod
@@ -619,6 +619,17 @@ endpoints.  These indicate that the specified range is to 
be interpreted
 using Unicode values, so C<[\N{U+27}-\N{U+3F}]> means to match
 C<\N{U+27}>, C<\N{U+28}>, C<\N{U+29}>, ..., C<\N{U+3D}>, C<\N{U+3E}>,
 and C<\N{U+3F}>, whatever the native code point versions for those are.
+These are called "Unicode" ranges.  If either end is of the C<\N{...}>
+form, the range is considered Unicode.  A C<regexp> warning is raised
+under C<S<"use re 'strict'">> if the other endpoint is specified
+non-portably:
+
+ [\N{U+00}-\x09]    # Warning under re 'strict'; \x09 is non-portable
+ [\N{U+00}-\t]      # No warning;
+
+Both of the above match the characters C<\N{U+00}> C<\N{U+01}>, ...
+C<\N{U+08}>, C<\N{U+09}>, but the C<\x09> looks like it could be a
+mistake so the warning is raised (under C<re 'strict'>) for it.
 
 Perl also guarantees that the ranges C<A-Z>, C<a-z>, C<0-9>, and any
 subranges of these match what an English-only speaker would expect them
diff --git a/proto.h b/proto.h
index a0c4a40..1aaa938 100644
--- a/proto.h
+++ b/proto.h
@@ -5542,9 +5542,6 @@ STATIC void       S_cv_dump(pTHX_ const CV *cv, const 
char *title)
 
 #  endif
 #  if defined(PERL_IN_REGCOMP_C)
-STATIC const char *    S_cntrl_to_mnemonic(const U8 c)
-                       __attribute__pure__;
-
 STATIC void    S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV* 
widecharmap, AV *revcharmap, U32 depth)
                        __attribute__nonnull__(pTHX_1)
                        __attribute__nonnull__(pTHX_3);
@@ -6912,6 +6909,9 @@ PERL_STATIC_INLINE void   
S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_s
 #define PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT    \
        assert(pRExC_state); assert(node); assert(flagp)
 
+STATIC const char *    S_cntrl_to_mnemonic(const U8 c)
+                       __attribute__pure__;
+
 PERL_STATIC_INLINE U8  S_compute_EXACTish(RExC_state_t *pRExC_state)
                        __attribute__nonnull__(1);
 #define PERL_ARGS_ASSERT_COMPUTE_EXACTISH      \
diff --git a/regcomp.c b/regcomp.c
index b62c30d..64a90b4 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -122,6 +122,12 @@ typedef struct scan_frame {
     struct scan_frame *next_frame;      /* next frame */
 } scan_frame;
 
+/* Certain characters are output as a sequence with the first being a
+ * backslash. */
+#define isBACKSLASHED_PUNCT(c)                                              \
+                    ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
+
+
 struct RExC_state_t {
     U32                flags;                  /* RXf_* are we folding, 
multilining? */
     U32                pm_flags;               /* PMf_* stuff from the calling 
PMOP */
@@ -641,6 +647,12 @@ static const scan_data_t zero_scan_data =
            REPORT_LOCATION_ARGS(offset));              \
 } STMT_END
 
+#define        vWARN(loc, m) STMT_START {                                      
\
+    const IV offset = loc - RExC_precomp;                              \
+    __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m 
REPORT_LOCATION,       \
+           REPORT_LOCATION_ARGS(offset));              \
+} STMT_END
+
 #define        vWARN_dep(loc, m) STMT_START {                                  
\
     const IV offset = loc - RExC_precomp;                              \
     __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m 
REPORT_LOCATION,   \
@@ -880,8 +892,6 @@ DEBUG_OPTIMISE_MORE_r(if(data){                             
         \
     PerlIO_printf(Perl_debug_log,"\n");                              \
 });
 
-#ifdef DEBUGGING
-
 /* is c a control character for which we have a mnemonic? */
 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 
@@ -905,8 +915,6 @@ S_cntrl_to_mnemonic(const U8 c)
     return NULL;
 }
 
-#endif
-
 /* Mark that we cannot extend a found fixed substring at this point.
    Update the longest found anchored substring and the longest found
    floating substrings if needed. */
@@ -13810,16 +13818,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 
*flagp, U32 depth,
      * runtime locale is UTF-8 */
     SV* only_utf8_locale_list = NULL;
 
-#ifdef EBCDIC
-    /* In a range, counts how many 0-2 of the ends of it came from literals,
-     * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
-    UV literal_endpoint = 0;
+    /* In a range, if one of the endpoints is non-character-set portable,
+     * meaning that it hard-codes a code point that may mean a different
+     * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
+     * mnemonic '\t' which each mean the same character no matter which
+     * character set the platform is on. */
+    unsigned int non_portable_endpoint = 0;
 
     /* Is the range unicode? which means on a platform that isn't 1-1 native
      * to Unicode (i.e. non-ASCII), each code point in it should be considered
      * to be a Unicode value.  */
     bool unicode_range = FALSE;
-#endif
     bool invert = FALSE;    /* Is this class to be complemented */
 
     bool warn_super = ALWAYS_WARN_SUPER;
@@ -13923,9 +13932,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth,
        if (!range) {
            rangebegin = RExC_parse;
            element_count++;
-#ifdef EBCDIC
-            literal_endpoint = 0;
-#endif
+            non_portable_endpoint = 0;
        }
        if (UTF) {
            value = utf8n_to_uvchr((U8*)RExC_parse,
@@ -13942,12 +13949,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 
*flagp, U32 depth,
         {
             namedclass = regpposixcc(pRExC_state, value, strict);
         }
-        else if (value != '\\') {
-#ifdef EBCDIC
-            literal_endpoint++;
-#endif
-        }
-        else {
+        else if (value == '\\') {
             /* Is a backslash; get the code point of the char after it */
            if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
                value = utf8n_to_uvchr((U8*)RExC_parse,
@@ -14029,13 +14031,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 
*flagp, U32 depth,
                         prevvalue = save_prevvalue;
                         continue;   /* Back to top of loop to get next char */
                     }
+
                     /* Here, is a single code point, and <value> contains it */
-#ifdef EBCDIC
-                    /* We consider named characters to be literal characters,
-                     * and they are Unicode */
-                    literal_endpoint++;
-                    unicode_range = TRUE;
-#endif
+                    unicode_range = TRUE;   /* \N{} are Unicode */
                 }
                 break;
            case 'p':
@@ -14233,6 +14231,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth,
                        vFAIL(error_msg);
                    }
                }
+                non_portable_endpoint++;
                if (IN_ENCODING && value < 0x100) {
                    goto recode_encoding;
                }
@@ -14252,11 +14251,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 
*flagp, U32 depth,
                        vFAIL(error_msg);
                    }
                }
+                non_portable_endpoint++;
                if (IN_ENCODING && value < 0x100)
                    goto recode_encoding;
                break;
            case 'c':
                value = grok_bslash_c(*RExC_parse++, PASS2);
+                non_portable_endpoint++;
                break;
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7':
@@ -14284,6 +14285,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth,
                             (void)ReREFCNT_inc(RExC_rx_sv);
                         }
                     }
+                    non_portable_endpoint++;
                    if (IN_ENCODING && value < 0x100)
                        goto recode_encoding;
                    break;
@@ -14512,8 +14514,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth,
        if (range) {
 #ifdef EBCDIC
             /* For unicode ranges, we have to test that the Unicode as opposed
-             * to the native values are not decreasing.  (Above 255, and there
-             * is no difference between native and Unicode) */
+             * to the native values are not decreasing.  (Above 255, there is
+             * no difference between native and Unicode) */
            if (unicode_range && prevvalue < 255 && value < 255) {
                 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
                     goto backwards_range;
@@ -14655,8 +14657,100 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 
*flagp, U32 depth,
             }
         }
 
+        if (strict && PASS2 && ckWARN(WARN_REGEXP)) {
+            if (range) {
+
+                /* If the range starts above 255, everything is portable and
+                 * likely to be so for any forseeable character set, so don't
+                 * warn. */
+                if (unicode_range && non_portable_endpoint && prevvalue < 256) 
{
+                    vWARN(RExC_parse, "Both or neither range ends should be 
Unicode");
+                }
+                else if (prevvalue != value) {
+
+                    /* Under strict, ranges that stop and/or end in an ASCII
+                     * printable should have each end point be a portable value
+                     * for it (preferably like 'A', but we don't warn if it is
+                     * a (portable) Unicode name or code point), and the range
+                     * must be be all digits or all letters of the same case.
+                     * Otherwise, the range is non-portable and unclear as to
+                     * what it contains */
+                    if ((isPRINT_A(prevvalue) || isPRINT_A(value))
+                        && (non_portable_endpoint
+                            || ! ((isDIGIT_A(prevvalue) && isDIGIT_A(value))
+                                   || (isLOWER_A(prevvalue) && 
isLOWER_A(value))
+                                   || (isUPPER_A(prevvalue) && 
isUPPER_A(value)))))
+                    {
+                        vWARN(RExC_parse, "Ranges of ASCII printables should 
be some subset of \"0-9\", \"A-Z\", or \"a-z\"");
+                    }
+                    else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO 
*/
+
+                        /* But the nature of Unicode and languages mean we
+                         * can't do the same checks for above-ASCII ranges,
+                         * except in the case of digit ones.  These should
+                         * contain only digits from the same group of 10.  The
+                         * ASCII case is handled just above.  0x660 is the
+                         * first digit character beyond ASCII.  Hence here, the
+                         * range could be a range of digits.  Find out.  */
+                        IV index_start = 
_invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
+                                                         prevvalue);
+                        IV index_final = 
_invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
+                                                         value);
+
+                        /* If the range start and final points are in the same
+                         * inversion list element, it means that either both
+                         * are not digits, or both are digits in a consecutive
+                         * sequence of digits.  (So far, Unicode has kept all
+                         * such sequences as distinct groups of 10, but assert
+                         * to make sure).  If the end points are not in the
+                         * same element, neither should be a digit. */
+                        if (index_start == index_final) {
+                            assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start)
+                            || 
invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
+                            - 
invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
+                            == 10);
+                        }
+                        else if ((index_start >= 0
+                                  && 
ELEMENT_RANGE_MATCHES_INVLIST(index_start))
+                                 || (index_final >= 0
+                                     && 
ELEMENT_RANGE_MATCHES_INVLIST(index_final)))
+                        {
+                            vWARN(RExC_parse, "Ranges of digits should be from 
the same group of 10");
+                        }
+                    }
+                }
+            }
+            if ((! range || prevvalue == value) && non_portable_endpoint) {
+                if (isPRINT_A(value)) {
+                    char literal[3];
+                    unsigned d = 0;
+                    if (isBACKSLASHED_PUNCT(value)) {
+                        literal[d++] = '\\';
+                    }
+                    literal[d++] = (char) value;
+                    literal[d++] = '\0';
+
+                    vWARN4(RExC_parse,
+                           "\"%.*s\" is more clearly written simply as \"%s\"",
+                           (int) (RExC_parse - rangebegin),
+                           rangebegin,
+                           literal
+                        );
+                }
+                else if isMNEMONIC_CNTRL(value) {
+                    vWARN4(RExC_parse,
+                           "\"%.*s\" is more clearly written simply as \"%s\"",
+                           (int) (RExC_parse - rangebegin),
+                           rangebegin,
+                           cntrl_to_mnemonic((char) value)
+                        );
+                }
+            }
+        }
+
         /* Deal with this element of the class */
        if (! SIZE_ONLY) {
+
 #ifndef EBCDIC
             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
                                                      prevvalue, value);
@@ -14667,7 +14761,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth,
             if ((UNLIKELY(prevvalue == 0) && value >= 255)
                 || ! (prevvalue < 256
                       && (unicode_range
-                          || (literal_endpoint == 2
+                          || (! non_portable_endpoint
                               && ((isLOWER_A(prevvalue) && isLOWER_A(value))
                                   || (isUPPER_A(prevvalue)
                                       && isUPPER_A(value)))))))
@@ -14902,7 +14996,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth,
                 if (prevvalue == 'A') {
                     if (value == 'Z'
 #ifdef EBCDIC
-                        && literal_endpoint == 2
+                        && ! non_portable_end_point
 #endif
                     ) {
                         arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
@@ -14912,7 +15006,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth,
                 else if (prevvalue == 'a') {
                     if (value == 'z'
 #ifdef EBCDIC
-                        && literal_endpoint == 2
+                        && ! non_portable_end_point
 #endif
                     ) {
                         arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
@@ -17203,10 +17297,6 @@ S_re_croak2(pTHX_ bool utf8, const char* pat1,const 
char* pat2,...)
 }
 
 #ifdef DEBUGGING
-/* Certain characters are output as a sequence with the first being a
- * backslash. */
-#define isBACKSLASHED_PUNCT(c)                                              \
-                    ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
 
 STATIC void
 S_put_code_point(pTHX_ SV *sv, UV c)
diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t
index 9238246..4abfdf7 100644
--- a/t/re/reg_mesg.t
+++ b/t/re/reg_mesg.t
@@ -1,5 +1,7 @@
 #!./perl -w
 
+$|=1;   # outherwise things get mixed up in output
+
 BEGIN {
        chdir 't' if -d 't';
        @INC = qw '../lib ../ext/re';
@@ -24,7 +26,7 @@ use open qw(:utf8 :std);
 
 sub fixup_expect {
     my $expect_ref = shift;
-    return if $expect_ref eq "";
+    return "" if $expect_ref eq "";
 
     my @expect;
     if (ref $expect_ref) {
@@ -82,6 +84,15 @@ sub mark_as_utf8 {
 my $inf_m1 = ($Config::Config{reg_infty} || 32767) - 1;
 my $inf_p1 = $inf_m1 + 2;
 
+my $B_hex = sprintf("\\x%02X", ord "B");
+my $low_mixed_alpha = ('A' lt 'a') ? 'A' : 'a';
+my $high_mixed_alpha = ('A' lt 'a') ? 'a' : 'A';
+my $low_mixed_digit = ('A' lt '0') ? 'A' : '0';
+my $high_mixed_digit = ('A' lt '0') ? '0' : 'A';
+
+my $colon_hex = sprintf "%02X", ord(":");
+my $tab_hex = sprintf "%02X", ord("\t");
+
 ##
 ## Key-value pairs of code/error of code that should have fatal errors.
 ##
@@ -483,6 +494,15 @@ my @warning = (
                   ],
     '/a{1,1}?\x{100}/' => 'Useless use of greediness modifier \'?\' {#} 
m/a{1,1}?{#}\x{100}/',
     '/b{3}  +\x{100}/x' => 'Useless use of greediness modifier \'+\' {#} 
m/b{3}  +{#}\x{100}/',
+    "/(?[ [ % - % ] ])/" => "",
+    "/(?[ [ : - \\x$colon_hex ] ])\\x{100}/" => "\": - \\x$colon_hex \" is 
more clearly written simply as \":\" {#} m/(?[ [ : - \\x$colon_hex {#}] 
])\\x{100}/",
+    "/(?[ [ \\x$colon_hex - : ] ])\\x{100}/" => "\"\\x$colon_hex\ - : \" is 
more clearly written simply as \":\" {#} m/(?[ [ \\x$colon_hex - : {#}] 
])\\x{100}/",
+    "/(?[ [ \\t - \\x$tab_hex ] ])\\x{100}/" => "\"\\t - \\x$tab_hex \" is 
more clearly written simply as \"\\t\" {#} m/(?[ [ \\t - \\x$tab_hex {#}] 
])\\x{100}/",
+    "/(?[ [ \\x$tab_hex - \\t ] ])\\x{100}/" => "\"\\x$tab_hex\ - \\t \" is 
more clearly written simply as \"\\t\" {#} m/(?[ [ \\x$tab_hex - \\t {#}] 
])\\x{100}/",
+    "/(?[ [ $B_hex - C ] ])/" => "Ranges of ASCII printables should be some 
subset of \"0-9\", \"A-Z\", or \"a-z\" {#} m/(?[ [ $B_hex - C {#}] ])/",
+    "/(?[ [ A - $B_hex ] ])/" => "Ranges of ASCII printables should be some 
subset of \"0-9\", \"A-Z\", or \"a-z\" {#} m/(?[ [ A - $B_hex {#}] ])/",
+    "/(?[ [ $low_mixed_alpha - $high_mixed_alpha ] ])/" => "Ranges of ASCII 
printables should be some subset of \"0-9\", \"A-Z\", or \"a-z\" {#} m/(?[ [ 
$low_mixed_alpha - $high_mixed_alpha {#}] ])/" ... [1 chars truncated]
+    "/(?[ [ $low_mixed_digit - $high_mixed_digit ] ])/" => "Ranges of ASCII 
printables should be some subset of \"0-9\", \"A-Z\", or \"a-z\" {#} m/(?[ [ 
$low_mixed_digit - $high_mixed_digit {#}] ])/" ... [1 chars truncated]
 ); # See comments before this for why '\x{100}' is generally needed
 
 # These need the character 'ã' as a marker for mark_as_utf8()
@@ -501,6 +521,39 @@ my @warnings_utf8 = mark_as_utf8(
 
 push @warning, @warnings_utf8;
 
+my @warning_only_under_strict = (
+    '/[\N{U+00}-\x01]\x{100}/' => 'Both or neither range ends should be 
Unicode {#} m/[\N{U+00}-\x01{#}]\x{100}/',
+    '/[\x00-\N{SOH}]\x{100}/' => 'Both or neither range ends should be Unicode 
{#} m/[\x00-\N{U+01}{#}]\x{100}/',
+    '/[\N{DEL}-\o{377}]\x{100}/' => 'Both or neither range ends should be 
Unicode {#} m/[\N{U+7F}-\o{377}{#}]\x{100}/',
+    '/[\o{0}-\N{U+01}]\x{100}/' => 'Both or neither range ends should be 
Unicode {#} m/[\o{0}-\N{U+01}{#}]\x{100}/',
+    '/[\000-\N{U+01}]\x{100}/' => 'Both or neither range ends should be 
Unicode {#} m/[\000-\N{U+01}{#}]\x{100}/',
+    '/[\N{DEL}-\377]\x{100}/' => 'Both or neither range ends should be Unicode 
{#} m/[\N{U+7F}-\377{#}]\x{100}/',
+    '/[\N{U+00}-A]\x{100}/' => 'Ranges of ASCII printables should be some 
subset of "0-9", "A-Z", or "a-z" {#} m/[\N{U+00}-A{#}]\x{100}/',
+    '/[a-\N{U+FF}]\x{100}/' => 'Ranges of ASCII printables should be some 
subset of "0-9", "A-Z", or "a-z" {#} m/[a-\N{U+FF}{#}]\x{100}/',
+    '/[\N{U+00}-\a]\x{100}/' => "",
+    '/[\a-\N{U+FF}]\x{100}/' => "",
+    '/[\N{U+FF}-\x{100}]/' => 'Both or neither range ends should be Unicode 
{#} m/[\N{U+FF}-\x{100}{#}]/',
+    '/[\N{U+100}-\x{101}]/' => "",
+    "/[%-%]/" => "",
+    "/[:-\\x$colon_hex]\\x{100}/" => "\":-\\x$colon_hex\" is more clearly 
written simply as \":\" {#} m/[:-\\x$colon_hex\{#}]\\x{100}/",
+    "/[\\x$colon_hex-:]\\x{100}/" => "\"\\x$colon_hex-:\" is more clearly 
written simply as \":\" {#} m/[\\x$colon_hex\-:{#}]\\x{100}/",
+    "/[\\t-\\x$tab_hex]\\x{100}/" => "\"\\t-\\x$tab_hex\" is more clearly 
written simply as \"\\t\" {#} m/[\\t-\\x$tab_hex\{#}]\\x{100}/",
+    "/[\\x$tab_hex-\\t]\\x{100}/" => "\"\\x$tab_hex-\\t\" is more clearly 
written simply as \"\\t\" {#} m/[\\x$tab_hex\-\\t{#}]\\x{100}/",
+    "/[$B_hex-C]/" => "Ranges of ASCII printables should be some subset of 
\"0-9\", \"A-Z\", or \"a-z\" {#} m/[$B_hex-C{#}]/",
+    "/[A-$B_hex]/" => "Ranges of ASCII printables should be some subset of 
\"0-9\", \"A-Z\", or \"a-z\" {#} m/[A-$B_hex\{#}]/",
+    "/[$low_mixed_alpha-$high_mixed_alpha]/" => "Ranges of ASCII printables 
should be some subset of \"0-9\", \"A-Z\", or \"a-z\" {#} 
m/[$low_mixed_alpha-$high_mixed_alpha\{#}]/",
+    "/[$low_mixed_digit-$high_mixed_digit]/" => "Ranges of ASCII printables 
should be some subset of \"0-9\", \"A-Z\", or \"a-z\" {#} 
m/[$low_mixed_digit-$high_mixed_digit\{#}]/",
+);
+
+my @warning_utf8_only_under_strict = mark_as_utf8(
+ '/ã[áª-áª]/; #no latin1' => "Ranges of digits should be from the same 
group of 10 {#} m/ã[áª-áª{#}]/",
+ '/ã(?[ [ áª - áª ] ])/; #no latin1' => "Ranges of digits should be from 
the same group of 10 {#} m/ã(?[ [ áª - áª {#}] ])/",
+ '/ã[á§-á§]/; #no latin1' => "Ranges of digits should be from the same 
group of 10 {#} m/ã[á§-á§{#}]/",
+ '/ã(?[ [ á§ - á§ ] ])/; #no latin1' => "Ranges of digits should be from 
the same group of 10 {#} m/ã(?[ [ á§ - á§ {#}] ])/",
+);
+
+push @warning_only_under_strict, @warning_utf8_only_under_strict;
+
 my @experimental_regex_sets = (
     '/(?[ \t ])/' => 'The regex_sets feature is experimental {#} m/(?[{#} \t 
])/',
     'use utf8; /utf8 ã (?[ [\tã] ])/' => do { use utf8; 'The regex_sets 
feature is experimental {#} m/utf8 ã (?[{#} [\tã] ])/' },
@@ -552,29 +605,42 @@ for my $strict ("", "use re 'strict';") {
     }
 }
 
-for my $strict ("no warnings 'experimental::re_strict'; use re 'strict';", "") 
{
+for my $strict ("",  "no warnings 'experimental::re_strict'; use re 
'strict';") {
+    my @warning_tests = @warning;
 
-    # First time through we use strict to make sure that that doesn't change
-    # any of the warnings into fatal, and outputs them correctly.  The second
-    # time we don't use strict, and add the messages that are warnings when
-    # not under strict to the list of warnings.  This checks that non-strict
-    # works.
-    if (! $strict) {
+    # Build the tests for @warning.  Use the strict/non-strict versions
+    # appropriately.
+    if ($strict) {
+        push @warning_tests, @warning_only_under_strict;
+    }
+    else {
+        for (my $i = 0; $i < @warning_only_under_strict; $i += 2) {
+            if ($warning_only_under_strict[$i] =~ /\Q(?[/) {
+                push @warning_tests, $warning_only_under_strict[$i],  # The 
regex
+                                    $warning_only_under_strict[$i+1];
+            }
+            else {
+                push @warning_tests, $warning_only_under_strict[$i],  # The 
regex
+                                    "";    # No warning because not strict
+            }
+        }
         for (my $i = 0; $i < @death_only_under_strict; $i += 3) {
-            push @warning, $death_only_under_strict[$i],    # The regex
-                           $death_only_under_strict[$i+1];  # The warning
+            push @warning_tests, $death_only_under_strict[$i],    # The regex
+                                 $death_only_under_strict[$i+1];  # The warning
         }
         for (my $i = 0; $i < @death_utf8_only_under_strict; $i += 3) {
-            push @warning, mark_as_utf8($death_utf8_only_under_strict[$i],
+            push @warning_tests, 
mark_as_utf8($death_utf8_only_under_strict[$i],
                                         $death_utf8_only_under_strict[$i+1]);
         }
     }
 
-    foreach my $ref (\@warning, \@experimental_regex_sets, \@deprecated) {
+    foreach my $ref (\@warning_tests, \@experimental_regex_sets, \@deprecated) 
{
         my $warning_type;
+        my $turn_off_warnings = "";
         my $default_on;
-        if ($ref == \@warning) {
+        if ($ref == \@warning_tests) {
             $warning_type = 'regexp, digit';
+            $turn_off_warnings = "no warnings 'experimental::regex_sets';";
             $default_on = $strict;
         }
         elsif ($ref == \@deprecated) {
@@ -588,6 +654,11 @@ for my $strict ("no warnings 'experimental::re_strict'; 
use re 'strict';", "") {
         for (my $i = 0; $i < @$ref; $i += 2) {
             my $regex = $ref->[$i];
             my @expect = fixup_expect($ref->[$i+1]);
+
+            # A length-1 array with an empty warning means no warning gets
+            # generated at all.
+            undef @expect if @expect == 1 && $expect[0] eq "";
+
             {
                 $_ = "x";
                 eval "$strict no warnings; $regex";
@@ -595,7 +666,7 @@ for my $strict ("no warnings 'experimental::re_strict'; use 
re 'strict';", "") {
             if (is($@, "", "$strict $regex did not die")) {
                 my @got = capture_warnings(sub {
                                         $_ = "x";
-                                        eval "$strict $regex" });
+                                        eval "$strict $turn_off_warnings 
$regex" });
                 my $count = @expect;
                 if (! is(scalar @got, scalar @expect,
                             "... and gave expected number ($count) of 
warnings"))
@@ -631,7 +702,9 @@ for my $strict ("no warnings 'experimental::re_strict'; use 
re 'strict';", "") {
                         local $^W;
                         my @warns = capture_warnings(sub { $_ = "x";
                                                         eval "$strict $regex" 
});
-                        if ($default_on) {
+                        # Warning should be on as well if is testing
+                        # '(?[...])' which turns on strict
+                        if ($default_on || grep { $_ =~ /\Q(?[/ } @expect ) {
                            ok @warns > 0, "... and the warning is on by 
default";
                         }
                         else {

--
Perl5 Master Repository

[perl.git] branch blead, updated. v5.21.7-445-gacdfc3b

Reply via email to