[perl.git] branch blead, updated. v5.13.5-480-ga12cf05

Jesse Vincent Fri, 15 Oct 2010 08:10:06 -0700

In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/a12cf05f80a65e40fe339b086ab2d10e18d838c1?hp=9e126ece33f033d683c5b2ae7c4afac515332519>


- Log -----------------------------------------------------------------
commit a12cf05f80a65e40fe339b086ab2d10e18d838c1
Author: Karl Williamson <pub...@khwilliamson.com>
Date:   Thu Sep 23 23:36:40 2010 -0600

    Subject: [perl #58182] partial: Add uni \s,\w matching
    
    This commit causes regex sequences \b, \s, and \w (and complements) to
    match in the latin1 range in the scope of feature 'unicode_strings' or
    with the /u regex modifier.
    
    It uses the previously unused flags field in the respective regnodes to
    indicate the type of matching, and in regexec.c, uses that to decide
    which of the handy.h macros to use, native or Latin1.
    
    I chose this for now rather than create new nodes for each type of
    match.  An earlier version of this patch did that, and in every case the
    switch case: statements were adjacent, offering no performance
    advantage.  If regexec were modified to use in-line functions or more
    macros for various short section of it, then it would be faster to have
    new nodes rather than using the flags field.  But, using that field
    simplified things, as this change flies under the radar in a number of
    places where it would not if separate nodes were used.

M       lib/feature/unicode_strings.t
M       pod/perldelta.pod
M       pod/perlre.pod
M       pod/perlrecharclass.pod
M       pod/perlunicode.pod
M       pod/perlunifaq.pod
M       regcomp.c
M       regcomp.h
M       regexec.c

commit bdc22dd52e899130c8c4111c985fcbd7eec164a5
Author: Karl Williamson <pub...@khwilliamson.com>
Date:   Thu Sep 23 09:35:53 2010 -0600

    perlre.pod: slight rewording

M       pod/perlre.pod

commit 1e9e76cc079531f613e810c12048f3a37922b76b
Author: Karl Williamson <pub...@khwilliamson.com>
Date:   Thu Sep 23 07:57:02 2010 -0600

    Subject: unicode_strings.t: Imprv test output
    
    This improves the phrasing of the output of the tests

M       lib/feature/unicode_strings.t

commit 9fab35f28bfee6a6c5b8f9f9e37cd4e4755ce417
Author: Karl Williamson <pub...@khwilliamson.com>
Date:   Thu Sep 23 07:55:15 2010 -0600

    Subject: unicode_strings.t: rmv trail blanks

M       lib/feature/unicode_strings.t

commit b4eac3bb9c5966ec2001ca9133a9e167a5e1338f
Author: Karl Williamson <pub...@khwilliamson.com>
Date:   Thu Sep 23 07:53:10 2010 -0600

    Subject: regcomp.h: Add macro to get regnode flags

M       regcomp.h

commit c02a070252bd579029e13d4770b28ea0b044c389
Author: Karl Williamson <pub...@khwilliamson.com>
Date:   Thu Sep 23 07:51:09 2010 -0600

    Subject: [PATCH] regexec.c: make macros fit 80 cols
    
    Certain multi-line macros had their continuation backslashes way out.
    One line of each is longer than 80 chars, but no point in makeing all
    the lines that long.

M       regexec.c

commit b10ac0d8c2528c07fcf050f24571460c35a11aa1
Author: Karl Williamson <pub...@khwilliamson.com>
Date:   Thu Sep 23 07:49:37 2010 -0600

    Subject: [PATCH] regexec.c: add and refactor macros
    
    Add macros that will have unicode semantics; these share much code in
    common with ones that don't.  So factor out that common code.
    
    These might be good candidates for inline functions.

M       regexec.c

commit b9a59e08063c3229c83cfc2046d4dc834f96e087
Author: Karl Williamson <pub...@khwilliamson.com>
Date:   Thu Sep 23 07:18:12 2010 -0600

    regcomp.c: Fix white space, cuddled else

M       regcomp.c

commit 1726f7e86d5eeba1ee1c367f71ec0a114202689b
Author: Karl Williamson <pub...@khwilliamson.com>
Date:   Thu Sep 23 07:15:36 2010 -0600

    perlrebackslash: Fix poor grammar

M       pod/perlrebackslash.pod
-----------------------------------------------------------------------

Summary of changes:
 lib/feature/unicode_strings.t |  146 +++++++++++++++++++++++---
 pod/perldelta.pod             |   21 +++-
 pod/perlre.pod                |   28 ++++--
 pod/perlrebackslash.pod       |    4 +-
 pod/perlrecharclass.pod       |    8 ++-
 pod/perlunicode.pod           |   13 ++-
 pod/perlunifaq.pod            |    3 +-
 regcomp.c                     |  232 ++++++++++++++++++++++++++++++++---------
 regcomp.h                     |    5 +
 regexec.c                     |  234 ++++++++++++++++++++++++++---------------
 10 files changed, 526 insertions(+), 168 deletions(-)

diff --git a/lib/feature/unicode_strings.t b/lib/feature/unicode_strings.t
index dce34bd..2a2ee1d 100644
--- a/lib/feature/unicode_strings.t
+++ b/lib/feature/unicode_strings.t
@@ -7,9 +7,10 @@ BEGIN {
     require './test.pl';
 }
 
-plan(13312);    # Determined by experimentation
+plan(20736);    # Determined by experimentation
 
-# Test the upper/lower/title case mappings for all characters 0-255.
+# In this section, test the upper/lower/title case mappings for all characters
+# 0-255.
 
 # First compute the case mappings without resorting to the functions we're
 # testing.
@@ -26,7 +27,7 @@ my @posix_to_lower
 = my @latin1_to_title
 = @posix_to_upper;
 
-# Override the elements in the to_lower arrays that have different lower case 
+# Override the elements in the to_lower arrays that have different lower case
 # mappings
 for my $i (0x41 .. 0x5A) {
     $posix_to_lower[$i] = chr(ord($posix_to_lower[$i]) + 32);
@@ -84,9 +85,9 @@ for my  $prefix (\%empty, \%posix, \%cyrillic, \%latin1) {
             my $cp = sprintf "U+%04X", $i;
 
             # First try using latin1 (Unicode) semantics.
-            use feature "unicode_strings";    
+            use feature "unicode_strings";
 
-            my $phrase = 'with uni8bit';
+            my $phrase = 'in uni8bit';
             my $char = chr($i);
             my $pre_lc = $prefix->{'lc'};
             my $pre_uc = $prefix->{'uc'};
@@ -98,17 +99,17 @@ for my  $prefix (\%empty, \%posix, \%cyrillic, \%latin1) {
             my $expected_lower = $pre_lc . $latin1_to_lower[$i] . $post_lc;
 
             is (uc($to_upper), $expected_upper,
-                display("$cp: $phrase: uc($to_upper) eq $expected_upper"));
+                display("$cp: $phrase: Verify uc($to_upper) eq 
$expected_upper"));
             is (lc($to_lower), $expected_lower,
-                display("$cp: $phrase: lc($to_lower) eq $expected_lower"));
+                display("$cp: $phrase: Verify lc($to_lower) eq 
$expected_lower"));
 
             if ($pre_uc eq "") {    # Title case if null prefix.
                 my $expected_title = $latin1_to_title[$i] . $post_lc;
                 is (ucfirst($to_upper), $expected_title,
-                    display("$cp: $phrase: ucfirst($to_upper) eq 
$expected_title"));
+                    display("$cp: $phrase: Verify ucfirst($to_upper) eq 
$expected_title"));
                 my $expected_lcfirst = $latin1_to_lower[$i] . $post_uc;
                 is (lcfirst($to_lower), $expected_lcfirst,
-                    display("$cp: $phrase: lcfirst($to_lower) eq 
$expected_lcfirst"));
+                    display("$cp: $phrase: Verify lcfirst($to_lower) eq 
$expected_lcfirst"));
             }
 
             # Then try with posix semantics.
@@ -125,18 +126,137 @@ for my  $prefix (\%empty, \%posix, \%cyrillic, \%latin1) 
{
             $expected_lower = $pre_lc . $posix_to_lower[$i] . $post_lc;
 
             is (uc($to_upper), $expected_upper,
-                display("$cp: $phrase: uc($to_upper) eq $expected_upper"));
+                display("$cp: $phrase: Verify uc($to_upper) eq 
$expected_upper"));
             is (lc($to_lower), $expected_lower,
-                display("$cp: $phrase: lc($to_lower) eq $expected_lower"));
+                display("$cp: $phrase: Verify lc($to_lower) eq 
$expected_lower"));
 
             if ($pre_uc eq "") {
                 my $expected_title = $posix_to_title[$i] . $post_lc;
                 is (ucfirst($to_upper), $expected_title,
-                    display("$cp: $phrase: ucfirst($to_upper) eq 
$expected_title"));
+                    display("$cp: $phrase: Verify ucfirst($to_upper) eq 
$expected_title"));
                 my $expected_lcfirst = $posix_to_lower[$i] . $post_uc;
                 is (lcfirst($to_lower), $expected_lcfirst,
-                    display("$cp: $phrase: lcfirst($to_lower) eq 
$expected_lcfirst"));
+                    display("$cp: $phrase: Verify lcfirst($to_lower) eq 
$expected_lcfirst"));
             }
         }
     }
 }
+
+# In this section test that \w, \s, and \b work correctly.  These are the only
+# character classes affected by this pragma.
+
+# Boolean if w[$i] is a \w character
+my @w = (0) x 256;
+...@w[0x30 .. 0x39] = (1) x 10;     # 0-9
+...@w[0x41 .. 0x5a] = (1) x 26;     # A-Z
+...@w[0x61 .. 0x7a] = (1) x 26;     # a-z
+$w[0x5F] = 1;                    # _
+$w[0xAA] = 1;                    # FEMININE ORDINAL INDICATOR
+$w[0xB5] = 1;                    # MICRO SIGN
+$w[0xBA] = 1;                    # MASCULINE ORDINAL INDICATOR
+...@w[0xc0 .. 0xD6] = (1) x 23;     # various
+...@w[0xd8 .. 0xF6] = (1) x 31;     # various
+...@w[0xf8 .. 0xFF] = (1) x 8;      # various
+
+# Boolean if s[$i] is a \s character
+my @s = (0) x 256;
+$s[0x09] = 1;   # Tab
+$s[0x0A] = 1;   # LF
+$s[0x0C] = 1;   # FF
+$s[0x0D] = 1;   # CR
+$s[0x20] = 1;   # SPACE
+$s[0x85] = 1;   # NEL
+$s[0xA0] = 1;   # NO BREAK SPACE
+
+for my $i (0 .. 255) {
+    my $char = chr($i);
+    my $hex_i = sprintf "%02X", $i;
+    foreach my $which (\...@s, \...@w) {
+        my $basic_name;
+        if ($which == \...@s) {
+            $basic_name = 's';
+        } else {
+            $basic_name = 'w'
+        }
+
+        # Test \w \W \s \S
+        foreach my $complement (0, 1) {
+            my $name = '\\' . (($complement) ? uc($basic_name) : $basic_name);
+
+            # in and out of [...]
+            foreach my $charclass (0, 1) {
+
+                # And like [^...] or just plain [...]
+                foreach my $complement_class (0, 1) {
+                    next if ! $charclass && $complement_class;
+
+                    # Start with the boolean as to if the character is in the
+                    # class, and then complement as needed.
+                    my $expect_success = $which->[$i];
+                    $expect_success = ! $expect_success if $complement;
+                    $expect_success = ! $expect_success if $complement_class;
+
+                    my $test = $name;
+                    $test = "^$test" if $complement_class;
+                    $test = "[$test]" if $charclass;
+                    $test = "^$test\$";
+
+                    use feature 'unicode_strings';
+                    my $prefix = "in uni8bit; Verify chr(0x$hex_i)";
+                    if ($expect_success) {
+                        like($char, qr/$test/, display("$prefix =~ 
qr/$test/"));
+                    } else {
+                        unlike($char, qr/$test/, display("$prefix !~ 
qr/$test/"));
+                    }
+
+                    no feature 'unicode_strings';
+                    $prefix = "no uni8bit; Verify chr(0x$hex_i)";
+
+                    # With the legacy, nothing above 128 should be in the
+                    # class
+                    if ($i >= 128) {
+                        $expect_success = 0;
+                        $expect_success = ! $expect_success if $complement;
+                        $expect_success = ! $expect_success if 
$complement_class;
+                    }
+                    if ($expect_success) {
+                        like($char, qr/$test/, display("$prefix =~ 
qr/$test/"));
+                    } else {
+                        unlike($char, qr/$test/, display("$prefix !~ 
qr/$test/"));
+                    }
+                }
+            }
+        }
+    }
+
+    # Similarly for \b and \B.
+    foreach my $complement (0, 1) {
+        my $name = '\\' . (($complement) ? 'B' : 'b');
+        my $expect_success = ! $w[$i];  # \b is complement of \w
+        $expect_success = ! $expect_success if $complement;
+
+        my $string = "a$char";
+        my $test = "(^a$name\\x{$hex_i}\$)";
+
+        use feature 'unicode_strings';
+        my $prefix = "in uni8bit; Verify $string";
+        if ($expect_success) {
+            like($string, qr/$test/, display("$prefix =~ qr/$test/"));
+        } else {
+            unlike($string, qr/$test/, display("$prefix !~ qr/$test/"));
+        }
+
+        no feature 'unicode_strings';
+        $prefix = "no uni8bit; Verify $string";
+        if ($i >= 128) {
+            $expect_success = 1;
+            $expect_success = ! $expect_success if $complement;
+        }
+        if ($expect_success) {
+            like($string, qr/$test/, display("$prefix =~ qr/$test/"));
+            like($string, qr/$test/, display("$prefix =~ qr/$test/"));
+        } else {
+            unlike($string, qr/$test/, display("$prefix !~ qr/$test/"));
+        }
+    }
+}
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 3a7d5c6..3117297 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -57,11 +57,24 @@ These modifiers are currently only available within a 
C<(?...)> construct.
 The C<"l"> modifier says to compile the regular expression as if it were
 in the scope of C<use locale>, even if it is not.
 
-The C<"u"> modifier currently does nothing.
+The C<"u"> modifier says to compile the regular expression as if it were
+in the scope of a C<use feature "unicode_strings"> pragma.
 
-The C<"d"> modifier is used in the scope of C<use locale> to compile the
-regular expression as if it were not in that scope.
-See L<perlre/(?dlupimsx-imsx)>.
+The C<"d"> modifier is used to override any C<use locale> and
+C<use feature "unicode_strings"> pragmas that are in effect at the time
+of compiling the regular expression.
+
+See just below and L<perlre/(?dlupimsx-imsx)>.
+
+=head2 C<use feature "unicode_strings"> now applies to some regex matching
+
+Another chunk of the L<perlunicode/The "Unicode Bug"> is fixed in this
+release.  Now, regular expressions compiled within the scope of the
+"unicode_strings" feature will match the same whether or not the target
+string is encoded in utf8, with regard to C<\s>, C<\w>, C<\b>, and their
+complements.  Work is underway to add the C<[[:posix:]]> character
+classes and case sensitive matching to the control of this feature, but
+was not complete in time for this dot release.
 
 =head2 C<\N{...}> now handles Unicode named character sequences
 
diff --git a/pod/perlre.pod b/pod/perlre.pod
index fd09566..d4e6599 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -646,9 +646,23 @@ L<setlocale() function|perllocale/The setlocale function>.
 This modifier is automatically set if the regular expression is compiled
 within the scope of a C<"use locale"> pragma.
 
-C<"u"> has no effect currently.  It is automatically set if the regular
-expression is compiled within the scope of a
-L<C<"use feature 'unicode_strings">|feature> pragma.
+C<"u"> means to use Unicode semantics when pattern matching.  It is
+automatically set if the regular expression is compiled within the scope
+of a L<C<"use feature 'unicode_strings">|feature> pragma (and isn't
+also in the scope of L<C<"use locale">|locale> nor
+L<C<"use bytes">|bytes> pragmas.  It is not fully implemented at the
+time of this writing, but work is being done to complete the job.  On
+EBCDIC platforms this currently has no effect, but on ASCII platforms,
+it effectively turns them into Latin-1 platforms.  That is, the ASCII
+characters remain as ASCII characters (since ASCII is a subset of
+Latin-1), but the non-ASCII code points are treated as Latin-1
+characters.  Right now, this only applies to the C<"\b">, C<"\s">, and
+C<"\w"> pattern matching operators, plus their complements.  For
+example, when this option is not on, C<"\w"> matches precisely
+C<[A-Za-z0-9_]> (on a non-utf8 string).  When the option is on, it
+matches not just those, but all the Latin-1 word characters (such as an
+"n" with a tilde).  It thus matches exactly the same set of code points
+from 0 to 255 as it would if the string were encoded in utf8.
 
 C<"d"> means to use the traditional Perl pattern matching behavior.
 This is dualistic (hence the name C<"d">, which also could stand for
@@ -662,10 +676,10 @@ pragma.
 
 Note that the C<d>, C<l>, C<p>, and C<u> modifiers are special in that
 they can only be enabled, not disabled, and the C<d>, C<l>, and C<u>
-modifiers are mutually exclusive; a maximum of one may appear in the
-construct.  Specifying one de-specifies the others.  Thus, for example,
-C<(?-p)> and C<(?-d:...)> are meaningless and will warn when compiled
-under C<use warnings>.
+modifiers are mutually exclusive: specifying one de-specifies the
+others, and a maximum of one may appear in the construct.  Thus, for
+example, C<(?-p)>, C<(?-d:...)>, and C<(?-dl:...)> will warn when
+compiled under C<use warnings>.
 
 Note also that the C<p> modifier is special in that its presence
 anywhere in a pattern has a global effect.
diff --git a/pod/perlrebackslash.pod b/pod/perlrebackslash.pod
index a9257c7..b75c1e4 100644
--- a/pod/perlrebackslash.pod
+++ b/pod/perlrebackslash.pod
@@ -476,7 +476,7 @@ backslash sequences.
 =item \A
 
 C<\A> only matches at the beginning of the string. If the C</m> modifier
-isn't used, then C</\A/> is equivalent with C</^/>. However, if the C</m>
+isn't used, then C</\A/> is equivalent to C</^/>. However, if the C</m>
 modifier is used, then C</^/> matches internal newlines, but the meaning
 of C</\A/> isn't changed by the C</m> modifier. C<\A> matches at the beginning
 of the string regardless whether the C</m> modifier is used.
@@ -484,7 +484,7 @@ of the string regardless whether the C</m> modifier is used.
 =item \z, \Z
 
 C<\z> and C<\Z> match at the end of the string. If the C</m> modifier isn't
-used, then C</\Z/> is equivalent with C</$/>, that is, it matches at the
+used, then C</\Z/> is equivalent to C</$/>, that is, it matches at the
 end of the string, or before the newline at the end of the string. If the
 C</m> modifier is used, then C</$/> matches at internal newlines, but the
 meaning of C</\Z/> isn't changed by the C</m> modifier. C<\Z> matches at
diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod
index 5aa9348..7cb2f78 100644
--- a/pod/perlrecharclass.pod
+++ b/pod/perlrecharclass.pod
@@ -682,7 +682,8 @@ nor EBCDIC, they match the ASCII defaults (0 to 9 for 
C<\d>; 52 letters,
 A regular expression is marked for Unicode semantics if it is encoded in
 utf8 (usually as a result of including a literal character whose code
 point is above 255), or if it contains a C<\N{U+...}> or C<\N{I<name>}>
-construct.
+construct, or (starting in Perl 5.14) if it was compiled in the scope of a
+C<S<use feature "unicode_strings">> pragma.
 
 The differences in behavior between locale and non-locale semantics
 can affect any character whose code point is 255 or less.  The
@@ -693,6 +694,11 @@ L<perlunicode/The "Unicode Bug">.
 
 For portability reasons, it may be better to not use C<\w>, C<\d>, C<\s>
 or the POSIX character classes, and use the Unicode properties instead.
+That way you can control whether you want matching of just characters in
+the ASCII character set, or any Unicode characters.
+C<S<use feature "unicode_strings">> will allow seamless Unicode behavior
+no matter what the internal encodings are, but won't allow restricting
+to just the ASCII characters.
 
 =head4 Examples
 
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
index fc6a8a9..8ff5bb0 100644
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -1509,17 +1509,20 @@ ASCII range (except in a locale), along with Perl's 
desire to add Unicode
 support seamlessly.  The result wasn't seamless: these characters were
 orphaned.
 
-Work is being done to correct this, but only some of it was complete in time
-for the 5.12 release.  What has been finished is the important part of the case
+Work is being done to correct this, but only some of it is complete.
+What has been finished is the matching of C<\b>, C<\s>, C<\w> and their
+complements in regular expressions, and the important part of the case
 changing component.  Due to concerns, and some evidence, that older code might
 have come to rely on the existing behavior, the new behavior must be explicitly
 enabled by the feature C<unicode_strings> in the L<feature> pragma, even though
 no new syntax is involved.
 
 See L<perlfunc/lc> for details on how this pragma works in combination with
-various others for casing.  Even though the pragma only affects casing
-operations in the 5.12 release, it is planned to have it affect all the
-problematic behaviors in later releases: you can't have one without them all.
+various others for casing.
+
+Even though the implementation is incomplete, it is planned to have this
+pragma affect all the problematic behaviors in later releases: you can't
+have one without them all.
 
 In the meantime, a workaround is to always call utf8::upgrade($string), or to
 use the standard module L<Encode>.   Also, a scalar that has any characters
diff --git a/pod/perlunifaq.pod b/pod/perlunifaq.pod
index 66d3b0f..877e4d1 100644
--- a/pod/perlunifaq.pod
+++ b/pod/perlunifaq.pod
@@ -155,8 +155,7 @@ have furnished your own casing functions to override the 
default, these will
 not be called unless the UTF8 flag is on)
 
 This remains a problem for the regular expression constructs
-C<\s>, C<\w>, C<\S>, C<\W>, C</.../i>, C<(?i:...)>,
-and C</[[:posix:]]/>.
+C</.../i>, C<(?i:...)>, and C</[[:posix:]]/>.
 
 To force Unicode semantics, you can upgrade the internal representation to
 by doing C<utf8::upgrade($string)>. This can be used
diff --git a/regcomp.c b/regcomp.c
index 2068530..e0f65fa 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -881,7 +881,7 @@ S_cl_or(const RExC_state_t *pRExC_state, struct 
regnode_charclass_class *cl, con
   Dumps the final compressed table form of the trie to Perl_debug_log.
   Used for debugging make_trie().
 */
- 
+
 STATIC void
 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
            AV *revcharmap, U32 depth)
@@ -3207,7 +3207,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                /* These are the cases when once a subexpression
                   fails at a particular position, it cannot succeed
                   even after backtracking at the enclosing scope.
-               
+
                   XXXX what if minimal match and we are at the
                        initial run of {n,m}? */
                if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
@@ -3348,7 +3348,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
 #if 0
                        while ( nxt1 && (OP(nxt1) != WHILEM)) {
                            regnode *nnxt = regnext(nxt1);
-                       
                            if (nnxt == nxt) {
                                if (reg_off_by_arg[OP(nxt1)])
                                    ARG_SET(nxt1, nxt2 - nxt1);
@@ -3415,7 +3414,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
 
                        if (UTF)
                            old = utf8_hop((U8*)s, old) - (U8*)s;
-                       
                        l -= old;
                        /* Get the added string: */
                        last_str = newSVpvn_utf8(s  + old, l, UTF);
@@ -3503,13 +3501,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                if (flags & SCF_DO_STCLASS_AND) {
                     for (value = 0; value < 256; value++)
                         if (!is_VERTWS_cp(value))
-                            ANYOF_BITMAP_CLEAR(data->start_class, value);  
-                }                                                              
-                else {                                                         
+                            ANYOF_BITMAP_CLEAR(data->start_class, value);
+                }
+                else {
                     for (value = 0; value < 256; value++)
                         if (is_VERTWS_cp(value))
-                            ANYOF_BITMAP_SET(data->start_class, value);        
   
-                }                                                              
+                            ANYOF_BITMAP_SET(data->start_class, value);
+                }
                 if (flags & SCF_DO_STCLASS_OR)
                    cl_and(data->start_class, and_withp);
                flags &= ~SCF_DO_STCLASS;
@@ -3522,7 +3520,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                data->pos_delta += 1;
                data->longest = &(data->longest_float);
            }
-           
        }
        else if (OP(scan) == FOLDCHAR) {
            int d = ARG(scan)==0xDF ? 1 : 2;
@@ -3580,19 +3577,37 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
-                           for (value = 0; value < 256; value++)
-                               if (!isALNUM(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, 
value);
+                            if (FLAGS(scan) & USE_UNI) {
+                                for (value = 0; value < 256; value++) {
+                                    if (!isWORDCHAR_L1(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, 
value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (!isALNUM(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, 
value);
+                                    }
+                                }
+                            }
                        }
                    }
                    else {
                        if (data->start_class->flags & ANYOF_LOCALE)
                            ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
-                       else {
-                           for (value = 0; value < 256; value++)
-                               if (isALNUM(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value); 
                
-                       }
+                        else if (FLAGS(scan) & USE_UNI) {
+                            for (value = 0; value < 256; value++) {
+                                if (isWORDCHAR_L1(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        } else {
+                            for (value = 0; value < 256; value++) {
+                                if (isALNUM(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        }
                    }
                    break;
                case ALNUML:
@@ -3609,9 +3624,19 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
-                           for (value = 0; value < 256; value++)
-                               if (isALNUM(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, 
value);
+                            if (FLAGS(scan) & USE_UNI) {
+                                for (value = 0; value < 256; value++) {
+                                    if (isWORDCHAR_L1(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, 
value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (isALNUM(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, 
value);
+                                    }
+                                }
+                           }
                        }
                    }
                    else {
@@ -3620,7 +3645,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                        else {
                            for (value = 0; value < 256; value++)
                                if (!isALNUM(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value); 
                
+                                   ANYOF_BITMAP_SET(data->start_class, value);
                        }
                    }
                    break;
@@ -3638,18 +3663,37 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
-                           for (value = 0; value < 256; value++)
-                               if (!isSPACE(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, 
value);
+                           if (FLAGS(scan) & USE_UNI) {
+                                for (value = 0; value < 256; value++) {
+                                    if (!isSPACE_L1(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, 
value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (!isSPACE(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, 
value);
+                                    }
+                                }
+                            }
                        }
                    }
                    else {
-                       if (data->start_class->flags & ANYOF_LOCALE)
+                        if (data->start_class->flags & ANYOF_LOCALE) {
                            ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
-                       else {
-                           for (value = 0; value < 256; value++)
-                               if (isSPACE(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value); 
                
+                        }
+                        else if (FLAGS(scan) & USE_UNI) {
+                            for (value = 0; value < 256; value++) {
+                                if (isSPACE_L1(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        } else {
+                            for (value = 0; value < 256; value++) {
+                                if (isSPACE(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
                        }
                    }
                    break;
@@ -3667,19 +3711,38 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
-                           for (value = 0; value < 256; value++)
-                               if (isSPACE(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, 
value);
+                            if (FLAGS(scan) & USE_UNI) {
+                                for (value = 0; value < 256; value++) {
+                                    if (isSPACE_L1(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, 
value);
+                                    }
+                                }
+                            } else {
+                                for (value = 0; value < 256; value++) {
+                                    if (isSPACE(value)) {
+                                        ANYOF_BITMAP_CLEAR(data->start_class, 
value);
+                                    }
+                                }
+                            }
                        }
                    }
                    else {
                        if (data->start_class->flags & ANYOF_LOCALE)
                            ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
-                       else {
-                           for (value = 0; value < 256; value++)
-                               if (!isSPACE(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value); 
                
-                       }
+                        else if (FLAGS(scan) & USE_UNI) {
+                            for (value = 0; value < 256; value++) {
+                                if (!isSPACE_L1(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        }
+                        else {
+                            for (value = 0; value < 256; value++) {
+                                if (!isSPACE(value)) {
+                                    ANYOF_BITMAP_SET(data->start_class, value);
+                                }
+                            }
+                        }
                    }
                    break;
                case NSPACEL:
@@ -3709,7 +3772,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                        else {
                            for (value = 0; value < 256; value++)
                                if (isDIGIT(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value); 
                
+                                   ANYOF_BITMAP_SET(data->start_class, value);
                        }
                    }
                    break;
@@ -3726,7 +3789,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                        else {
                            for (value = 0; value < 256; value++)
                                if (!isDIGIT(value))
-                                   ANYOF_BITMAP_SET(data->start_class, value); 
                
+                                   ANYOF_BITMAP_SET(data->start_class, value);
                        }
                    }
                    break;
@@ -7232,31 +7295,61 @@ tryagain:
            *flagp |= HASWIDTH;
            goto finish_meta_pat;
        case 'w':
-           ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
+           if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(ALNUML));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(ALNUM));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'W':
-           ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(NALNUML));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(NALNUM));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'b':
            RExC_seen_zerolen++;
            RExC_seen |= REG_SEEN_LOOKBEHIND;
-           ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(BOUNDL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(BOUND));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
            *flagp |= SIMPLE;
            goto finish_meta_pat;
        case 'B':
            RExC_seen_zerolen++;
            RExC_seen |= REG_SEEN_LOOKBEHIND;
-           ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(NBOUNDL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(NBOUND));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
            *flagp |= SIMPLE;
            goto finish_meta_pat;
        case 's':
-           ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(SPACEL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(SPACE));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'S':
-           ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(NSPACEL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(NSPACE));
+                FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
+            }
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'd':
@@ -7988,6 +8081,7 @@ case ANYOF_N##NAME:                                     \
     what = WORD;                                        \
     break
 
+/* Like above, but no locale test */
 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                   \
 ANYOF_##NAME:                                           \
        for (value = 0; value < 256; value++)           \
@@ -8004,6 +8098,42 @@ case ANYOF_N##NAME:                                     \
     what = WORD;                                        \
     break
 
+/* Like the above, but there are differences if we are in uni-8-bit or not, so
+ * there are two tests passed in, to use depending on that. There aren't any
+ * cases where the label is different from the name, so no need for that
+ * parameter */
+#define _C_C_T_UNI_8_BIT(NAME,TEST_8,TEST_7,WORD)       \
+ANYOF_##NAME:                                           \
+    if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);        \
+    else if (UNI_SEMANTICS) {                           \
+        for (value = 0; value < 256; value++) {         \
+            if (TEST_8) ANYOF_BITMAP_SET(ret, value);   \
+        }                                               \
+    }                                                   \
+    else {                                              \
+        for (value = 0; value < 256; value++) {         \
+            if (TEST_7) ANYOF_BITMAP_SET(ret, value);   \
+        }                                               \
+    }                                                   \
+    yesno = '+';                                        \
+    what = WORD;                                        \
+    break;                                              \
+case ANYOF_N##NAME:                                     \
+    if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);       \
+    else if (UNI_SEMANTICS) {                           \
+        for (value = 0; value < 256; value++) {         \
+            if (! TEST_8) ANYOF_BITMAP_SET(ret, value); \
+        }                                               \
+    }                                                   \
+    else {                                              \
+        for (value = 0; value < 256; value++) {         \
+            if (! TEST_7) ANYOF_BITMAP_SET(ret, value); \
+        }                                               \
+    }                                                   \
+    yesno = '!';                                        \
+    what = WORD;                                        \
+    break
+
 /* 
    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
    so that it is possible to override the option here without having to 
@@ -8339,11 +8469,13 @@ parseit:
                case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
                case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
-               case _C_C_T_(ALNUM, isALNUM(value), "Word");
-               case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
+                /* \s, \w match all unicode if utf8. */
+                case _C_C_T_UNI_8_BIT(SPACE, isSPACE_L1(value), 
isSPACE(value), "SpacePerl");
+                case _C_C_T_UNI_8_BIT(ALNUM, isWORDCHAR_L1(value), 
isALNUM(value), "Word");
 #else
-               case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
-               case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
+                /* \s, \w match ascii and locale only */
+                case _C_C_T_UNI_8_BIT(SPACE, isSPACE_L1(value), 
isSPACE(value), "PerlSpace");
+                case _C_C_T_UNI_8_BIT(ALNUM, isWORDCHAR_L1(value), 
isALNUM(value), "PerlWord");
 #endif         
                case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
                case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), 
"VertSpace");
diff --git a/regcomp.h b/regcomp.h
index 362a8ed..dcb449f 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -271,6 +271,8 @@ struct regnode_charclass_class {    /* has [[:blah:]] 
classes */
 #undef STRING
 
 #define        OP(p)           ((p)->type)
+#define FLAGS(p)       ((p)->flags)    /* Caution: Doesn't apply to all
+                                          regnode types */
 #define        OPERAND(p)      (((struct regnode_string *)p)->string)
 #define MASK(p)                ((char*)OPERAND(p))
 #define        STR_LEN(p)      (((struct regnode_string *)p)->str_len)
@@ -306,6 +308,9 @@ struct regnode_charclass_class {    /* has [[:blah:]] 
classes */
 
 #define SIZE_ONLY (RExC_emit == &PL_regdummy)
 
+/* Flags for node->flags of several of the node types */
+#define USE_UNI                0x01
+
 /* Flags for node->flags of ANYOF */
 
 #define ANYOF_CLASS            0x08    /* has [[:blah:]] classes */
diff --git a/regexec.c b/regexec.c
index e569a91..901703f 100644
--- a/regexec.c
+++ b/regexec.c
@@ -176,69 +176,91 @@
 #endif
 
 
-#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)              
            \
-        case NAMEL:                                                            
  \
-            PL_reg_flags |= RF_tainted;                                        
         \
-            /* FALL THROUGH */                                                 
         \
-        case NAME:                                                             
        \
-            if (!nextchr)                                                      
         \
-                sayNO;                                                         
         \
-            if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                   
             \
-                if (!CAT2(PL_utf8_,CLASS)) {                                   
         \
-                    bool ok;                                                   
         \
-                    ENTER;                                                     
         \
-                    save_re_context();                                         
         \
-                    ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                   
         \
-                    assert(ok);                                                
         \
-                    LEAVE;                                                     
         \
-                }                                                              
         \
-                if (!(OP(scan) == NAME                                         
         \
+#define _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)          \
+        case NAMEL:                                                         \
+            PL_reg_flags |= RF_tainted;                                     \
+            /* FALL THROUGH */                                              \
+        case NAME:                                                          \
+            if (!nextchr)                                                   \
+                sayNO;                                                      \
+            if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                \
+                if (!CAT2(PL_utf8_,CLASS)) {                                \
+                    bool ok;                                                \
+                    ENTER;                                                  \
+                    save_re_context();                                      \
+                    ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                \
+                    assert(ok);                                             \
+                    LEAVE;                                                  \
+                }                                                           \
+                if (!(OP(scan) == NAME                                      \
                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, 
utf8_target))  \
-                    : LCFUNC_utf8((U8*)locinput)))                             
         \
-                {                                                              
         \
-                    sayNO;                                                     
         \
-                }                                                              
         \
-                locinput += PL_utf8skip[nextchr];                              
         \
-                nextchr = UCHARAT(locinput);                                   
         \
-                break;                                                         
         \
-            }                                                                  
         \
-            if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))         
         \
-                sayNO;                                                         
         \
-            nextchr = UCHARAT(++locinput);                                     
         \
+                    : LCFUNC_utf8((U8*)locinput)))                          \
+                {                                                           \
+                    sayNO;                                                  \
+                }                                                           \
+                locinput += PL_utf8skip[nextchr];                           \
+                nextchr = UCHARAT(locinput);                                \
+                break;                                                      \
+            }                                                               \
+           /* Drops through to the macro that calls this one */
+
+#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)           \
+    _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)              \
+            if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))      \
+                sayNO;                                                      \
+            nextchr = UCHARAT(++locinput);                                  \
             break
 
-#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)              
          \
-        case NAMEL:                                                            
  \
-            PL_reg_flags |= RF_tainted;                                        
         \
-            /* FALL THROUGH */                                                 
         \
-        case NAME :                                                            
         \
-            if (!nextchr && locinput >= PL_regeol)                             
         \
-                sayNO;                                                         
         \
-            if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                   
             \
-                if (!CAT2(PL_utf8_,CLASS)) {                                   
         \
-                    bool ok;                                                   
         \
-                    ENTER;                                                     
         \
-                    save_re_context();                                         
         \
-                    ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                   
         \
-                    assert(ok);                                                
         \
-                    LEAVE;                                                     
         \
-                }                                                              
         \
-                if ((OP(scan) == NAME                                          
        \
+/* Almost identical to the above, but has a case for a node that matches chars
+ * between 128 and 255 using Unicode (latin1) semantics. */
+#define CCC_TRY_AFF_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
+    _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)               \
+            if (!(OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && 
(isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
+                sayNO;                                                       \
+            nextchr = UCHARAT(++locinput);                                   \
+            break
+
+#define _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)           \
+        case NAMEL:                                                          \
+            PL_reg_flags |= RF_tainted;                                      \
+            /* FALL THROUGH */                                               \
+        case NAME :                                                          \
+            if (!nextchr && locinput >= PL_regeol)                           \
+                sayNO;                                                       \
+            if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                 \
+                if (!CAT2(PL_utf8_,CLASS)) {                                 \
+                    bool ok;                                                 \
+                    ENTER;                                                   \
+                    save_re_context();                                       \
+                    ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                 \
+                    assert(ok);                                              \
+                    LEAVE;                                                   \
+                }                                                            \
+                if ((OP(scan) == NAME                                        \
                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, 
utf8_target))  \
-                    : LCFUNC_utf8((U8*)locinput)))                             
         \
-                {                                                              
         \
-                    sayNO;                                                     
         \
-                }                                                              
         \
-                locinput += PL_utf8skip[nextchr];                              
         \
-                nextchr = UCHARAT(locinput);                                   
         \
-                break;                                                         
         \
-            }                                                                  
         \
-            if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))          
         \
-                sayNO;                                                         
         \
-            nextchr = UCHARAT(++locinput);                                     
         \
+                    : LCFUNC_utf8((U8*)locinput)))                           \
+                {                                                            \
+                    sayNO;                                                   \
+                }                                                            \
+                locinput += PL_utf8skip[nextchr];                            \
+                nextchr = UCHARAT(locinput);                                 \
+                break;                                                       \
+            }
+
+#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)            \
+    _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)               \
+            if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))        \
+                sayNO;                                                       \
+            nextchr = UCHARAT(++locinput);                                   \
             break
 
 
+#define CCC_TRY_NEG_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
+    _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU)              \
+            if ((OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && 
(isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
+                sayNO;                                                       \
+            nextchr = UCHARAT(++locinput);                                   \
+            break
 
 
 
@@ -1506,12 +1528,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, 
char *s,
                }
                );
            }
-           else {
+            else {  /* Not utf8 */
                tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
-               tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
+                tmp = cBOOL((OP(c) == BOUNDL)
+                            ? isALNUM_LC(tmp)
+                            : (isWORDCHAR_L1(tmp)
+                               && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
                REXEC_FBC_SCAN(
                    if (tmp ==
-                       !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
+                        !((OP(c) == BOUNDL)
+                          ? isALNUM_LC(*s)
+                          : (isWORDCHAR_L1((U8) *s)
+                             && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
+                   {
                        tmp = !tmp;
                        REXEC_FBC_TRYIT;
                }
@@ -1544,12 +1573,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, 
char *s,
            }
            else {
                tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
-               tmp = ((OP(c) == NBOUND ?
-                       isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
+                tmp = cBOOL((OP(c) == NBOUNDL)
+                            ? isALNUM_LC(tmp)
+                            : (isWORDCHAR_L1(tmp)
+                               && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
                REXEC_FBC_SCAN(
-                   if (tmp ==
-                       !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
+                   if (tmp == ! cBOOL(
+                            (OP(c) == NBOUNDL)
+                            ? isALNUM_LC(*s)
+                            : (isWORDCHAR_L1((U8) *s)
+                               && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
+                    {
                        tmp = !tmp;
+                    }
                    else REXEC_FBC_TRYIT;
                );
            }
@@ -1560,7 +1596,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, 
char *s,
            REXEC_FBC_CSCAN_PRELOAD(
                LOAD_UTF8_CHARCLASS_PERL_WORD(),
                swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
-               isALNUM(*s)
+                (FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s)
            );
        case ALNUML:
            REXEC_FBC_CSCAN_TAINT(
@@ -1571,7 +1607,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, 
char *s,
            REXEC_FBC_CSCAN_PRELOAD(
                LOAD_UTF8_CHARCLASS_PERL_WORD(),
                !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
-               !isALNUM(*s)
+                ! ((FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s))
            );
        case NALNUML:
            REXEC_FBC_CSCAN_TAINT(
@@ -1582,7 +1618,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, 
char *s,
            REXEC_FBC_CSCAN_PRELOAD(
                LOAD_UTF8_CHARCLASS_PERL_SPACE(),
                *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, 
utf8_target),
-               isSPACE(*s)
+                isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & 
USE_UNI))
            );
        case SPACEL:
            REXEC_FBC_CSCAN_TAINT(
@@ -1593,7 +1629,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, 
char *s,
            REXEC_FBC_CSCAN_PRELOAD(
                LOAD_UTF8_CHARCLASS_PERL_SPACE(),
                !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, 
utf8_target)),
-               !isSPACE(*s)
+                !(isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & 
USE_UNI)))
            );
        case NSPACEL:
            REXEC_FBC_CSCAN_TAINT(
@@ -3569,7 +3605,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
            else {
                ln = (locinput != PL_bostr) ?
                    UCHARAT(locinput - 1) : '\n';
-               if (OP(scan) == BOUND || OP(scan) == NBOUND) {
+               if (FLAGS(scan) & USE_UNI) {
+
+                    /* Here, can't be BOUNDL or NBOUNDL because they never set
+                     * the flags to USE_UNI */
+                    ln = isWORDCHAR_L1(ln);
+                    n = isWORDCHAR_L1(nextchr);
+                }
+                else if (OP(scan) == BOUND || OP(scan) == NBOUND) {
                    ln = isALNUM(ln);
                    n = isALNUM(nextchr);
                }
@@ -3616,11 +3659,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                 sayNO;
            break;
        /* Special char classes - The defines start on line 129 or so */
-       CCC_TRY_AFF( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, 
isALNUM, isALNUM_LC);
-       CCC_TRY_NEG(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, 
isALNUM, isALNUM_LC);
+        CCC_TRY_AFF_U( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, 
isWORDCHAR_L1, isALNUM_LC);
+        CCC_TRY_NEG_U(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, 
isWORDCHAR_L1, isALNUM_LC);
 
-       CCC_TRY_AFF( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, 
isSPACE, isSPACE_LC);
-       CCC_TRY_NEG(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, 
isSPACE, isSPACE_LC);
+        CCC_TRY_AFF_U( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, 
isSPACE_L1, isSPACE_LC);
+        CCC_TRY_NEG_U(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, 
isSPACE_L1, isSPACE_LC);
 
        CCC_TRY_AFF( DIGIT,  DIGITL, posix_digit, "0", isDIGIT_LC_utf8, 
isDIGIT, isDIGIT_LC);
        CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, 
isDIGIT, isDIGIT_LC);
@@ -5743,13 +5786,19 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, 
I32 max, int depth)
            loceol = PL_regeol;
            LOAD_UTF8_CHARCLASS_ALNUM();
            while (hardcount < max && scan < loceol &&
-                  swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) {
+                   swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
+            {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
+        } else if (FLAGS(p) & USE_UNI) {
+            while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
+                scan++;
+            }
        } else {
-           while (scan < loceol && isALNUM(*scan))
-               scan++;
+            while (scan < loceol && isALNUM((U8) *scan)) {
+                scan++;
+            }
        }
        break;
     case ALNUML:
@@ -5771,13 +5820,19 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, 
I32 max, int depth)
            loceol = PL_regeol;
            LOAD_UTF8_CHARCLASS_ALNUM();
            while (hardcount < max && scan < loceol &&
-                  !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) {
+                   !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
+            {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
+        } else if (FLAGS(p) & USE_UNI) {
+            while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
+                scan++;
+            }
        } else {
-           while (scan < loceol && !isALNUM(*scan))
-               scan++;
+            while (scan < loceol && ! isALNUM((U8) *scan)) {
+                scan++;
+            }
        }
        break;
     case NALNUML:
@@ -5800,13 +5855,18 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, 
I32 max, int depth)
            LOAD_UTF8_CHARCLASS_SPACE();
            while (hardcount < max && scan < loceol &&
                   (*scan == ' ' ||
-                   swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) {
+                    swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
+            {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
+        } else if (FLAGS(p) & USE_UNI) {
+            while (scan < loceol && isSPACE_L1((U8) *scan)) {
+                scan++;
+            }
        } else {
-           while (scan < loceol && isSPACE(*scan))
-               scan++;
+            while (scan < loceol && isSPACE((U8) *scan))
+                scan++;
        }
        break;
     case SPACEL:
@@ -5829,13 +5889,19 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, 
I32 max, int depth)
            LOAD_UTF8_CHARCLASS_SPACE();
            while (hardcount < max && scan < loceol &&
                   !(*scan == ' ' ||
-                    swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) {
+                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
+            {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
+        } else if (FLAGS(p) & USE_UNI) {
+            while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
+                scan++;
+            }
        } else {
-           while (scan < loceol && !isSPACE(*scan))
-               scan++;
+            while (scan < loceol && ! isSPACE((U8) *scan)) {
+                scan++;
+            }
        }
        break;
     case NSPACEL:

--
Perl5 Master Repository

[perl.git] branch blead, updated. v5.13.5-480-ga12cf05

Reply via email to