Skip to site navigation (Press enter)

[webkit-changes] [202490] trunk

msaboff Mon, 27 Jun 2016 10:40:01 -0700

Title: [202490] trunk

Revision: 202490
Author: [email protected]
Date: 2016-06-27 10:38:55 -0700 (Mon, 27 Jun 2016)

Log Message

ES6 Change: Unify handling of RegExp CharacterClassEscapes \w and \W and Word Asserts \b and \B
https://bugs.webkit.org/show_bug.cgi?id=158505


Reviewed by Geoffrey Garen.

Source/JavaScriptCore:

This change makes it so that the CharacterClassEscape \w matches the inverse of
\W and vice versa for unicode, ignore case RegExp's.

Before this change, both /\w/ui and /\W/ui RegExp's would match the characters
k, K, s, S, \u017f (Latin Small Letter Long S) and \u212a (Kelvin Sign).
This was due to how the ES6 standard defined matching of character classes,
specifically that the abstract operation "Canonicalize()" is called for the
character to be matched AND for the characters in the character class we are
matching against.  This change is to make \W always be the inverse of \w.
It is still the case that the characters that match against \w change
depending on a regular expression's flags.

The only real changes occur for regular expressions with both the unicode and
ignore case flags set.  Updated the character class generator to make
nonwordUnicodeIgnoreCaseChar not include k, K, s, S, \u017f and \u212a.
Changed BytecodePattern.wordcharCharacterClass to use the correct
word character class for the flags.  Simplified character class set up
in the pattern to use m_pattern.wordUnicodeIgnoreCaseCharCharacterClass and
invert as appropriate when unicode and ignore case are both set.

* create_regex_tables:
* yarr/YarrInterpreter.h:
(JSC::Yarr::BytecodePattern::BytecodePattern):
* yarr/YarrPattern.cpp:
(JSC::Yarr::YarrPatternConstructor::atomBuiltInCharacterClass):

LayoutTests:

Updated and added test cases.

* js/regexp-unicode-expected.txt:
* js/script-tests/regexp-unicode.js:

Modified Paths

trunk/LayoutTests/ChangeLog
trunk/LayoutTests/js/regexp-unicode-expected.txt
trunk/LayoutTests/js/script-tests/regexp-unicode.js
trunk/Source/JavaScriptCore/ChangeLog
trunk/Source/JavaScriptCore/create_regex_tables
trunk/Source/JavaScriptCore/yarr/YarrInterpreter.h
trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp

Diff

Modified: trunk/LayoutTests/ChangeLog (202489 => 202490)


--- trunk/LayoutTests/ChangeLog	2016-06-27 17:36:09 UTC (rev 202489)
+++ trunk/LayoutTests/ChangeLog	2016-06-27 17:38:55 UTC (rev 202490)
@@ -1,3 +1,15 @@
+2016-06-27  Michael Saboff  <[email protected]>
+
+        ES6 Change: Unify handling of RegExp CharacterClassEscapes \w and \W and Word Asserts \b and \B
+        https://bugs.webkit.org/show_bug.cgi?id=158505
+
+        Reviewed by Geoffrey Garen.
+
+        Updated and added test cases.
+
+        * js/regexp-unicode-expected.txt:
+        * js/script-tests/regexp-unicode.js:
+
 2016-06-27  Frederic Wang  <[email protected]>
 
         Set an upper limit for the size or number of pieces of stretchy operators

Modified: trunk/LayoutTests/js/regexp-unicode-expected.txt (202489 => 202490)


--- trunk/LayoutTests/js/regexp-unicode-expected.txt	2016-06-27 17:36:09 UTC (rev 202489)
+++ trunk/LayoutTests/js/regexp-unicode-expected.txt	2016-06-27 17:38:55 UTC (rev 202490)
@@ -41,36 +41,33 @@
 PASS "Ťx".match(/ťx/iu)[0].length is 2
 PASS /\w/iu.test("ſ") is true
 PASS /\w/iu.test("K") is true
-PASS /!\w/iu.test("ſ") is false
-PASS /!\w/iu.test("K") is false
-PASS /\W/iu.test("ſ") is true
-PASS /\W/iu.test("K") is true
-PASS /!\W/iu.test("ſ") is false
-PASS /!\W/iu.test("K") is false
+PASS /\W/iu.test("ſ") is false
+PASS /\W/iu.test("K") is false
 PASS /[\w\d]/iu.test("ſ") is true
 PASS /[\w\d]/iu.test("K") is true
 PASS /[^\w\d]/iu.test("ſ") is false
 PASS /[^\w\d]/iu.test("K") is false
-PASS /[\W\d]/iu.test("ſ") is true
-PASS /[\W\d]/iu.test("K") is true
-PASS /[^\W\d]/iu.test("ſ") is false
-PASS /[^\W\d]/iu.test("K") is false
+PASS /[\W\d]/iu.test("ſ") is false
+PASS /[\W\d]/iu.test("K") is false
+PASS /[^\W\d]/iu.test("ſ") is true
+PASS /[^\W\d]/iu.test("K") is true
 PASS /\w/iu.test("S") is true
 PASS /\w/iu.test("K") is true
-PASS /!\w/iu.test("S") is false
-PASS /!\w/iu.test("K") is false
-PASS /\W/iu.test("S") is true
-PASS /\W/iu.test("K") is true
-PASS /!\W/iu.test("S") is false
-PASS /!\W/iu.test("K") is false
+PASS /\W/iu.test("S") is false
+PASS /\W/iu.test("K") is false
 PASS /[\w\d]/iu.test("S") is true
 PASS /[\w\d]/iu.test("K") is true
 PASS /[^\w\d]/iu.test("S") is false
 PASS /[^\w\d]/iu.test("K") is false
-PASS /[\W\d]/iu.test("S") is true
-PASS /[\W\d]/iu.test("K") is true
-PASS /[^\W\d]/iu.test("S") is false
-PASS /[^\W\d]/iu.test("K") is false
+PASS /[\W\d]/iu.test("S") is false
+PASS /[\W\d]/iu.test("K") is false
+PASS /[^\W\d]/iu.test("S") is true
+PASS /[^\W\d]/iu.test("K") is true
+PASS "Grasſoden is old German for grass".match(/.*?\Bs\u017foden/iu)[0] is "Grasſoden"
+PASS "Grasſoden is old German for grass".match(/.*?\B\u017foden/iu)[0] is "Grasſoden"
+PASS "Grasſoden is old German for grass".match(/.*?\Boden/iu)[0] is "Grasſoden"
+PASS "Grasſoden is old German for grass".match(/.*?\Bden/iu)[0] is "Grasſoden"
+PASS "Water freezes at 273K which is 0C.".split(/\b\s/iu) is ["Water","freezes","at","273K","which","is","0C."]
 PASS "𝌆".match(/^.$/u)[0].length is 2
 PASS "It is 78°".match(/.*/u)[0].length is 9
 PASS stringWithDanglingFirstSurrogate.match(/.*/u)[0].length is 3

Modified: trunk/LayoutTests/js/script-tests/regexp-unicode.js (202489 => 202490)


--- trunk/LayoutTests/js/script-tests/regexp-unicode.js	2016-06-27 17:36:09 UTC (rev 202489)
+++ trunk/LayoutTests/js/script-tests/regexp-unicode.js	2016-06-27 17:38:55 UTC (rev 202490)
@@ -45,38 +45,34 @@
 shouldBe('"\u0164x".match(/\u0165x/iu)[0].length', '2');
 shouldBeTrue('/\\w/iu.test("\u017f")');
 shouldBeTrue('/\\w/iu.test("\u212a")');
-shouldBeFalse('/!\\w/iu.test("\u017f")');
-shouldBeFalse('/!\\w/iu.test("\u212a")');
-shouldBeTrue('/\\W/iu.test("\u017f")');
-shouldBeTrue('/\\W/iu.test("\u212a")');
-shouldBeFalse('/!\\W/iu.test("\u017f")');
-shouldBeFalse('/!\\W/iu.test("\u212a")');
+shouldBeFalse('/\\W/iu.test("\u017f")');
+shouldBeFalse('/\\W/iu.test("\u212a")');
 shouldBeTrue('/[\\w\\d]/iu.test("\u017f")');
 shouldBeTrue('/[\\w\\d]/iu.test("\u212a")');
 shouldBeFalse('/[^\\w\\d]/iu.test("\u017f")');
 shouldBeFalse('/[^\\w\\d]/iu.test("\u212a")');
-shouldBeTrue('/[\\W\\d]/iu.test("\u017f")');
-shouldBeTrue('/[\\W\\d]/iu.test("\u212a")');
-shouldBeFalse('/[^\\W\\d]/iu.test("\u017f")');
-shouldBeFalse('/[^\\W\\d]/iu.test("\u212a")');
+shouldBeFalse('/[\\W\\d]/iu.test("\u017f")');
+shouldBeFalse('/[\\W\\d]/iu.test("\u212a")');
+shouldBeTrue('/[^\\W\\d]/iu.test("\u017f")');
+shouldBeTrue('/[^\\W\\d]/iu.test("\u212a")');
 shouldBeTrue('/\\w/iu.test("S")');
 shouldBeTrue('/\\w/iu.test("K")');
-shouldBeFalse('/!\\w/iu.test("S")');
-shouldBeFalse('/!\\w/iu.test("K")');
-shouldBeTrue('/\\W/iu.test("S")');
-shouldBeTrue('/\\W/iu.test("K")');
-shouldBeFalse('/!\\W/iu.test("S")');
-shouldBeFalse('/!\\W/iu.test("K")');
+shouldBeFalse('/\\W/iu.test("S")');
+shouldBeFalse('/\\W/iu.test("K")');
 shouldBeTrue('/[\\w\\d]/iu.test("S")');
 shouldBeTrue('/[\\w\\d]/iu.test("K")');
 shouldBeFalse('/[^\\w\\d]/iu.test("S")');
 shouldBeFalse('/[^\\w\\d]/iu.test("K")');
-shouldBeTrue('/[\\W\\d]/iu.test("S")');
-shouldBeTrue('/[\\W\\d]/iu.test("K")');
-shouldBeFalse('/[^\\W\\d]/iu.test("S")');
-shouldBeFalse('/[^\\W\\d]/iu.test("K")');
+shouldBeFalse('/[\\W\\d]/iu.test("S")');
+shouldBeFalse('/[\\W\\d]/iu.test("K")');
+shouldBeTrue('/[^\\W\\d]/iu.test("S")');
+shouldBeTrue('/[^\\W\\d]/iu.test("K")');
+shouldBe('"Gras\u017foden is old German for grass".match(/.*?\\Bs\\u017foden/iu)[0]', '"Gras\u017foden"');
+shouldBe('"Gras\u017foden is old German for grass".match(/.*?\\B\\u017foden/iu)[0]', '"Gras\u017foden"');
+shouldBe('"Gras\u017foden is old German for grass".match(/.*?\\Boden/iu)[0]', '"Gras\u017foden"');
+shouldBe('"Gras\u017foden is old German for grass".match(/.*?\\Bden/iu)[0]', '"Gras\u017foden"');
+shouldBe('"Water freezes at 273\u212a which is 0C.".split(/\\b\\s/iu)', '["Water","freezes","at","273\u212a","which","is","0C."]');
 
-
 // Test . matches with Unicode flag
 shouldBe('"\u{1D306}".match(/^.$/u)[0].length', '2');
 shouldBe('"It is 78\u00B0".match(/.*/u)[0].length', '9');

Modified: trunk/Source/JavaScriptCore/ChangeLog (202489 => 202490)


--- trunk/Source/JavaScriptCore/ChangeLog	2016-06-27 17:36:09 UTC (rev 202489)
+++ trunk/Source/JavaScriptCore/ChangeLog	2016-06-27 17:38:55 UTC (rev 202490)
@@ -1,3 +1,36 @@
+2016-06-27  Michael Saboff  <[email protected]>
+
+        ES6 Change: Unify handling of RegExp CharacterClassEscapes \w and \W and Word Asserts \b and \B
+        https://bugs.webkit.org/show_bug.cgi?id=158505
+
+        Reviewed by Geoffrey Garen.
+
+        This change makes it so that the CharacterClassEscape \w matches the inverse of
+        \W and vice versa for unicode, ignore case RegExp's.
+
+        Before this change, both /\w/ui and /\W/ui RegExp's would match the characters
+        k, K, s, S, \u017f (Latin Small Letter Long S) and \u212a (Kelvin Sign).
+        This was due to how the ES6 standard defined matching of character classes
+        specifically that the abstract operation "Canonicalize()" is called for the
+        character to be matched AND for the characters in the character class we are
+        matching against.  This change is to make \W always be the inverse of \w.
+        It is still the case that the characters that match against \w changes
+        depending on a regular expression's flags.
+
+        The only real changes occur for regular expressions with both the unicode and
+        ignore case flags set.  Updated the character class generator to make 
+        nonwordUnicodeIgnoreCaseChar not include k, K, s, S, \u017f and \u212a.
+        Changed BytecodePattern.wordcharCharacterClass to use the correct
+        word character class for the flags.  Simplfied character class set up in
+        in the pattern to use m_pattern.wordUnicodeIgnoreCaseCharCharacterClass and
+        invert as appropriate when unicode and ignore case are both set.
+
+        * create_regex_tables:
+        * yarr/YarrInterpreter.h:
+        (JSC::Yarr::BytecodePattern::BytecodePattern):
+        * yarr/YarrPattern.cpp:
+        (JSC::Yarr::YarrPatternConstructor::atomBuiltInCharacterClass):
+
 2016-06-25  Keith Miller  <[email protected]>
 
         DFGByteCodeParsing does not handle calling the Object constructor with no arguments correctly

Modified: trunk/Source/JavaScriptCore/create_regex_tables (202489 => 202490)


--- trunk/Source/JavaScriptCore/create_regex_tables	2016-06-27 17:36:09 UTC (rev 202489)
+++ trunk/Source/JavaScriptCore/create_regex_tables	2016-06-27 17:38:55 UTC (rev 202490)
@@ -27,7 +27,7 @@
     "wordchar": { "UseTable" : True, "data": ['_', ('0','9'), ('A', 'Z'), ('a','z')]},
     "wordUnicodeIgnoreCaseChar": { "UseTable" : False, "data": ['_', ('0', '9'), ('A', 'Z'), ('a', 'z'), 0x017f, 0x212a]},
     "nonwordchar": { "UseTable" : True, "Inverse": "wordchar", "data": ['`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0x10ffff)]},
-    "nonwordUnicodeIgnoreCaseChar": { "UseTable" : False, "Inverse": "wordchar", "data": ['k', 'K', 's', 'S', '`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0x10ffff)]},
+    "nonwordUnicodeIgnoreCaseChar": { "UseTable" : False, "Inverse": "wordUnicodeIgnoreCaseChar", "data": ['`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0x017e), (0x0180, 0x2129), (0x212b, 0x10ffff)]},
     "newline": { "UseTable" : False, "data": ['\n', '\r', 0x2028, 0x2029]},
     "spaces": { "UseTable" : True, "data": [' ', ('\t', '\r'), 0xa0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f, 0x3000, (0x2000, 0x200a), 0xfeff]},
     "nonspaces": { "UseTable" : True, "Inverse": "spaces", "data": [(0, ord('\t') - 1), (ord('\r') + 1, ord(' ') - 1), (ord(' ') + 1, 0x009f), (0x00a1, 0x167f), (0x1681, 0x180d), (0x180f, 0x1fff), (0x200b, 0x2027), (0x202a, 0x202e), (0x2030, 0x205e), (0x2060, 0x2fff), (0x3001, 0xfefe), (0xff00, 0x10ffff)]},

Modified: trunk/Source/JavaScriptCore/yarr/YarrInterpreter.h (202489 => 202490)


--- trunk/Source/JavaScriptCore/yarr/YarrInterpreter.h	2016-06-27 17:36:09 UTC (rev 202489)
+++ trunk/Source/JavaScriptCore/yarr/YarrInterpreter.h	2016-06-27 17:38:55 UTC (rev 202490)
@@ -347,7 +347,10 @@
         m_body->terms.shrinkToFit();
 
         newlineCharacterClass = pattern.newlineCharacterClass();
-        wordcharCharacterClass = pattern.wordcharCharacterClass();
+        if (unicode() && ignoreCase())
+            wordcharCharacterClass = pattern.wordUnicodeIgnoreCaseCharCharacterClass();
+        else
+            wordcharCharacterClass = pattern.wordcharCharacterClass();
 
         m_allParenthesesInfo.swap(parenthesesInfoToAdopt);
         m_allParenthesesInfo.shrinkToFit();

Modified: trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp (202489 => 202490)


--- trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp	2016-06-27 17:36:09 UTC (rev 202489)
+++ trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp	2016-06-27 17:38:55 UTC (rev 202490)
@@ -351,12 +351,9 @@
             m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
             break;
         case WordClassID:
-            if (m_pattern.unicode() && m_pattern.ignoreCase()) {
-                if (invert)
-                    m_alternative->m_terms.append(PatternTerm(m_pattern.nonwordUnicodeIgnoreCaseCharCharacterClass(), false));
-                else
-                    m_alternative->m_terms.append(PatternTerm(m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), false));
-            } else
+            if (m_pattern.unicode() && m_pattern.ignoreCase())
+                m_alternative->m_terms.append(PatternTerm(m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), invert));
+            else
                 m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
             break;
         case NewlineClassID:

_______________________________________________
webkit-changes mailing list
[email protected]
https://lists.webkit.org/mailman/listinfo/webkit-changes