Skip to site navigation (Press enter)

[webkit-changes] [199523] trunk

msaboff Wed, 13 Apr 2016 17:48:07 -0700

Title: [199523] trunk

Revision: 199523
Author: [email protected]
Date: 2016-04-13 17:47:40 -0700 (Wed, 13 Apr 2016)

Log Message

Some tests fail with ES6 `u` (Unicode) flag for regular expressions
https://bugs.webkit.org/show_bug.cgi?id=151597


Reviewed by Geoffrey Garen.

Source/JavaScriptCore:

Added two new tables to handle the anomalies of \w and \W CharacterClassEscapes
when specified in RegExps with both the unicode and ignoreCase flags.  Given the
case folding rules described in the standard via the meta function Canonicalize(),
which allow cross ASCII case folding when unicode is specified, the unicode characters
\u017f (Latin small letter long s) and \u212a (kelvin sign) are part of the \w (word) characterClassEscape.
This is true because they case fold to 's' and 'k' respectively.  Because they case fold
to lower case letters, the corresponding letters, 'k', 'K', 's' and 'S', are also matched with
\W with the unicode and ignoreCase flags.

* create_regex_tables:
* yarr/YarrPattern.cpp:
(JSC::Yarr::YarrPatternConstructor::atomBuiltInCharacterClass):
(JSC::Yarr::YarrPatternConstructor::atomCharacterClassBuiltIn):
(JSC::Yarr::YarrPattern::YarrPattern):
* yarr/YarrPattern.h:
(JSC::Yarr::YarrPattern::wordcharCharacterClass):
(JSC::Yarr::YarrPattern::wordUnicodeIgnoreCaseCharCharacterClass):
(JSC::Yarr::YarrPattern::nonwordcharCharacterClass):
(JSC::Yarr::YarrPattern::nonwordUnicodeIgnoreCaseCharCharacterClass):

LayoutTests:

Updated tests.

* js/regexp-unicode-expected.txt:
* js/script-tests/regexp-unicode.js:

Modified Paths

trunk/LayoutTests/ChangeLog
trunk/LayoutTests/js/regexp-unicode-expected.txt
trunk/LayoutTests/js/script-tests/regexp-unicode.js
trunk/Source/JavaScriptCore/ChangeLog
trunk/Source/JavaScriptCore/create_regex_tables
trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp
trunk/Source/JavaScriptCore/yarr/YarrPattern.h

Diff

Modified: trunk/LayoutTests/ChangeLog (199522 => 199523)


--- trunk/LayoutTests/ChangeLog	2016-04-14 00:42:00 UTC (rev 199522)
+++ trunk/LayoutTests/ChangeLog	2016-04-14 00:47:40 UTC (rev 199523)
@@ -1,3 +1,15 @@
+2016-04-13  Michael Saboff  <[email protected]>
+
+        Some tests fail with ES6 `u` (Unicode) flag for regular expressions
+        https://bugs.webkit.org/show_bug.cgi?id=151597
+
+        Reviewed by Geoffrey Garen.
+
+        Updated tests.
+
+        * js/regexp-unicode-expected.txt:
+        * js/script-tests/regexp-unicode.js:
+
 2016-04-13  Chris Dumez  <[email protected]>
 
         We should not speculatively revalidate cached redirects

Modified: trunk/LayoutTests/js/regexp-unicode-expected.txt (199522 => 199523)


--- trunk/LayoutTests/js/regexp-unicode-expected.txt	2016-04-14 00:42:00 UTC (rev 199522)
+++ trunk/LayoutTests/js/regexp-unicode-expected.txt	2016-04-14 00:47:40 UTC (rev 199523)
@@ -39,6 +39,38 @@
 PASS /(?:A|𐄣|b)x/iu.test("bx") is true
 PASS "a𐄣X".match(/a𐄣b|a𐄣x/iu)[0].length is 4
 PASS "Ťx".match(/ťx/iu)[0].length is 2
+PASS /\w/iu.test("ſ") is true
+PASS /\w/iu.test("K") is true
+PASS /!\w/iu.test("ſ") is false
+PASS /!\w/iu.test("K") is false
+PASS /\W/iu.test("ſ") is true
+PASS /\W/iu.test("K") is true
+PASS /!\W/iu.test("ſ") is false
+PASS /!\W/iu.test("K") is false
+PASS /[\w\d]/iu.test("ſ") is true
+PASS /[\w\d]/iu.test("K") is true
+PASS /[^\w\d]/iu.test("ſ") is false
+PASS /[^\w\d]/iu.test("K") is false
+PASS /[\W\d]/iu.test("ſ") is true
+PASS /[\W\d]/iu.test("K") is true
+PASS /[^\W\d]/iu.test("ſ") is false
+PASS /[^\W\d]/iu.test("K") is false
+PASS /\w/iu.test("S") is true
+PASS /\w/iu.test("K") is true
+PASS /!\w/iu.test("S") is false
+PASS /!\w/iu.test("K") is false
+PASS /\W/iu.test("S") is true
+PASS /\W/iu.test("K") is true
+PASS /!\W/iu.test("S") is false
+PASS /!\W/iu.test("K") is false
+PASS /[\w\d]/iu.test("S") is true
+PASS /[\w\d]/iu.test("K") is true
+PASS /[^\w\d]/iu.test("S") is false
+PASS /[^\w\d]/iu.test("K") is false
+PASS /[\W\d]/iu.test("S") is true
+PASS /[\W\d]/iu.test("K") is true
+PASS /[^\W\d]/iu.test("S") is false
+PASS /[^\W\d]/iu.test("K") is false
 PASS "𝌆".match(/^.$/u)[0].length is 2
 PASS "It is 78°".match(/.*/u)[0].length is 9
 PASS stringWithDanglingFirstSurrogate.match(/.*/u)[0].length is 3

Modified: trunk/LayoutTests/js/script-tests/regexp-unicode.js (199522 => 199523)


--- trunk/LayoutTests/js/script-tests/regexp-unicode.js	2016-04-14 00:42:00 UTC (rev 199522)
+++ trunk/LayoutTests/js/script-tests/regexp-unicode.js	2016-04-14 00:47:40 UTC (rev 199523)
@@ -43,7 +43,40 @@
 shouldBeTrue('/(?:A|\u{10123}|b)x/iu.test("bx")');
 shouldBe('"a\u{10123}X".match(/a\u{10123}b|a\u{10123}x/iu)[0].length', '4');
 shouldBe('"\u0164x".match(/\u0165x/iu)[0].length', '2');
+shouldBeTrue('/\\w/iu.test("\u017f")');
+shouldBeTrue('/\\w/iu.test("\u212a")');
+shouldBeFalse('/!\\w/iu.test("\u017f")');
+shouldBeFalse('/!\\w/iu.test("\u212a")');
+shouldBeTrue('/\\W/iu.test("\u017f")');
+shouldBeTrue('/\\W/iu.test("\u212a")');
+shouldBeFalse('/!\\W/iu.test("\u017f")');
+shouldBeFalse('/!\\W/iu.test("\u212a")');
+shouldBeTrue('/[\\w\\d]/iu.test("\u017f")');
+shouldBeTrue('/[\\w\\d]/iu.test("\u212a")');
+shouldBeFalse('/[^\\w\\d]/iu.test("\u017f")');
+shouldBeFalse('/[^\\w\\d]/iu.test("\u212a")');
+shouldBeTrue('/[\\W\\d]/iu.test("\u017f")');
+shouldBeTrue('/[\\W\\d]/iu.test("\u212a")');
+shouldBeFalse('/[^\\W\\d]/iu.test("\u017f")');
+shouldBeFalse('/[^\\W\\d]/iu.test("\u212a")');
+shouldBeTrue('/\\w/iu.test("S")');
+shouldBeTrue('/\\w/iu.test("K")');
+shouldBeFalse('/!\\w/iu.test("S")');
+shouldBeFalse('/!\\w/iu.test("K")');
+shouldBeTrue('/\\W/iu.test("S")');
+shouldBeTrue('/\\W/iu.test("K")');
+shouldBeFalse('/!\\W/iu.test("S")');
+shouldBeFalse('/!\\W/iu.test("K")');
+shouldBeTrue('/[\\w\\d]/iu.test("S")');
+shouldBeTrue('/[\\w\\d]/iu.test("K")');
+shouldBeFalse('/[^\\w\\d]/iu.test("S")');
+shouldBeFalse('/[^\\w\\d]/iu.test("K")');
+shouldBeTrue('/[\\W\\d]/iu.test("S")');
+shouldBeTrue('/[\\W\\d]/iu.test("K")');
+shouldBeFalse('/[^\\W\\d]/iu.test("S")');
+shouldBeFalse('/[^\\W\\d]/iu.test("K")');
 
+
 // Test . matches with Unicode flag
 shouldBe('"\u{1D306}".match(/^.$/u)[0].length', '2');
 shouldBe('"It is 78\u00B0".match(/.*/u)[0].length', '9');

Modified: trunk/Source/JavaScriptCore/ChangeLog (199522 => 199523)


--- trunk/Source/JavaScriptCore/ChangeLog	2016-04-14 00:42:00 UTC (rev 199522)
+++ trunk/Source/JavaScriptCore/ChangeLog	2016-04-14 00:47:40 UTC (rev 199523)
@@ -1,3 +1,30 @@
+2016-04-13  Michael Saboff  <[email protected]>
+
+        Some tests fail with ES6 `u` (Unicode) flag for regular expressions
+        https://bugs.webkit.org/show_bug.cgi?id=151597
+
+        Reviewed by Geoffrey Garen.
+
+        Added two new tables to handle the anomolies of \w and \W CharacterClassEscapes
+        when specified in RegExp's with both the unicode and ignoreCase flags.  Given the
+        case folding rules described in the standard vie the meta function Canonicalize(),
+        which allow cross ASCII case folding when unicode is specified, the unicode characters
+        \u017f (small sharp s) and \u212a (kelvin symbol) are part of the \w (word) characterClassEscape.
+        This is true because they case fold to 's' and 'k' respectively.  Because they case fold
+        to lower case letters, the corresponding letters, 'k', 'K', 's' and 'S', are also matched with
+        \W with the unicode and ignoreCase flags.
+
+        * create_regex_tables:
+        * yarr/YarrPattern.cpp:
+        (JSC::Yarr::YarrPatternConstructor::atomBuiltInCharacterClass):
+        (JSC::Yarr::YarrPatternConstructor::atomCharacterClassBuiltIn):
+        (JSC::Yarr::YarrPattern::YarrPattern):
+        * yarr/YarrPattern.h:
+        (JSC::Yarr::YarrPattern::wordcharCharacterClass):
+        (JSC::Yarr::YarrPattern::wordUnicodeIgnoreCaseCharCharacterClass):
+        (JSC::Yarr::YarrPattern::nonwordcharCharacterClass):
+        (JSC::Yarr::YarrPattern::nonwordUnicodeIgnoreCaseCharCharacterClass):
+
 2016-04-13  Commit Queue  <[email protected]>
 
         Unreviewed, rolling out r199502 and r199511.

Modified: trunk/Source/JavaScriptCore/create_regex_tables (199522 => 199523)


--- trunk/Source/JavaScriptCore/create_regex_tables	2016-04-14 00:42:00 UTC (rev 199522)
+++ trunk/Source/JavaScriptCore/create_regex_tables	2016-04-14 00:47:40 UTC (rev 199523)
@@ -25,12 +25,14 @@
 
 types = {
     "wordchar": { "UseTable" : True, "data": ['_', ('0','9'), ('A', 'Z'), ('a','z')]},
-    "nonwordchar": { "UseTable" : True, "Inverse": "wordchar", "data": ['`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0xffff)]},
+    "wordUnicodeIgnoreCaseChar": { "UseTable" : False, "data": ['_', ('0', '9'), ('A', 'Z'), ('a', 'z'), 0x017f, 0x212a]},
+    "nonwordchar": { "UseTable" : True, "Inverse": "wordchar", "data": ['`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0x10ffff)]},
+    "nonwordUnicodeIgnoreCaseChar": { "UseTable" : False, "Inverse": "wordchar", "data": ['k', 'K', 's', 'S', '`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0x10ffff)]},
     "newline": { "UseTable" : False, "data": ['\n', '\r', 0x2028, 0x2029]},
     "spaces": { "UseTable" : True, "data": [' ', ('\t', '\r'), 0xa0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f, 0x3000, (0x2000, 0x200a), 0xfeff]},
-    "nonspaces": { "UseTable" : True, "Inverse": "spaces", "data": [(0, ord('\t') - 1), (ord('\r') + 1, ord(' ') - 1), (ord(' ') + 1, 0x009f), (0x00a1, 0x167f), (0x1681, 0x180d), (0x180f, 0x1fff), (0x200b, 0x2027), (0x202a, 0x202e), (0x2030, 0x205e), (0x2060, 0x2fff), (0x3001, 0xfefe), (0xff00, 0xffff)]},
+    "nonspaces": { "UseTable" : True, "Inverse": "spaces", "data": [(0, ord('\t') - 1), (ord('\r') + 1, ord(' ') - 1), (ord(' ') + 1, 0x009f), (0x00a1, 0x167f), (0x1681, 0x180d), (0x180f, 0x1fff), (0x200b, 0x2027), (0x202a, 0x202e), (0x2030, 0x205e), (0x2060, 0x2fff), (0x3001, 0xfefe), (0xff00, 0x10ffff)]},
     "digits": { "UseTable" : False, "data": [('0', '9')]},
-    "nondigits": { "UseTable" : False, "Inverse": "digits", "data": [(0, ord('0') - 1), (ord('9') + 1, 0xffff)] }
+    "nondigits": { "UseTable" : False, "Inverse": "digits", "data": [(0, ord('0') - 1), (ord('9') + 1, 0x10ffff)] }
 }
 entriesPerLine = 50
 arrays = "";

Modified: trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp (199522 => 199523)


--- trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp	2016-04-14 00:42:00 UTC (rev 199522)
+++ trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp	2016-04-14 00:47:40 UTC (rev 199523)
@@ -349,7 +349,13 @@
             m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
             break;
         case WordClassID:
-            m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
+            if (m_pattern.unicode() && m_pattern.ignoreCase()) {
+                if (invert)
+                    m_alternative->m_terms.append(PatternTerm(m_pattern.nonwordUnicodeIgnoreCaseCharCharacterClass(), false));
+                else
+                    m_alternative->m_terms.append(PatternTerm(m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), false));
+            } else
+                m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
             break;
         case NewlineClassID:
             m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert));
@@ -386,7 +392,10 @@
             break;
         
         case WordClassID:
-            m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
+            if (m_pattern.unicode() && m_pattern.ignoreCase())
+                m_characterClassConstructor.append(invert ? m_pattern.nonwordUnicodeIgnoreCaseCharCharacterClass() : m_pattern.wordUnicodeIgnoreCaseCharCharacterClass());
+            else
+                m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
             break;
         
         default:
@@ -884,9 +893,11 @@
     , digitsCached(0)
     , spacesCached(0)
     , wordcharCached(0)
+    , wordUnicodeIgnoreCaseCharCached(0)
     , nondigitsCached(0)
     , nonspacesCached(0)
     , nonwordcharCached(0)
+    , nonwordUnicodeIgnoreCasecharCached(0)
 {
     *error = compile(pattern);
 }

Modified: trunk/Source/JavaScriptCore/yarr/YarrPattern.h (199522 => 199523)


--- trunk/Source/JavaScriptCore/yarr/YarrPattern.h	2016-04-14 00:42:00 UTC (rev 199522)
+++ trunk/Source/JavaScriptCore/yarr/YarrPattern.h	2016-04-14 00:47:40 UTC (rev 199523)
@@ -287,9 +287,11 @@
 std::unique_ptr<CharacterClass> digitsCreate();
 std::unique_ptr<CharacterClass> spacesCreate();
 std::unique_ptr<CharacterClass> wordcharCreate();
+std::unique_ptr<CharacterClass> wordUnicodeIgnoreCaseCharCreate();
 std::unique_ptr<CharacterClass> nondigitsCreate();
 std::unique_ptr<CharacterClass> nonspacesCreate();
 std::unique_ptr<CharacterClass> nonwordcharCreate();
+std::unique_ptr<CharacterClass> nonwordUnicodeIgnoreCaseCharCreate();
 
 struct TermChain {
     TermChain(PatternTerm term)
@@ -317,9 +319,11 @@
         digitsCached = 0;
         spacesCached = 0;
         wordcharCached = 0;
+        wordUnicodeIgnoreCaseCharCached = 0;
         nondigitsCached = 0;
         nonspacesCached = 0;
         nonwordcharCached = 0;
+        nonwordUnicodeIgnoreCasecharCached = 0;
 
         m_disjunctions.clear();
         m_userCharacterClasses.clear();
@@ -367,6 +371,14 @@
         }
         return wordcharCached;
     }
+    CharacterClass* wordUnicodeIgnoreCaseCharCharacterClass()
+    {
+        if (!wordUnicodeIgnoreCaseCharCached) {
+            m_userCharacterClasses.append(wordUnicodeIgnoreCaseCharCreate());
+            wordUnicodeIgnoreCaseCharCached = m_userCharacterClasses.last().get();
+        }
+        return wordUnicodeIgnoreCaseCharCached;
+    }
     CharacterClass* nondigitsCharacterClass()
     {
         if (!nondigitsCached) {
@@ -391,6 +403,14 @@
         }
         return nonwordcharCached;
     }
+    CharacterClass* nonwordUnicodeIgnoreCaseCharCharacterClass()
+    {
+        if (!nonwordUnicodeIgnoreCasecharCached) {
+            m_userCharacterClasses.append(nonwordUnicodeIgnoreCaseCharCreate());
+            nonwordUnicodeIgnoreCasecharCached = m_userCharacterClasses.last().get();
+        }
+        return nonwordUnicodeIgnoreCasecharCached;
+    }
 
     bool ignoreCase() const { return m_flags & FlagIgnoreCase; }
     bool multiline() const { return m_flags & FlagMultiline; }
@@ -414,9 +434,11 @@
     CharacterClass* digitsCached;
     CharacterClass* spacesCached;
     CharacterClass* wordcharCached;
+    CharacterClass* wordUnicodeIgnoreCaseCharCached;
     CharacterClass* nondigitsCached;
     CharacterClass* nonspacesCached;
     CharacterClass* nonwordcharCached;
+    CharacterClass* nonwordUnicodeIgnoreCasecharCached;
 };
 
 } } // namespace JSC::Yarr

_______________________________________________
webkit-changes mailing list
[email protected]
https://lists.webkit.org/mailman/listinfo/webkit-changes