Log Message
ES6 Change: Unify handling of RegExp CharacterClassEscapes \w and \W and Word Asserts \b and \B https://bugs.webkit.org/show_bug.cgi?id=158505
Reviewed by Geoffrey Garen. Source/JavaScriptCore: This change makes it so that the CharacterClassEscape \w matches the inverse of \W and vice versa for unicode, ignore case RegExp's. Before this change, both /\w/ui and /\W/ui RegExp's would match the characters k, K, s, S, \u017f (Latin Small Letter Long S) and \u212a (Kelvin Sign). This was due to how the ES6 standard defined matching of character classes specifically that the abstract operation "Canonicalize()" is called for the character to be matched AND for the characters in the character class we are matching against. This change is to make \W always be the inverse of \w. It is still the case that the characters that match against \w change depending on a regular expression's flags. The only real changes occur for regular expressions with both the unicode and ignore case flags set. Updated the character class generator to make nonwordUnicodeIgnoreCaseChar not include k, K, s, S, \u017f and \u212a. Changed BytecodePattern.wordcharCharacterClass to use the correct word character class for the flags. Simplified character class set up in the pattern to use m_pattern.wordUnicodeIgnoreCaseCharCharacterClass and invert as appropriate when unicode and ignore case are both set. * create_regex_tables: * yarr/YarrInterpreter.h: (JSC::Yarr::BytecodePattern::BytecodePattern): * yarr/YarrPattern.cpp: (JSC::Yarr::YarrPatternConstructor::atomBuiltInCharacterClass): LayoutTests: Updated and added test cases. * js/regexp-unicode-expected.txt: * js/script-tests/regexp-unicode.js:
Modified Paths
- trunk/LayoutTests/ChangeLog
- trunk/LayoutTests/js/regexp-unicode-expected.txt
- trunk/LayoutTests/js/script-tests/regexp-unicode.js
- trunk/Source/JavaScriptCore/ChangeLog
- trunk/Source/JavaScriptCore/create_regex_tables
- trunk/Source/JavaScriptCore/yarr/YarrInterpreter.h
- trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp
Diff
Modified: trunk/LayoutTests/ChangeLog (202489 => 202490)
--- trunk/LayoutTests/ChangeLog 2016-06-27 17:36:09 UTC (rev 202489)
+++ trunk/LayoutTests/ChangeLog 2016-06-27 17:38:55 UTC (rev 202490)
@@ -1,3 +1,15 @@
+2016-06-27 Michael Saboff <[email protected]>
+
+ ES6 Change: Unify handling of RegExp CharacterClassEscapes \w and \W and Word Asserts \b and \B
+ https://bugs.webkit.org/show_bug.cgi?id=158505
+
+ Reviewed by Geoffrey Garen.
+
+ Updated and added test cases.
+
+ * js/regexp-unicode-expected.txt:
+ * js/script-tests/regexp-unicode.js:
+
2016-06-27 Frederic Wang <[email protected]>
Set an upper limit for the size or number of pieces of stretchy operators
Modified: trunk/LayoutTests/js/regexp-unicode-expected.txt (202489 => 202490)
--- trunk/LayoutTests/js/regexp-unicode-expected.txt 2016-06-27 17:36:09 UTC (rev 202489)
+++ trunk/LayoutTests/js/regexp-unicode-expected.txt 2016-06-27 17:38:55 UTC (rev 202490)
@@ -41,36 +41,33 @@
PASS "Ťx".match(/ťx/iu)[0].length is 2
PASS /\w/iu.test("ſ") is true
PASS /\w/iu.test("K") is true
-PASS /!\w/iu.test("ſ") is false
-PASS /!\w/iu.test("K") is false
-PASS /\W/iu.test("ſ") is true
-PASS /\W/iu.test("K") is true
-PASS /!\W/iu.test("ſ") is false
-PASS /!\W/iu.test("K") is false
+PASS /\W/iu.test("ſ") is false
+PASS /\W/iu.test("K") is false
PASS /[\w\d]/iu.test("ſ") is true
PASS /[\w\d]/iu.test("K") is true
PASS /[^\w\d]/iu.test("ſ") is false
PASS /[^\w\d]/iu.test("K") is false
-PASS /[\W\d]/iu.test("ſ") is true
-PASS /[\W\d]/iu.test("K") is true
-PASS /[^\W\d]/iu.test("ſ") is false
-PASS /[^\W\d]/iu.test("K") is false
+PASS /[\W\d]/iu.test("ſ") is false
+PASS /[\W\d]/iu.test("K") is false
+PASS /[^\W\d]/iu.test("ſ") is true
+PASS /[^\W\d]/iu.test("K") is true
PASS /\w/iu.test("S") is true
PASS /\w/iu.test("K") is true
-PASS /!\w/iu.test("S") is false
-PASS /!\w/iu.test("K") is false
-PASS /\W/iu.test("S") is true
-PASS /\W/iu.test("K") is true
-PASS /!\W/iu.test("S") is false
-PASS /!\W/iu.test("K") is false
+PASS /\W/iu.test("S") is false
+PASS /\W/iu.test("K") is false
PASS /[\w\d]/iu.test("S") is true
PASS /[\w\d]/iu.test("K") is true
PASS /[^\w\d]/iu.test("S") is false
PASS /[^\w\d]/iu.test("K") is false
-PASS /[\W\d]/iu.test("S") is true
-PASS /[\W\d]/iu.test("K") is true
-PASS /[^\W\d]/iu.test("S") is false
-PASS /[^\W\d]/iu.test("K") is false
+PASS /[\W\d]/iu.test("S") is false
+PASS /[\W\d]/iu.test("K") is false
+PASS /[^\W\d]/iu.test("S") is true
+PASS /[^\W\d]/iu.test("K") is true
+PASS "Grasſoden is old German for grass".match(/.*?\Bs\u017foden/iu)[0] is "Grasſoden"
+PASS "Grasſoden is old German for grass".match(/.*?\B\u017foden/iu)[0] is "Grasſoden"
+PASS "Grasſoden is old German for grass".match(/.*?\Boden/iu)[0] is "Grasſoden"
+PASS "Grasſoden is old German for grass".match(/.*?\Bden/iu)[0] is "Grasſoden"
+PASS "Water freezes at 273K which is 0C.".split(/\b\s/iu) is ["Water","freezes","at","273K","which","is","0C."]
PASS "𝌆".match(/^.$/u)[0].length is 2
PASS "It is 78°".match(/.*/u)[0].length is 9
PASS stringWithDanglingFirstSurrogate.match(/.*/u)[0].length is 3
Modified: trunk/LayoutTests/js/script-tests/regexp-unicode.js (202489 => 202490)
--- trunk/LayoutTests/js/script-tests/regexp-unicode.js 2016-06-27 17:36:09 UTC (rev 202489)
+++ trunk/LayoutTests/js/script-tests/regexp-unicode.js 2016-06-27 17:38:55 UTC (rev 202490)
@@ -45,38 +45,34 @@
shouldBe('"\u0164x".match(/\u0165x/iu)[0].length', '2');
shouldBeTrue('/\\w/iu.test("\u017f")');
shouldBeTrue('/\\w/iu.test("\u212a")');
-shouldBeFalse('/!\\w/iu.test("\u017f")');
-shouldBeFalse('/!\\w/iu.test("\u212a")');
-shouldBeTrue('/\\W/iu.test("\u017f")');
-shouldBeTrue('/\\W/iu.test("\u212a")');
-shouldBeFalse('/!\\W/iu.test("\u017f")');
-shouldBeFalse('/!\\W/iu.test("\u212a")');
+shouldBeFalse('/\\W/iu.test("\u017f")');
+shouldBeFalse('/\\W/iu.test("\u212a")');
shouldBeTrue('/[\\w\\d]/iu.test("\u017f")');
shouldBeTrue('/[\\w\\d]/iu.test("\u212a")');
shouldBeFalse('/[^\\w\\d]/iu.test("\u017f")');
shouldBeFalse('/[^\\w\\d]/iu.test("\u212a")');
-shouldBeTrue('/[\\W\\d]/iu.test("\u017f")');
-shouldBeTrue('/[\\W\\d]/iu.test("\u212a")');
-shouldBeFalse('/[^\\W\\d]/iu.test("\u017f")');
-shouldBeFalse('/[^\\W\\d]/iu.test("\u212a")');
+shouldBeFalse('/[\\W\\d]/iu.test("\u017f")');
+shouldBeFalse('/[\\W\\d]/iu.test("\u212a")');
+shouldBeTrue('/[^\\W\\d]/iu.test("\u017f")');
+shouldBeTrue('/[^\\W\\d]/iu.test("\u212a")');
shouldBeTrue('/\\w/iu.test("S")');
shouldBeTrue('/\\w/iu.test("K")');
-shouldBeFalse('/!\\w/iu.test("S")');
-shouldBeFalse('/!\\w/iu.test("K")');
-shouldBeTrue('/\\W/iu.test("S")');
-shouldBeTrue('/\\W/iu.test("K")');
-shouldBeFalse('/!\\W/iu.test("S")');
-shouldBeFalse('/!\\W/iu.test("K")');
+shouldBeFalse('/\\W/iu.test("S")');
+shouldBeFalse('/\\W/iu.test("K")');
shouldBeTrue('/[\\w\\d]/iu.test("S")');
shouldBeTrue('/[\\w\\d]/iu.test("K")');
shouldBeFalse('/[^\\w\\d]/iu.test("S")');
shouldBeFalse('/[^\\w\\d]/iu.test("K")');
-shouldBeTrue('/[\\W\\d]/iu.test("S")');
-shouldBeTrue('/[\\W\\d]/iu.test("K")');
-shouldBeFalse('/[^\\W\\d]/iu.test("S")');
-shouldBeFalse('/[^\\W\\d]/iu.test("K")');
+shouldBeFalse('/[\\W\\d]/iu.test("S")');
+shouldBeFalse('/[\\W\\d]/iu.test("K")');
+shouldBeTrue('/[^\\W\\d]/iu.test("S")');
+shouldBeTrue('/[^\\W\\d]/iu.test("K")');
+shouldBe('"Gras\u017foden is old German for grass".match(/.*?\\Bs\\u017foden/iu)[0]', '"Gras\u017foden"');
+shouldBe('"Gras\u017foden is old German for grass".match(/.*?\\B\\u017foden/iu)[0]', '"Gras\u017foden"');
+shouldBe('"Gras\u017foden is old German for grass".match(/.*?\\Boden/iu)[0]', '"Gras\u017foden"');
+shouldBe('"Gras\u017foden is old German for grass".match(/.*?\\Bden/iu)[0]', '"Gras\u017foden"');
+shouldBe('"Water freezes at 273\u212a which is 0C.".split(/\\b\\s/iu)', '["Water","freezes","at","273\u212a","which","is","0C."]');
-
// Test . matches with Unicode flag
shouldBe('"\u{1D306}".match(/^.$/u)[0].length', '2');
shouldBe('"It is 78\u00B0".match(/.*/u)[0].length', '9');
Modified: trunk/Source/JavaScriptCore/ChangeLog (202489 => 202490)
--- trunk/Source/JavaScriptCore/ChangeLog 2016-06-27 17:36:09 UTC (rev 202489)
+++ trunk/Source/JavaScriptCore/ChangeLog 2016-06-27 17:38:55 UTC (rev 202490)
@@ -1,3 +1,36 @@
+2016-06-27 Michael Saboff <[email protected]>
+
+ ES6 Change: Unify handling of RegExp CharacterClassEscapes \w and \W and Word Asserts \b and \B
+ https://bugs.webkit.org/show_bug.cgi?id=158505
+
+ Reviewed by Geoffrey Garen.
+
+ This change makes it so that the CharacterClassEscape \w matches the inverse of
+ \W and vice versa for unicode, ignore case RegExp's.
+
+ Before this change, both /\w/ui and /\W/ui RegExp's would match the characters
+ k, K, s, S, \u017f (Latin Small Letter Long S) and \u212a (Kelvin Sign).
+ This was due to how the ES6 standard defined matching of character classes
+ specifically that the abstract operation "Canonicalize()" is called for the
+ character to be matched AND for the characters in the character class we are
+ matching against. This change is to make \W always be the inverse of \w.
+ It is still the case that the characters that match against \w change
+ depending on a regular expression's flags.
+
+ The only real changes occur for regular expressions with both the unicode and
+ ignore case flags set. Updated the character class generator to make
+ nonwordUnicodeIgnoreCaseChar not include k, K, s, S, \u017f and \u212a.
+ Changed BytecodePattern.wordcharCharacterClass to use the correct
+ word character class for the flags. Simplified character class set up
+ in the pattern to use m_pattern.wordUnicodeIgnoreCaseCharCharacterClass and
+ invert as appropriate when unicode and ignore case are both set.
+
+ * create_regex_tables:
+ * yarr/YarrInterpreter.h:
+ (JSC::Yarr::BytecodePattern::BytecodePattern):
+ * yarr/YarrPattern.cpp:
+ (JSC::Yarr::YarrPatternConstructor::atomBuiltInCharacterClass):
+
2016-06-25 Keith Miller <[email protected]>
DFGByteCodeParsing does not handle calling the Object constructor with no arguments correctly
Modified: trunk/Source/JavaScriptCore/create_regex_tables (202489 => 202490)
--- trunk/Source/JavaScriptCore/create_regex_tables 2016-06-27 17:36:09 UTC (rev 202489)
+++ trunk/Source/JavaScriptCore/create_regex_tables 2016-06-27 17:38:55 UTC (rev 202490)
@@ -27,7 +27,7 @@
"wordchar": { "UseTable" : True, "data": ['_', ('0','9'), ('A', 'Z'), ('a','z')]},
"wordUnicodeIgnoreCaseChar": { "UseTable" : False, "data": ['_', ('0', '9'), ('A', 'Z'), ('a', 'z'), 0x017f, 0x212a]},
"nonwordchar": { "UseTable" : True, "Inverse": "wordchar", "data": ['`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0x10ffff)]},
- "nonwordUnicodeIgnoreCaseChar": { "UseTable" : False, "Inverse": "wordchar", "data": ['k', 'K', 's', 'S', '`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0x10ffff)]},
+ "nonwordUnicodeIgnoreCaseChar": { "UseTable" : False, "Inverse": "wordUnicodeIgnoreCaseChar", "data": ['`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0x017e), (0x0180, 0x2129), (0x212b, 0x10ffff)]},
"newline": { "UseTable" : False, "data": ['\n', '\r', 0x2028, 0x2029]},
"spaces": { "UseTable" : True, "data": [' ', ('\t', '\r'), 0xa0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f, 0x3000, (0x2000, 0x200a), 0xfeff]},
"nonspaces": { "UseTable" : True, "Inverse": "spaces", "data": [(0, ord('\t') - 1), (ord('\r') + 1, ord(' ') - 1), (ord(' ') + 1, 0x009f), (0x00a1, 0x167f), (0x1681, 0x180d), (0x180f, 0x1fff), (0x200b, 0x2027), (0x202a, 0x202e), (0x2030, 0x205e), (0x2060, 0x2fff), (0x3001, 0xfefe), (0xff00, 0x10ffff)]},
Modified: trunk/Source/JavaScriptCore/yarr/YarrInterpreter.h (202489 => 202490)
--- trunk/Source/JavaScriptCore/yarr/YarrInterpreter.h 2016-06-27 17:36:09 UTC (rev 202489)
+++ trunk/Source/JavaScriptCore/yarr/YarrInterpreter.h 2016-06-27 17:38:55 UTC (rev 202490)
@@ -347,7 +347,10 @@
m_body->terms.shrinkToFit();
newlineCharacterClass = pattern.newlineCharacterClass();
- wordcharCharacterClass = pattern.wordcharCharacterClass();
+ if (unicode() && ignoreCase())
+ wordcharCharacterClass = pattern.wordUnicodeIgnoreCaseCharCharacterClass();
+ else
+ wordcharCharacterClass = pattern.wordcharCharacterClass();
m_allParenthesesInfo.swap(parenthesesInfoToAdopt);
m_allParenthesesInfo.shrinkToFit();
Modified: trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp (202489 => 202490)
--- trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp 2016-06-27 17:36:09 UTC (rev 202489)
+++ trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp 2016-06-27 17:38:55 UTC (rev 202490)
@@ -351,12 +351,9 @@
m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
break;
case WordClassID:
- if (m_pattern.unicode() && m_pattern.ignoreCase()) {
- if (invert)
- m_alternative->m_terms.append(PatternTerm(m_pattern.nonwordUnicodeIgnoreCaseCharCharacterClass(), false));
- else
- m_alternative->m_terms.append(PatternTerm(m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), false));
- } else
+ if (m_pattern.unicode() && m_pattern.ignoreCase())
+ m_alternative->m_terms.append(PatternTerm(m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), invert));
+ else
m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
break;
case NewlineClassID:
_______________________________________________ webkit-changes mailing list [email protected] https://lists.webkit.org/mailman/listinfo/webkit-changes
