- Revision
- 198624
- Author
- [email protected]
- Date
- 2016-03-24 07:19:37 -0700 (Thu, 24 Mar 2016)
Log Message
[ES6] Greedy unicode RegExp's don't properly backtrack past non BMP characters
https://bugs.webkit.org/show_bug.cgi?id=155829
Reviewed by Saam Barati.
Source/JavaScriptCore:
When we back up while matching part of a Unicode pattern, we can't just back up one character,
because a non-BMP character is read as a surrogate pair (two UTF-16 code units).
Instead we need to save our start position before trying to match a character and
restore the position if the match fails. This was done in other places, but wasn't
done for all greedy types.
Fixed matchGlobal() to properly handle advancing past non-BMP characters.
* runtime/RegExpObject.cpp:
(JSC::RegExpObject::matchGlobal):
* runtime/RegExpObjectInlines.h:
(JSC::RegExpObject::advanceStringUnicode):
* yarr/YarrInterpreter.cpp:
(JSC::Yarr::Interpreter::matchCharacterClass):
(JSC::Yarr::Interpreter::matchDisjunction):
LayoutTests:
Added new test cases.
* js/regexp-unicode-expected.txt:
* js/script-tests/regexp-unicode.js:
Modified Paths
Diff
Modified: trunk/LayoutTests/ChangeLog (198623 => 198624)
--- trunk/LayoutTests/ChangeLog 2016-03-24 13:27:30 UTC (rev 198623)
+++ trunk/LayoutTests/ChangeLog 2016-03-24 14:19:37 UTC (rev 198624)
@@ -1,3 +1,15 @@
+2016-03-24 Michael Saboff <[email protected]>
+
+ [ES6] Greedy unicode RegExp's don't properly backtrack past non BMP characters
+ https://bugs.webkit.org/show_bug.cgi?id=155829
+
+ Reviewed by Saam Barati.
+
+ Added new test cases.
+
+ * js/regexp-unicode-expected.txt:
+ * js/script-tests/regexp-unicode.js:
+
2016-03-24 Gyuyoung Kim <[email protected]>
Unreviewed EFL gardening.
Modified: trunk/LayoutTests/js/regexp-unicode-expected.txt (198623 => 198624)
--- trunk/LayoutTests/js/regexp-unicode-expected.txt 2016-03-24 13:27:30 UTC (rev 198623)
+++ trunk/LayoutTests/js/regexp-unicode-expected.txt 2016-03-24 14:19:37 UTC (rev 198624)
@@ -77,6 +77,11 @@
PASS "ab𐐨𐐨𐐨c𐨁".match(/abc|ab𐐀*cd|ab𐐀+c𐨁d|ab𐐀+c𐨁/iu)[0] is "ab𐐨𐐨𐐨c𐨁"
PASS "ab𐐨𐐨𐐨".match(/abc|ab𐐨*./u)[0] is "ab𐐨𐐨𐐨"
PASS "ab𐐨𐐨𐐨".match(/abc|ab𐐀*./iu)[0] is "ab𐐨𐐨𐐨"
+PASS "𐐀".match(/a*/u)[0].length is 0
+PASS "𐐀".match(/a*/ui)[0].length is 0
+PASS "𐐀".match(/\d*/u)[0].length is 0
+PASS "123𐐀".match(/\d*/u)[0] is "123"
+PASS "12X3𐐀4".match(/\d{0,1}/ug) is ["1", "2", "", "3", "", "4", ""]
PASS match3[0] is "a𐐐𐐐b"
PASS match3[1] is undefined.
PASS match3[2] is "a𐐐𐐐b"
Modified: trunk/LayoutTests/js/script-tests/regexp-unicode.js (198623 => 198624)
--- trunk/LayoutTests/js/script-tests/regexp-unicode.js 2016-03-24 13:27:30 UTC (rev 198623)
+++ trunk/LayoutTests/js/script-tests/regexp-unicode.js 2016-03-24 14:19:37 UTC (rev 198624)
@@ -113,6 +113,11 @@
shouldBe('"ab\u{10428}\u{10428}\u{10428}c\u{10a01}".match(/abc|ab\u{10400}*cd|ab\u{10400}+c\u{10a01}d|ab\u{10400}+c\u{10a01}/iu)[0]', '"ab\u{10428}\u{10428}\u{10428}c\u{10a01}"');
shouldBe('"ab\u{10428}\u{10428}\u{10428}".match(/abc|ab\u{10428}*./u)[0]', '"ab\u{10428}\u{10428}\u{10428}"');
shouldBe('"ab\u{10428}\u{10428}\u{10428}".match(/abc|ab\u{10400}*./iu)[0]', '"ab\u{10428}\u{10428}\u{10428}"');
+shouldBe('"\u{10400}".match(/a*/u)[0].length', '0');
+shouldBe('"\u{10400}".match(/a*/ui)[0].length', '0');
+shouldBe('"\u{10400}".match(/\\d*/u)[0].length', '0');
+shouldBe('"123\u{10400}".match(/\\d*/u)[0]', '"123"');
+shouldBe('"12X3\u{10400}4".match(/\\d{0,1}/ug)', '["1", "2", "", "3", "", "4", ""]');
var re3 = new RegExp("(a\u{10410}*bc)|(a\u{10410}*b)", "u");
var match3 = "a\u{10410}\u{10410}b".match(re3);
Modified: trunk/Source/JavaScriptCore/ChangeLog (198623 => 198624)
--- trunk/Source/JavaScriptCore/ChangeLog 2016-03-24 13:27:30 UTC (rev 198623)
+++ trunk/Source/JavaScriptCore/ChangeLog 2016-03-24 14:19:37 UTC (rev 198624)
@@ -1,3 +1,25 @@
+2016-03-24 Michael Saboff <[email protected]>
+
+ [ES6] Greedy unicode RegExp's don't properly backtrack past non BMP characters
+ https://bugs.webkit.org/show_bug.cgi?id=155829
+
+ Reviewed by Saam Barati.
+
+ When we backup when matching part of a unicode pattern, we can't just backup one character.
+ Instead we need to save our start position before trying to match a character and
+ restore the position if the match fails. This was done in other places, but wasn't
+ done for all greedy types.
+
+ Fixed matchGlobal() to properly handle advancing past non BMP characters.
+
+ * runtime/RegExpObject.cpp:
+ (JSC::RegExpObject::matchGlobal):
+ * runtime/RegExpObjectInlines.h:
+ (JSC::RegExpObject::advanceStringUnicode):
+ * yarr/YarrInterpreter.cpp:
+ (JSC::Yarr::Interpreter::matchCharacterClass):
+ (JSC::Yarr::Interpreter::matchDisjunction):
+
2016-03-24 Benjamin Poulain <[email protected]>
[JSC] In some cases, the integer range optimization phase never converges
Modified: trunk/Source/JavaScriptCore/runtime/RegExpObject.cpp (198623 => 198624)
--- trunk/Source/JavaScriptCore/runtime/RegExpObject.cpp 2016-03-24 13:27:30 UTC (rev 198623)
+++ trunk/Source/JavaScriptCore/runtime/RegExpObject.cpp 2016-03-24 14:19:37 UTC (rev 198624)
@@ -191,6 +191,7 @@
const size_t maximumReasonableMatchSize = 1000000000;
if (regExp->unicode()) {
+ unsigned stringLength = s.length();
while (result) {
if (list.size() > maximumReasonableMatchSize) {
throwOutOfMemoryError(exec);
@@ -201,7 +202,7 @@
size_t length = end - result.start;
list.append(jsSubstring(exec, s, result.start, length));
if (!length)
- end = advanceStringUnicode(s, length, end);
+ end = advanceStringUnicode(s, stringLength, end);
result = regExpConstructor->performMatch(*vm, regExp, string, s, end);
}
} else {
Modified: trunk/Source/JavaScriptCore/runtime/RegExpObjectInlines.h (198623 => 198624)
--- trunk/Source/JavaScriptCore/runtime/RegExpObjectInlines.h 2016-03-24 13:27:30 UTC (rev 198623)
+++ trunk/Source/JavaScriptCore/runtime/RegExpObjectInlines.h 2016-03-24 14:19:37 UTC (rev 198624)
@@ -117,7 +117,7 @@
if (first < 0xD800 || first > 0xDBFF)
return currentIndex + 1;
- UChar second = s[currentIndex];
+ UChar second = s[currentIndex + 1];
if (second < 0xDC00 || second > 0xDFFF)
return currentIndex + 1;
Modified: trunk/Source/JavaScriptCore/yarr/YarrInterpreter.cpp (198623 => 198624)
--- trunk/Source/JavaScriptCore/yarr/YarrInterpreter.cpp 2016-03-24 13:27:30 UTC (rev 198623)
+++ trunk/Source/JavaScriptCore/yarr/YarrInterpreter.cpp 2016-03-24 14:19:37 UTC (rev 198624)
@@ -507,14 +507,16 @@
}
case QuantifierGreedy: {
- backTrack->begin = input.getPos();
+ unsigned position = input.getPos();
+ backTrack->begin = position;
unsigned matchAmount = 0;
while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) {
- input.uncheckInput(1);
+ input.setPos(position);
break;
}
++matchAmount;
+ position = input.getPos();
}
backTrack->matchAmount = matchAmount;
@@ -1242,12 +1244,14 @@
case ByteTerm::TypePatternCharacterGreedy: {
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
unsigned matchAmount = 0;
+ unsigned position = input.getPos(); // May need to back out reading a surrogate pair.
while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + 1)) {
- input.uncheckInput(1);
+ input.setPos(position);
break;
}
++matchAmount;
+ position = input.getPos();
}
backTrack->matchAmount = matchAmount;