Skip to site navigation (Press enter)

[webkit-changes] [198624] trunk

msaboff Thu, 24 Mar 2016 07:37:23 -0700

Title: [198624] trunk

Revision: 198624
Author: [email protected]
Date: 2016-03-24 07:19:37 -0700 (Thu, 24 Mar 2016)

Log Message

[ES6] Greedy unicode RegExp's don't properly backtrack past non BMP characters
https://bugs.webkit.org/show_bug.cgi?id=155829


Reviewed by Saam Barati.

Source/JavaScriptCore:

When we back up while matching part of a unicode pattern, we can't just back up one character.
Instead we need to save our start position before trying to match a character and
restore the position if the match fails.  This was done in other places, but wasn't
done for all greedy types.

Fixed matchGlobal() to properly handle advancing past non-BMP characters.

* runtime/RegExpObject.cpp:
(JSC::RegExpObject::matchGlobal):
* runtime/RegExpObjectInlines.h:
(JSC::RegExpObject::advanceStringUnicode):
* yarr/YarrInterpreter.cpp:
(JSC::Yarr::Interpreter::matchCharacterClass):
(JSC::Yarr::Interpreter::matchDisjunction):

LayoutTests:

Added new test cases.

* js/regexp-unicode-expected.txt:
* js/script-tests/regexp-unicode.js:

Modified Paths

trunk/LayoutTests/ChangeLog
trunk/LayoutTests/js/regexp-unicode-expected.txt
trunk/LayoutTests/js/script-tests/regexp-unicode.js
trunk/Source/JavaScriptCore/ChangeLog
trunk/Source/JavaScriptCore/runtime/RegExpObject.cpp
trunk/Source/JavaScriptCore/runtime/RegExpObjectInlines.h
trunk/Source/JavaScriptCore/yarr/YarrInterpreter.cpp

Diff

Modified: trunk/LayoutTests/ChangeLog (198623 => 198624)


--- trunk/LayoutTests/ChangeLog	2016-03-24 13:27:30 UTC (rev 198623)
+++ trunk/LayoutTests/ChangeLog	2016-03-24 14:19:37 UTC (rev 198624)
@@ -1,3 +1,15 @@
+2016-03-24  Michael Saboff  <[email protected]>
+
+        [ES6] Greedy unicode RegExp's don't properly backtrack past non BMP characters
+        https://bugs.webkit.org/show_bug.cgi?id=155829
+
+        Reviewed by Saam Barati.
+
+        Added new test cases.
+
+        * js/regexp-unicode-expected.txt:
+        * js/script-tests/regexp-unicode.js:
+
 2016-03-24  Gyuyoung Kim  <[email protected]>
 
         Unreviewed EFL gardening.

Modified: trunk/LayoutTests/js/regexp-unicode-expected.txt (198623 => 198624)


--- trunk/LayoutTests/js/regexp-unicode-expected.txt	2016-03-24 13:27:30 UTC (rev 198623)
+++ trunk/LayoutTests/js/regexp-unicode-expected.txt	2016-03-24 14:19:37 UTC (rev 198624)
@@ -77,6 +77,11 @@
 PASS "ab𐐨𐐨𐐨c𐨁".match(/abc|ab𐐀*cd|ab𐐀+c𐨁d|ab𐐀+c𐨁/iu)[0] is "ab𐐨𐐨𐐨c𐨁"
 PASS "ab𐐨𐐨𐐨".match(/abc|ab𐐨*./u)[0] is "ab𐐨𐐨𐐨"
 PASS "ab𐐨𐐨𐐨".match(/abc|ab𐐀*./iu)[0] is "ab𐐨𐐨𐐨"
+PASS "𐐀".match(/a*/u)[0].length is 0
+PASS "𐐀".match(/a*/ui)[0].length is 0
+PASS "𐐀".match(/\d*/u)[0].length is 0
+PASS "123𐐀".match(/\d*/u)[0] is "123"
+PASS "12X3𐐀4".match(/\d{0,1}/ug) is ["1", "2", "", "3", "", "4", ""]
 PASS match3[0] is "a𐐐𐐐b"
 PASS match3[1] is undefined.
 PASS match3[2] is "a𐐐𐐐b"

Modified: trunk/LayoutTests/js/script-tests/regexp-unicode.js (198623 => 198624)


--- trunk/LayoutTests/js/script-tests/regexp-unicode.js	2016-03-24 13:27:30 UTC (rev 198623)
+++ trunk/LayoutTests/js/script-tests/regexp-unicode.js	2016-03-24 14:19:37 UTC (rev 198624)
@@ -113,6 +113,11 @@
 shouldBe('"ab\u{10428}\u{10428}\u{10428}c\u{10a01}".match(/abc|ab\u{10400}*cd|ab\u{10400}+c\u{10a01}d|ab\u{10400}+c\u{10a01}/iu)[0]', '"ab\u{10428}\u{10428}\u{10428}c\u{10a01}"');
 shouldBe('"ab\u{10428}\u{10428}\u{10428}".match(/abc|ab\u{10428}*./u)[0]', '"ab\u{10428}\u{10428}\u{10428}"');
 shouldBe('"ab\u{10428}\u{10428}\u{10428}".match(/abc|ab\u{10400}*./iu)[0]', '"ab\u{10428}\u{10428}\u{10428}"');
+shouldBe('"\u{10400}".match(/a*/u)[0].length', '0');
+shouldBe('"\u{10400}".match(/a*/ui)[0].length', '0');
+shouldBe('"\u{10400}".match(/\\d*/u)[0].length', '0');
+shouldBe('"123\u{10400}".match(/\\d*/u)[0]', '"123"');
+shouldBe('"12X3\u{10400}4".match(/\\d{0,1}/ug)', '["1", "2", "", "3", "", "4", ""]');
 
 var re3 = new RegExp("(a\u{10410}*bc)|(a\u{10410}*b)", "u");
 var match3 = "a\u{10410}\u{10410}b".match(re3);

Modified: trunk/Source/JavaScriptCore/ChangeLog (198623 => 198624)


--- trunk/Source/JavaScriptCore/ChangeLog	2016-03-24 13:27:30 UTC (rev 198623)
+++ trunk/Source/JavaScriptCore/ChangeLog	2016-03-24 14:19:37 UTC (rev 198624)
@@ -1,3 +1,25 @@
+2016-03-24  Michael Saboff  <[email protected]>
+
+        [ES6] Greedy unicode RegExp's don't properly backtrack past non BMP characters
+        https://bugs.webkit.org/show_bug.cgi?id=155829
+
+        Reviewed by Saam Barati.
+
+        When we backup when matching part of a unicode pattern, we can't just backup one character.
+        Instead we need to save our start position before trying to match a character and
+        restore the position if the match fails.  This was done in other places, but wasn't
+        done for all greedy types.
+
+        Fixed matchGlobal() to properly handle advancing past non BMP characters.
+
+        * runtime/RegExpObject.cpp:
+        (JSC::RegExpObject::matchGlobal):
+        * runtime/RegExpObjectInlines.h:
+        (JSC::RegExpObject::advanceStringUnicode):
+        * yarr/YarrInterpreter.cpp:
+        (JSC::Yarr::Interpreter::matchCharacterClass):
+        (JSC::Yarr::Interpreter::matchDisjunction):
+
 2016-03-24  Benjamin Poulain  <[email protected]>
 
         [JSC] In some cases, the integer range optimization phase never converges

Modified: trunk/Source/JavaScriptCore/runtime/RegExpObject.cpp (198623 => 198624)


--- trunk/Source/JavaScriptCore/runtime/RegExpObject.cpp	2016-03-24 13:27:30 UTC (rev 198623)
+++ trunk/Source/JavaScriptCore/runtime/RegExpObject.cpp	2016-03-24 14:19:37 UTC (rev 198624)
@@ -191,6 +191,7 @@
     const size_t maximumReasonableMatchSize = 1000000000;
 
     if (regExp->unicode()) {
+        unsigned stringLength = s.length();
         while (result) {
             if (list.size() > maximumReasonableMatchSize) {
                 throwOutOfMemoryError(exec);
@@ -201,7 +202,7 @@
             size_t length = end - result.start;
             list.append(jsSubstring(exec, s, result.start, length));
             if (!length)
-                end = advanceStringUnicode(s, length, end);
+                end = advanceStringUnicode(s, stringLength, end);
             result = regExpConstructor->performMatch(*vm, regExp, string, s, end);
         }
     } else {

Modified: trunk/Source/JavaScriptCore/runtime/RegExpObjectInlines.h (198623 => 198624)


--- trunk/Source/JavaScriptCore/runtime/RegExpObjectInlines.h	2016-03-24 13:27:30 UTC (rev 198623)
+++ trunk/Source/JavaScriptCore/runtime/RegExpObjectInlines.h	2016-03-24 14:19:37 UTC (rev 198624)
@@ -117,7 +117,7 @@
     if (first < 0xD800 || first > 0xDBFF)
         return currentIndex + 1;
 
-    UChar second = s[currentIndex];
+    UChar second = s[currentIndex + 1];
     if (second < 0xDC00 || second > 0xDFFF)
         return currentIndex + 1;

Modified: trunk/Source/JavaScriptCore/yarr/YarrInterpreter.cpp (198623 => 198624)


--- trunk/Source/JavaScriptCore/yarr/YarrInterpreter.cpp	2016-03-24 13:27:30 UTC (rev 198623)
+++ trunk/Source/JavaScriptCore/yarr/YarrInterpreter.cpp	2016-03-24 14:19:37 UTC (rev 198624)
@@ -507,14 +507,16 @@
         }
 
         case QuantifierGreedy: {
-            backTrack->begin = input.getPos();
+            unsigned position = input.getPos();
+            backTrack->begin = position;
             unsigned matchAmount = 0;
             while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
                 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) {
-                    input.uncheckInput(1);
+                    input.setPos(position);
                     break;
                 }
                 ++matchAmount;
+                position = input.getPos();
             }
             backTrack->matchAmount = matchAmount;
 
@@ -1242,12 +1244,14 @@
         case ByteTerm::TypePatternCharacterGreedy: {
             BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
             unsigned matchAmount = 0;
+            unsigned position = input.getPos(); // May need to back out reading a surrogate pair.
             while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
                 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + 1)) {
-                    input.uncheckInput(1);
+                    input.setPos(position);
                     break;
                 }
                 ++matchAmount;
+                position = input.getPos();
             }
             backTrack->matchAmount = matchAmount;

_______________________________________________
webkit-changes mailing list
[email protected]
https://lists.webkit.org/mailman/listinfo/webkit-changes