Title: [250223] releases/WebKitGTK/webkit-2.26
Revision
250223
Author
carlo...@webkit.org
Date
2019-09-23 03:14:36 -0700 (Mon, 23 Sep 2019)

Log Message

Merge r249926 - [JSC] Perform check again when we found non-BMP characters
https://bugs.webkit.org/show_bug.cgi?id=201647

Reviewed by Yusuke Suzuki.

JSTests:

* stress/regexp-unicode-surrogate-pair-increment-should-involve-length-check.js: Added.
* stress/regexp-unicode-within-string.js: Updated test to eliminate the bogus print().
(testRegExpInbounds):

Source/_javascript_Core:

We need to check for end of input for non-BMP characters when matching a character class that contains
both BMP and non-BMP characters.  In advanceIndexAfterCharacterClassTermMatch() we were checking for
end of input for both BMP and non-BMP characters.  For BMP characters, this check is redundant.
After moving the check to after the "is BMP check", we need to decrement index after reaching the failure
label to back out the index++ for the first surrogate of the non-BMP character.

Added the same kind of check in generateCharacterClassOnce().  In that case, we have pre-checked the
first character (surrogate) for a non-BMP codepoint, so we just need to check for end of input before
we increment for the second surrogate.

While writing tests, I found an off by one error in backtrackCharacterClassGreedy() and changed the
loop to check the count at loop top instead of loop bottom.

* yarr/YarrJIT.cpp:
(JSC::Yarr::YarrGenerator::advanceIndexAfterCharacterClassTermMatch):
(JSC::Yarr::YarrGenerator::generateCharacterClassOnce):
(JSC::Yarr::YarrGenerator::generateCharacterClassGreedy):
(JSC::Yarr::YarrGenerator::backtrackCharacterClassGreedy):
(JSC::Yarr::YarrGenerator::backtrackCharacterClassNonGreedy):

Modified Paths

Added Paths

Diff

Modified: releases/WebKitGTK/webkit-2.26/JSTests/ChangeLog (250222 => 250223)


--- releases/WebKitGTK/webkit-2.26/JSTests/ChangeLog	2019-09-23 10:14:32 UTC (rev 250222)
+++ releases/WebKitGTK/webkit-2.26/JSTests/ChangeLog	2019-09-23 10:14:36 UTC (rev 250223)
@@ -1,3 +1,14 @@
+2019-09-16  Michael Saboff  <msab...@apple.com>
+
+        [JSC] Perform check again when we found non-BMP characters
+        https://bugs.webkit.org/show_bug.cgi?id=201647
+
+        Reviewed by Yusuke Suzuki.
+
+        * stress/regexp-unicode-surrogate-pair-increment-should-involve-length-check.js: Added.
+        * stress/regexp-unicode-within-string.js: Updated test to eliminate the bogus print().
+        (testRegExpInbounds):
+
 2019-09-10  Michael Saboff  <msab...@apple.com>
 
         JSC crashes due to stack overflow while building RegExp

Added: releases/WebKitGTK/webkit-2.26/JSTests/stress/regexp-unicode-surrogate-pair-increment-should-involve-length-check.js (0 => 250223)


--- releases/WebKitGTK/webkit-2.26/JSTests/stress/regexp-unicode-surrogate-pair-increment-should-involve-length-check.js	                        (rev 0)
+++ releases/WebKitGTK/webkit-2.26/JSTests/stress/regexp-unicode-surrogate-pair-increment-should-involve-length-check.js	2019-09-23 10:14:36 UTC (rev 250223)
@@ -0,0 +1,27 @@
+// This test checks for proper incrementing around / over individual surrogates and surrogate pairs.
+// This test should run without crashing.
+
+function testRegExpMatch(re, str)
+{
+    for (let i = 0; i < 100; ++i) {
+        let match = re.exec(str);
+        if (!match || match[0] != str) {
+            print(match);
+            throw "Expected " + re + " to match \"" + str + "\" but it didn't";
+        }
+    }
+}
+
+let testString = "\ud800\ud800\udc00";
+let greedyRegExp = /([^x]+)[^]*\1([^])/u;
+
+testRegExpMatch(greedyRegExp, testString);
+
+let nonGreedyRegExp = /(.*[^x]+?)[^]*([^])/u;
+
+testRegExpMatch(nonGreedyRegExp, testString);
+
+let testString2 = "\ud800\ud800\udc00Test\udc00123";
+let backtrackGreedyRegExp = /.*[\x20-\udffff].\w*.\d{3}/u;
+
+testRegExpMatch(backtrackGreedyRegExp, testString2);

Modified: releases/WebKitGTK/webkit-2.26/JSTests/stress/regexp-unicode-within-string.js (250222 => 250223)


--- releases/WebKitGTK/webkit-2.26/JSTests/stress/regexp-unicode-within-string.js	2019-09-23 10:14:32 UTC (rev 250222)
+++ releases/WebKitGTK/webkit-2.26/JSTests/stress/regexp-unicode-within-string.js	2019-09-23 10:14:36 UTC (rev 250223)
@@ -10,7 +10,6 @@
     if (match !== null && match[0] === str) 
         throw "Error: Read past end of a Unicode substring processing a Unicode RegExp";
     else if (match === null || match[0] !== subStr) {
-        print("Error: match[0].length = " + match[0].length + ", match[0] = \"" + match[0] + "\"");
         throw "Error: Didn't properly match a Unicode substring with a matching Unicode RegExp";
     }
 }

Modified: releases/WebKitGTK/webkit-2.26/Source/_javascript_Core/ChangeLog (250222 => 250223)


--- releases/WebKitGTK/webkit-2.26/Source/_javascript_Core/ChangeLog	2019-09-23 10:14:32 UTC (rev 250222)
+++ releases/WebKitGTK/webkit-2.26/Source/_javascript_Core/ChangeLog	2019-09-23 10:14:36 UTC (rev 250223)
@@ -1,3 +1,30 @@
+2019-09-16  Michael Saboff  <msab...@apple.com>
+
+        [JSC] Perform check again when we found non-BMP characters
+        https://bugs.webkit.org/show_bug.cgi?id=201647
+
+        Reviewed by Yusuke Suzuki.
+
+        We need to check for end of input for non-BMP characters when matching a character class that contains
+        both BMP and non-BMP characters.  In advanceIndexAfterCharacterClassTermMatch() we were checking for
+        end of input for both BMP and non-BMP characters.  For BMP characters, this check is redundant.
+        After moving the check to after the "is BMP check", we need to decrement index after reaching the failure
+        label to back out the index++ for the first surrogate of the non-BMP character.
+
+        Added the same kind of check in generateCharacterClassOnce().  In that case, we have pre-checked the
+        first character (surrogate) for a non-BMP codepoint, so we just need to check for end of input before
+        we increment for the second surrogate.
+
+        While writing tests, I found an off by one error in backtrackCharacterClassGreedy() and changed the
+        loop to check the count at loop top instead of loop bottom.
+
+        * yarr/YarrJIT.cpp:
+        (JSC::Yarr::YarrGenerator::advanceIndexAfterCharacterClassTermMatch):
+        (JSC::Yarr::YarrGenerator::generateCharacterClassOnce):
+        (JSC::Yarr::YarrGenerator::generateCharacterClassGreedy):
+        (JSC::Yarr::YarrGenerator::backtrackCharacterClassGreedy):
+        (JSC::Yarr::YarrGenerator::backtrackCharacterClassNonGreedy):
+
 2019-09-11  Michael Saboff  <msab...@apple.com>
 
         JSC crashes due to stack overflow while building RegExp

Modified: releases/WebKitGTK/webkit-2.26/Source/_javascript_Core/yarr/YarrJIT.cpp (250222 => 250223)


--- releases/WebKitGTK/webkit-2.26/Source/_javascript_Core/yarr/YarrJIT.cpp	2019-09-23 10:14:32 UTC (rev 250222)
+++ releases/WebKitGTK/webkit-2.26/Source/_javascript_Core/yarr/YarrJIT.cpp	2019-09-23 10:14:36 UTC (rev 250223)
@@ -453,16 +453,16 @@
     }
 
 #ifdef JIT_UNICODE_EXPRESSIONS
-    void advanceIndexAfterCharacterClassTermMatch(const PatternTerm* term, JumpList& failures, const RegisterID character)
+    void advanceIndexAfterCharacterClassTermMatch(const PatternTerm* term, JumpList& failuresAfterIncrementingIndex, const RegisterID character)
     {
         ASSERT(term->type == PatternTerm::TypeCharacterClass);
 
-        if (term->characterClass->hasOneCharacterSize() && !term->invert())
+        if (term->isFixedWidthCharacterClass())
             add32(TrustedImm32(term->characterClass->hasNonBMPCharacters() ? 2 : 1), index);
         else {
             add32(TrustedImm32(1), index);
-            failures.append(atEndOfInput());
             Jump isBMPChar = branch32(LessThan, character, supplementaryPlanesBase);
+            failuresAfterIncrementingIndex.append(atEndOfInput());
             add32(TrustedImm32(1), index);
             isBMPChar.link(this);
         }
@@ -1757,6 +1757,7 @@
 #ifdef JIT_UNICODE_EXPRESSIONS
         if (m_decodeSurrogatePairs && (!term->characterClass->hasOneCharacterSize() || term->invert())) {
             Jump isBMPChar = branch32(LessThan, character, supplementaryPlanesBase);
+            op.m_jumps.append(atEndOfInput());
             add32(TrustedImm32(1), index);
             isBMPChar.link(this);
         }
@@ -1816,7 +1817,7 @@
 
 #ifdef JIT_UNICODE_EXPRESSIONS
         if (m_decodeSurrogatePairs) {
-            if (term->characterClass->hasOneCharacterSize() && !term->invert())
+            if (term->isFixedWidthCharacterClass())
                 add32(TrustedImm32(term->characterClass->hasNonBMPCharacters() ? 2 : 1), countRegister);
             else {
                 add32(TrustedImm32(1), countRegister);
@@ -1849,9 +1850,10 @@
         move(TrustedImm32(0), countRegister);
 
         JumpList failures;
+        JumpList failuresDecrementIndex;
         Label loop(this);
 #ifdef JIT_UNICODE_EXPRESSIONS
-        if (term->characterClass->hasOneCharacterSize() && !term->invert() && term->characterClass->hasNonBMPCharacters()) {
+        if (term->isFixedWidthCharacterClass() && term->characterClass->hasNonBMPCharacters()) {
             move(TrustedImm32(1), character);
             failures.append(checkNotEnoughInput(character));
         } else
@@ -1875,7 +1877,7 @@
 
 #ifdef JIT_UNICODE_EXPRESSIONS
         if (m_decodeSurrogatePairs)
-            advanceIndexAfterCharacterClassTermMatch(term, failures, character);
+            advanceIndexAfterCharacterClassTermMatch(term, failuresDecrementIndex, character);
         else
 #endif
             add32(TrustedImm32(1), index);
@@ -1887,6 +1889,11 @@
         } else
             jump(loop);
 
+        if (!failuresDecrementIndex.empty()) {
+            failuresDecrementIndex.link(this);
+            sub32(TrustedImm32(1), index);
+        }
+
         failures.link(this);
         op.m_reentry = label();
 
@@ -1908,7 +1915,7 @@
 
         if (!m_decodeSurrogatePairs)
             sub32(TrustedImm32(1), index);
-        else if (term->characterClass->hasOneCharacterSize() && !term->invert())
+        else if (term->isFixedWidthCharacterClass())
             sub32(TrustedImm32(term->characterClass->hasNonBMPCharacters() ? 2 : 1), index);
         else {
             // Rematch one less
@@ -1917,6 +1924,8 @@
             loadFromFrame(term->frameLocation + BackTrackInfoCharacterClass::beginIndex(), index);
 
             Label rematchLoop(this);
+            Jump doneRematching = branchTest32(Zero, countRegister);
+
             readCharacter(m_checkedOffset - term->inputPosition, character);
 
             sub32(TrustedImm32(1), countRegister);
@@ -1928,7 +1937,8 @@
             isBMPChar.link(this);
 #endif
 
-            branchTest32(Zero, countRegister).linkTo(rematchLoop, this);
+            jump(rematchLoop);
+            doneRematching.link(this);
 
             loadFromFrame(term->frameLocation + BackTrackInfoCharacterClass::matchAmountIndex(), countRegister);
         }
@@ -1964,6 +1974,7 @@
         const RegisterID countRegister = regT1;
 
         JumpList nonGreedyFailures;
+        JumpList nonGreedyFailuresDecrementIndex;
 
         m_backtrackingState.link(this);
 
@@ -1996,7 +2007,7 @@
 
 #ifdef JIT_UNICODE_EXPRESSIONS
         if (m_decodeSurrogatePairs)
-            advanceIndexAfterCharacterClassTermMatch(term, nonGreedyFailures, character);
+            advanceIndexAfterCharacterClassTermMatch(term, nonGreedyFailuresDecrementIndex, character);
         else
 #endif
             add32(TrustedImm32(1), index);
@@ -2004,6 +2015,10 @@
 
         jump(op.m_reentry);
 
+        if (!nonGreedyFailuresDecrementIndex.empty()) {
+            nonGreedyFailuresDecrementIndex.link(this);
+            breakpoint();
+        }
         nonGreedyFailures.link(this);
         sub32(countRegister, index);
         m_backtrackingState.fallthrough();

Modified: releases/WebKitGTK/webkit-2.26/Source/_javascript_Core/yarr/YarrPattern.h (250222 => 250223)


--- releases/WebKitGTK/webkit-2.26/Source/_javascript_Core/yarr/YarrPattern.h	2019-09-23 10:14:32 UTC (rev 250222)
+++ releases/WebKitGTK/webkit-2.26/Source/_javascript_Core/yarr/YarrPattern.h	2019-09-23 10:14:36 UTC (rev 250223)
@@ -258,6 +258,11 @@
         return m_capture;
     }
 
+    bool isFixedWidthCharacterClass() const
+    {
+        return type == TypeCharacterClass && characterClass->hasOneCharacterSize() && !invert();
+    }
+
     bool containsAnyCaptures()
     {
         ASSERT(this->type == TypeParenthesesSubpattern);
_______________________________________________
webkit-changes mailing list
webkit-changes@lists.webkit.org
https://lists.webkit.org/mailman/listinfo/webkit-changes

Reply via email to