Title: [201467] trunk/Source/_javascript_Core
Revision
201467
Author
[email protected]
Date
2016-05-27 13:45:08 -0700 (Fri, 27 May 2016)

Log Message

regExpProtoFuncSplitFast should OOM before it swaps
https://bugs.webkit.org/show_bug.cgi?id=158157

Reviewed by Mark Lam.
        
This is a huge speed-up on some jsfunfuzz test cases because it makes us realize much
sooner that running a regexp split will result in swapping. It uses the same basic
approach as http://trac.webkit.org/changeset/201451: if the result array crosses a certain
size threshold, we proceed with a dry run to see how big the array will get before
allocating anything else. This way, bogus uses of split that would have OOMed only after
killing the user's machine will now OOM before killing the user's machine.
        
This is an enormous speed-up on some jsfunfuzz tests: they go from running for a long
time to running instantly.

* runtime/RegExpPrototype.cpp:
(JSC::advanceStringIndex):
(JSC::genericSplit):
(JSC::regExpProtoFuncSplitFast):
* runtime/StringObject.h:
(JSC::jsStringWithReuse):
(JSC::jsSubstring):
* tests/stress/big-split-captures.js: Added.
* tests/stress/big-split.js: Added.

Modified Paths

Added Paths

Diff

Modified: trunk/Source/_javascript_Core/ChangeLog (201466 => 201467)


--- trunk/Source/_javascript_Core/ChangeLog	2016-05-27 20:32:42 UTC (rev 201466)
+++ trunk/Source/_javascript_Core/ChangeLog	2016-05-27 20:45:08 UTC (rev 201467)
@@ -1,3 +1,30 @@
+2016-05-27  Filip Pizlo  <[email protected]>
+
+        regExpProtoFuncSplitFast should OOM before it swaps
+        https://bugs.webkit.org/show_bug.cgi?id=158157
+
+        Reviewed by Mark Lam.
+        
+        This is a huge speed-up on some jsfunfuzz test cases because it makes us realize much
+        sooner that running a regexp split will result in swapping. It uses the same basic
+        approach as http://trac.webkit.org/changeset/201451: if the result array crosses a certain
+        size threshold, we proceed with a dry run to see how big the array will get before
+        allocating anything else. This way, bogus uses of split that would have OOMed only after
+        killing the user's machine will now OOM before killing the user's machine.
+        
+        This is an enormous speed-up on some jsfunfuzz tests: they go from running for a long
+        time to running instantly.
+
+        * runtime/RegExpPrototype.cpp:
+        (JSC::advanceStringIndex):
+        (JSC::genericSplit):
+        (JSC::regExpProtoFuncSplitFast):
+        * runtime/StringObject.h:
+        (JSC::jsStringWithReuse):
+        (JSC::jsSubstring):
+        * tests/stress/big-split-captures.js: Added.
+        * tests/stress/big-split.js: Added.
+
 2016-05-27  Saam barati  <[email protected]>
 
         ShadowChicken/DebuggerCallFrame don't properly handle when the entry stack frame is a tail deleted frame

Modified: trunk/Source/_javascript_Core/runtime/RegExpPrototype.cpp (201466 => 201467)


--- trunk/Source/_javascript_Core/runtime/RegExpPrototype.cpp	2016-05-27 20:32:42 UTC (rev 201466)
+++ trunk/Source/_javascript_Core/runtime/RegExpPrototype.cpp	2016-05-27 20:45:08 UTC (rev 201467)
@@ -457,6 +457,84 @@
     return RegExpObject::advanceStringUnicode(str, strSize, index);
 }
 
+enum SplitControl {
+    ContinueSplit,
+    AbortSplit
+};
+
+template<typename ControlFunc, typename PushFunc>
+void genericSplit(
+    VM& vm, RegExp* regexp, const String& input, unsigned inputSize, unsigned& position,
+    unsigned& matchPosition, bool regExpIsSticky, bool regExpIsUnicode,
+    const ControlFunc& control, const PushFunc& push)
+{
+    while (matchPosition < inputSize) {
+        if (control() == AbortSplit)
+            return;
+        
+        Vector<int, 32> ovector;
+
+        // a. Perform ? Set(splitter, "lastIndex", q, true).
+        // b. Let z be ? RegExpExec(splitter, S).
+        int mpos = regexp->match(vm, input, matchPosition, ovector);
+
+        // c. If z is null, let q be AdvanceStringIndex(S, q, unicodeMatching).
+        if (mpos < 0) {
+            if (!regExpIsSticky)
+                break;
+            matchPosition = advanceStringIndex(input, inputSize, matchPosition, regExpIsUnicode);
+            continue;
+        }
+        if (static_cast<unsigned>(mpos) >= inputSize) {
+            // The spec redoes the RegExpExec starting at the next character of the input.
+            // But in our case, mpos < 0 means that the native regexp already searched all permutations
+            // and know that we won't be able to find a match for the separator even if we redo the
+            // RegExpExec starting at the next character of the input. So, just bail.
+            break;
+        }
+
+        // d. Else, z is not null
+        //    i. Let e be ? ToLength(? Get(splitter, "lastIndex")).
+        //   ii. Let e be min(e, size).
+        matchPosition = mpos;
+        unsigned matchEnd = ovector[1];
+
+        //  iii. If e = p, let q be AdvanceStringIndex(S, q, unicodeMatching).
+        if (matchEnd == position) {
+            matchPosition = advanceStringIndex(input, inputSize, matchPosition, regExpIsUnicode);
+            continue;
+        }
+        // if matchEnd == 0 then position should also be zero and thus matchEnd should equal position.
+        ASSERT(matchEnd);
+
+        //   iv. Else e != p,
+        unsigned numberOfCaptures = regexp->numSubpatterns();
+        
+        // 1. Let T be a String value equal to the substring of S consisting of the elements at indices p (inclusive) through q (exclusive).
+        // 2. Perform ! CreateDataProperty(A, ! ToString(lengthA), T).
+        if (push(true, position, matchPosition - position) == AbortSplit)
+            return;
+        
+        // 5. Let p be e.
+        position = matchEnd;
+        
+        // 6. Let numberOfCaptures be ? ToLength(? Get(z, "length")).
+        // 7. Let numberOfCaptures be max(numberOfCaptures-1, 0).
+        // 8. Let i be 1.
+        // 9. Repeat, while i <= numberOfCaptures,
+        for (unsigned i = 1; i <= numberOfCaptures; ++i) {
+            // a. Let nextCapture be ? Get(z, ! ToString(i)).
+            // b. Perform ! CreateDataProperty(A, ! ToString(lengthA), nextCapture).
+            int sub = ovector[i * 2];
+            if (push(sub >= 0, sub, ovector[i * 2 + 1] - sub) == AbortSplit)
+                return;
+        }
+        
+        // 10. Let q be p.
+        matchPosition = position;
+    }
+}
+
 // ES 21.2.5.11 RegExp.prototype[@@split](string, limit)
 EncodedJSValue JSC_HOST_CALL regExpProtoFuncSplitFast(ExecState* exec)
 {
@@ -468,7 +546,8 @@
     RegExp* regexp = asRegExpObject(thisValue)->regExp();
 
     // 3. [handled by JS builtin] Let S be ? ToString(string).
-    String input = exec->argument(0).toString(exec)->value(exec);
+    JSString* inputString = exec->argument(0).toString(exec);
+    String input = inputString->value(exec);
     if (vm.exception())
         return JSValue::encode(jsUndefined());
     ASSERT(!input.isNull());
@@ -507,7 +586,7 @@
         // c. Perform ! CreateDataProperty(A, "0", S).
         // d. Return A.
         if (!regexp->match(vm, input, 0))
-            result->putDirectIndex(exec, 0, jsStringWithReuse(exec, thisValue, input));
+            result->putDirectIndex(exec, 0, inputString);
         return JSValue::encode(result);
     }
 
@@ -516,99 +595,79 @@
     // 19. Repeat, while q < size
     bool regExpIsSticky = regexp->sticky();
     bool regExpIsUnicode = regexp->unicode();
-    while (matchPosition < inputSize) {
-        Vector<int, 32> ovector;
-
-        // a. Perform ? Set(splitter, "lastIndex", q, true).
-        // b. Let z be ? RegExpExec(splitter, S).
-        int mpos = regexp->match(vm, input, matchPosition, ovector);
-
-        // c. If z is null, let q be AdvanceStringIndex(S, q, unicodeMatching).
-        if (mpos < 0) {
-            if (!regExpIsSticky)
-                break;
-            matchPosition = advanceStringIndex(input, inputSize, matchPosition, regExpIsUnicode);
-            continue;
-        }
-        if (static_cast<unsigned>(mpos) >= inputSize) {
-            // The spec redoes the RegExpExec starting at the next character of the input.
-            // But in our case, mpos < 0 means that the native regexp already searched all permutations
-            // and know that we won't be able to find a match for the separator even if we redo the
-            // RegExpExec starting at the next character of the input. So, just bail.
-            break;
-        }
-
-        // d. Else, z is not null
-        //    i. Let e be ? ToLength(? Get(splitter, "lastIndex")).
-        //   ii. Let e be min(e, size).
-        matchPosition = mpos;
-        unsigned matchEnd = ovector[1];
-
-        //  iii. If e = p, let q be AdvanceStringIndex(S, q, unicodeMatching).
-        if (matchEnd == position) {
-            matchPosition = advanceStringIndex(input, inputSize, matchPosition, regExpIsUnicode);
-            continue;
-        }
-        // if matchEnd == 0 then position should also be zero and thus matchEnd should equal position.
-        ASSERT(matchEnd);
-
-        //   iv. Else e != p,
-        {
-            unsigned numberOfCaptures = regexp->numSubpatterns();
-            unsigned newResultLength = resultLength + numberOfCaptures + 1;
-            if (newResultLength < numberOfCaptures || newResultLength >= MAX_STORAGE_VECTOR_INDEX) {
-                // Let's consider what's best for users here. We're about to increase the length of
-                // the split array beyond the maximum length that we can support efficiently. This
-                // will cause us to use a HashMap for the new entries after this point. That's going
-                // to result in a very long running time of this function and very large memory
-                // usage. In my experiments, JSC will sit spinning for minutes after getting here and
-                // it was using >4GB of memory and eventually grew to 8GB. It kept running without
-                // finishing until I killed it. That's probably not what the user wanted. The user,
-                // or the program that the user is running, probably made a mistake by calling this
-                // method in such a way that it resulted in such an obnoxious array. Therefore, to
-                // protect ourselves, we bail at this point.
-                throwOutOfMemoryError(exec);
-                return JSValue::encode(jsUndefined());
-            }
-
-            // 1. Let T be a String value equal to the substring of S consisting of the elements at indices p (inclusive) through q (exclusive).
-            // 2. Perform ! CreateDataProperty(A, ! ToString(lengthA), T).
-            result->putDirectIndex(exec, resultLength, jsSubstring(exec, thisValue, input, position, matchPosition - position));
-
-            // 3. Let lengthA be lengthA + 1.
-            // 4. If lengthA = lim, return A.
-            if (++resultLength == limit)
-                return JSValue::encode(result);
-
-            // 5. Let p be e.
-            position = matchEnd;
-
-            // 6. Let numberOfCaptures be ? ToLength(? Get(z, "length")).
-            // 7. Let numberOfCaptures be max(numberOfCaptures-1, 0).
-            // 8. Let i be 1.
-            // 9. Repeat, while i <= numberOfCaptures,
-            for (unsigned i = 1; i <= numberOfCaptures; ++i) {
-                // a. Let nextCapture be ? Get(z, ! ToString(i)).
-                // b. Perform ! CreateDataProperty(A, ! ToString(lengthA), nextCapture).
-                int sub = ovector[i * 2];
-                result->putDirectIndex(exec, resultLength, sub < 0 ? jsUndefined() : jsSubstring(exec, thisValue, input, sub, ovector[i * 2 + 1] - sub));
-
-                // c. Let i be i + 1.
-                // d. Let lengthA be lengthA + 1.
-                // e. If lengthA = lim, return A.
-                if (++resultLength == limit)
-                    return JSValue::encode(result);
-            }
-
-            // 10. Let q be p.
-            matchPosition = position;
-        }
+    
+    unsigned maxSizeForDirectPath = 100000;
+    
+    genericSplit(
+        vm, regexp, input, inputSize, position, matchPosition, regExpIsSticky, regExpIsUnicode,
+        [&] () -> SplitControl {
+            if (resultLength >= maxSizeForDirectPath)
+                return AbortSplit;
+            return ContinueSplit;
+        },
+        [&] (bool isDefined, unsigned start, unsigned length) -> SplitControl {
+            result->putDirectIndex(exec, resultLength++, isDefined ? JSRopeString::createSubstringOfResolved(vm, inputString, start, length) : jsUndefined());
+            if (resultLength >= limit)
+                return AbortSplit;
+            return ContinueSplit;
+        });
+    
+    if (resultLength >= limit)
+        return JSValue::encode(result);
+    if (resultLength < maxSizeForDirectPath) {
+        // 20. Let T be a String value equal to the substring of S consisting of the elements at indices p (inclusive) through size (exclusive).
+        // 21. Perform ! CreateDataProperty(A, ! ToString(lengthA), T).
+        result->putDirectIndex(exec, resultLength, JSRopeString::createSubstringOfResolved(vm, inputString, position, inputSize - position));
+        
+        // 22. Return A.
+        return JSValue::encode(result);
     }
-
+    
+    // Now do a dry run to see how big things get. Give up if they get absurd.
+    unsigned savedPosition = position;
+    unsigned savedMatchPosition = matchPosition;
+    unsigned dryRunCount = 0;
+    genericSplit(
+        vm, regexp, input, inputSize, position, matchPosition, regExpIsSticky, regExpIsUnicode,
+        [&] () -> SplitControl {
+            if (resultLength + dryRunCount >= MAX_STORAGE_VECTOR_LENGTH)
+                return AbortSplit;
+            return ContinueSplit;
+        },
+        [&] (bool, unsigned, unsigned) -> SplitControl {
+            dryRunCount++;
+            if (resultLength + dryRunCount >= limit)
+                return AbortSplit;
+            return ContinueSplit;
+        });
+    
+    if (resultLength + dryRunCount >= MAX_STORAGE_VECTOR_LENGTH) {
+        throwOutOfMemoryError(exec);
+        return JSValue::encode(jsUndefined());
+    }
+    
+    // OK, we know that if we finish the split, we won't have to OOM.
+    position = savedPosition;
+    matchPosition = savedMatchPosition;
+    
+    genericSplit(
+        vm, regexp, input, inputSize, position, matchPosition, regExpIsSticky, regExpIsUnicode,
+        [&] () -> SplitControl {
+            return ContinueSplit;
+        },
+        [&] (bool isDefined, unsigned start, unsigned length) -> SplitControl {
+            result->putDirectIndex(exec, resultLength++, isDefined ? JSRopeString::createSubstringOfResolved(vm, inputString, start, length) : jsUndefined());
+            if (resultLength >= limit)
+                return AbortSplit;
+            return ContinueSplit;
+        });
+    
+    if (resultLength >= limit)
+        return JSValue::encode(result);
+    
     // 20. Let T be a String value equal to the substring of S consisting of the elements at indices p (inclusive) through size (exclusive).
     // 21. Perform ! CreateDataProperty(A, ! ToString(lengthA), T).
-    result->putDirectIndex(exec, resultLength++, jsSubstring(exec, thisValue, input, position, inputSize - position));
-
+    result->putDirectIndex(exec, resultLength, JSRopeString::createSubstringOfResolved(vm, inputString, position, inputSize - position));
     // 22. Return A.
     return JSValue::encode(result);
 }

Modified: trunk/Source/_javascript_Core/runtime/StringObject.h (201466 => 201467)


--- trunk/Source/_javascript_Core/runtime/StringObject.h	2016-05-27 20:32:42 UTC (rev 201466)
+++ trunk/Source/_javascript_Core/runtime/StringObject.h	2016-05-27 20:45:08 UTC (rev 201467)
@@ -84,6 +84,9 @@
 // Helper for producing a JSString for 'string', where 'string' was been produced by
 // calling ToString on 'originalValue'. In cases where 'originalValue' already was a
 // string primitive we can just use this, otherwise we need to allocate a new JSString.
+// FIXME: Basically any use of this is bad. toString() returns a JSString* so we don't need to
+// pass around the originalValue; we could just pass around the JSString*. Then you don't need
+// this function. You just use the JSString* that toString() returned.
 static inline JSString* jsStringWithReuse(ExecState* exec, JSValue originalValue, const String& string)
 {
     if (originalValue.isString()) {
@@ -94,10 +97,9 @@
 }
 
 // Helper that tries to use the JSString substring sharing mechanism if 'originalValue' is a JSString.
-// FIXME: It would be even better if toString returned a JSString*, or if anyone who called
-// toString with the intent of later calling this functon first created a jsString from the String
-// that toString returned. That way, we'd get the substring optimization even when the input was
-// not a JSString.
+// FIXME: Basically any use of this is bad. toString() returns a JSString* so we don't need to
+// pass around the originalValue; we could just pass around the JSString*. And since we've
+// resolved it, we know that we can just allocate the substring cell directly.
 // https://bugs.webkit.org/show_bug.cgi?id=158140
 static inline JSString* jsSubstring(ExecState* exec, JSValue originalValue, const String& string, unsigned offset, unsigned length)
 {

Added: trunk/Source/_javascript_Core/tests/stress/big-split-captures.js (0 => 201467)


--- trunk/Source/_javascript_Core/tests/stress/big-split-captures.js	                        (rev 0)
+++ trunk/Source/_javascript_Core/tests/stress/big-split-captures.js	2016-05-27 20:45:08 UTC (rev 201467)
@@ -0,0 +1,27 @@
+"use strict";
+
+var bigString = "xyz";
+while (bigString.length < 200000)
+    bigString = bigString + bigString;
+
+if (bigString.length != 393216)
+    throw "Error: bad string length: " + bigString.length;
+
+var result = /(x)(y)(z)/[Symbol.split](bigString);
+
+if (result.length != 524289)
+    throw "Error: bad result array length: " + result.length;
+
+if (result[0] != "")
+    throw "Error: array does not start with an empty string.";
+
+for (var i = 1; i < result.length; i += 4) {
+    if (result[i + 0] != "x")
+        throw "Error: array does not contain \"x\" at i = " + i + " + 0: " + result[i + 0];
+    if (result[i + 1] != "y")
+        throw "Error: array does not contain \"y\" at i = " + i + " + 1: " + result[i + 1];
+    if (result[i + 2] != "z")
+        throw "Error: array does not contain \"z\" at i = " + i + " + 2: " + result[i + 2];
+    if (result[i + 3] != "")
+        throw "Error: array does not contain \"\" at i = " + i + " + 3: " + result[i + 3];
+}

Added: trunk/Source/_javascript_Core/tests/stress/big-split.js (0 => 201467)


--- trunk/Source/_javascript_Core/tests/stress/big-split.js	                        (rev 0)
+++ trunk/Source/_javascript_Core/tests/stress/big-split.js	2016-05-27 20:45:08 UTC (rev 201467)
@@ -0,0 +1,21 @@
+"use strict";
+
+var bigString = "xy";
+while (bigString.length < 200000)
+    bigString = bigString + bigString;
+
+if (bigString.length != 262144)
+    throw "Error: bad string length: " + bigString.length;
+
+var result = /x/[Symbol.split](bigString);
+
+if (result.length != 131073)
+    throw "Error: bad result array length: " + result.length;
+
+if (result[0] != "")
+    throw "Error: array does not start with an empty string.";
+
+for (var i = 1; i < result.length; ++i) {
+    if (result[i] != "y")
+        throw "Error: array does not contain \"y\" at i = " + i + ": " + result[i];
+}
_______________________________________________
webkit-changes mailing list
[email protected]
https://lists.webkit.org/mailman/listinfo/webkit-changes

Reply via email to