Title: [225683] trunk/Source/_javascript_Core
Revision
225683
Author
msab...@apple.com
Date
2017-12-08 10:27:18 -0800 (Fri, 08 Dec 2017)

Log Message

YARR: Coalesce constructed character classes
https://bugs.webkit.org/show_bug.cgi?id=180537

Reviewed by JF Bastien.

When adding characters or character ranges to a character class being constructed,
we now coalesce adjacent characters and character ranges.  When we create a
character class after construction is complete, we do a final coalescing pass
across the character list and ranges to catch any remaining coalescing
opportunities.

Added an optimization for character classes that will match any character.
This is somewhat common in code created before the /s (dotAll) flag was added
to the engine.

* yarr/YarrInterpreter.cpp:
(JSC::Yarr::Interpreter::checkCharacterClass):
* yarr/YarrJIT.cpp:
(JSC::Yarr::YarrGenerator::generateCharacterClassOnce):
(JSC::Yarr::YarrGenerator::generateCharacterClassFixed):
(JSC::Yarr::YarrGenerator::generateCharacterClassGreedy):
(JSC::Yarr::YarrGenerator::backtrackCharacterClassNonGreedy):
* yarr/YarrPattern.cpp:
(JSC::Yarr::CharacterClassConstructor::CharacterClassConstructor):
(JSC::Yarr::CharacterClassConstructor::reset):
(JSC::Yarr::CharacterClassConstructor::charClass):
(JSC::Yarr::CharacterClassConstructor::addSorted):
(JSC::Yarr::CharacterClassConstructor::addSortedRange):
(JSC::Yarr::CharacterClassConstructor::mergeRangesFrom):
(JSC::Yarr::CharacterClassConstructor::coalesceTables):
(JSC::Yarr::CharacterClassConstructor::anyCharacter):
(JSC::Yarr::YarrPatternConstructor::atomCharacterClassEnd):
(JSC::Yarr::PatternTerm::dump):
(JSC::Yarr::anycharCreate):
* yarr/YarrPattern.h:
(JSC::Yarr::CharacterClass::CharacterClass):

Modified Paths

Diff

Modified: trunk/Source/_javascript_Core/ChangeLog (225682 => 225683)


--- trunk/Source/_javascript_Core/ChangeLog	2017-12-08 18:05:21 UTC (rev 225682)
+++ trunk/Source/_javascript_Core/ChangeLog	2017-12-08 18:27:18 UTC (rev 225683)
@@ -1,3 +1,42 @@
+2017-12-08  Michael Saboff  <msab...@apple.com>
+
+        YARR: Coalesce constructed character classes
+        https://bugs.webkit.org/show_bug.cgi?id=180537
+
+        Reviewed by JF Bastien.
+
+        When adding characters or character ranges to a character class being constructed,
+        we now coalesce adjacent characters and character ranges.  When we create a
+        character class after construction is complete, we do a final coalescing pass
+        across the character list and ranges to catch any remaining coalescing
+        opportunities.
+
+        Added an optimization for character classes that will match any character.
+        This is somewhat common in code created before the /s (dotAll) flag was added
+        to the engine.
+
+        * yarr/YarrInterpreter.cpp:
+        (JSC::Yarr::Interpreter::checkCharacterClass):
+        * yarr/YarrJIT.cpp:
+        (JSC::Yarr::YarrGenerator::generateCharacterClassOnce):
+        (JSC::Yarr::YarrGenerator::generateCharacterClassFixed):
+        (JSC::Yarr::YarrGenerator::generateCharacterClassGreedy):
+        (JSC::Yarr::YarrGenerator::backtrackCharacterClassNonGreedy):
+        * yarr/YarrPattern.cpp:
+        (JSC::Yarr::CharacterClassConstructor::CharacterClassConstructor):
+        (JSC::Yarr::CharacterClassConstructor::reset):
+        (JSC::Yarr::CharacterClassConstructor::charClass):
+        (JSC::Yarr::CharacterClassConstructor::addSorted):
+        (JSC::Yarr::CharacterClassConstructor::addSortedRange):
+        (JSC::Yarr::CharacterClassConstructor::mergeRangesFrom):
+        (JSC::Yarr::CharacterClassConstructor::coalesceTables):
+        (JSC::Yarr::CharacterClassConstructor::anyCharacter):
+        (JSC::Yarr::YarrPatternConstructor::atomCharacterClassEnd):
+        (JSC::Yarr::PatternTerm::dump):
+        (JSC::Yarr::anycharCreate):
+        * yarr/YarrPattern.h:
+        (JSC::Yarr::CharacterClass::CharacterClass):
+
 2017-12-07  Saam Barati  <sbar...@apple.com>
 
         Modify our dollar VM clflush intrinsic to aid in some perf testing

Modified: trunk/Source/_javascript_Core/yarr/YarrInterpreter.cpp (225682 => 225683)


--- trunk/Source/_javascript_Core/yarr/YarrInterpreter.cpp	2017-12-08 18:05:21 UTC (rev 225682)
+++ trunk/Source/_javascript_Core/yarr/YarrInterpreter.cpp	2017-12-08 18:27:18 UTC (rev 225683)
@@ -408,6 +408,9 @@
 
     bool checkCharacterClass(CharacterClass* characterClass, bool invert, unsigned negativeInputOffset)
     {
+        if (characterClass->m_anyCharacter)
+            return !invert;
+
         bool match = testCharacterClass(characterClass, input.readChecked(negativeInputOffset));
         return invert ? !match : match;
     }

Modified: trunk/Source/_javascript_Core/yarr/YarrJIT.cpp (225682 => 225683)


--- trunk/Source/_javascript_Core/yarr/YarrJIT.cpp	2017-12-08 18:05:21 UTC (rev 225682)
+++ trunk/Source/_javascript_Core/yarr/YarrJIT.cpp	2017-12-08 18:27:18 UTC (rev 225683)
@@ -1171,7 +1171,7 @@
         readCharacter(m_checkedOffset - term->inputPosition, character);
         // If we are matching the "any character" builtin class we only need to read the
         // character and don't need to match as it will always succeed.
-        if (term->invert() || term->characterClass != m_pattern.anyCharacterClass()) {
+        if (term->invert() || !term->characterClass->m_anyCharacter) {
             matchCharacterClass(character, matchDest, term->characterClass);
 
             if (term->invert())
@@ -1220,7 +1220,7 @@
         readCharacter(m_checkedOffset - term->inputPosition - term->quantityMaxCount, character, countRegister);
         // If we are matching the "any character" builtin class we only need to read the
         // character and don't need to match as it will always succeed.
-        if (term->invert() || term->characterClass != m_pattern.anyCharacterClass()) {
+        if (term->invert() || !term->characterClass->m_anyCharacter) {
             matchCharacterClass(character, matchDest, term->characterClass);
 
             if (term->invert())
@@ -1272,7 +1272,7 @@
             readCharacter(m_checkedOffset - term->inputPosition, character);
             // If we are matching the "any character" builtin class we only need to read the
             // character and don't need to match as it will always succeed.
-            if (term->characterClass != m_pattern.anyCharacterClass()) {
+            if (!term->characterClass->m_anyCharacter) {
                 matchCharacterClass(character, matchDest, term->characterClass);
                 failures.append(jump());
             }
@@ -1378,7 +1378,7 @@
         readCharacter(m_checkedOffset - term->inputPosition, character);
         // If we are matching the "any character" builtin class we only need to read the
         // character and don't need to match as it will always succeed.
-        if (term->invert() || term->characterClass != m_pattern.anyCharacterClass()) {
+        if (term->invert() || !term->characterClass->m_anyCharacter) {
             matchCharacterClass(character, matchDest, term->characterClass);
 
             if (term->invert())

Modified: trunk/Source/_javascript_Core/yarr/YarrPattern.cpp (225682 => 225683)


--- trunk/Source/_javascript_Core/yarr/YarrPattern.cpp	2017-12-08 18:05:21 UTC (rev 225682)
+++ trunk/Source/_javascript_Core/yarr/YarrPattern.cpp	2017-12-08 18:27:18 UTC (rev 225683)
@@ -48,6 +48,7 @@
     CharacterClassConstructor(bool isCaseInsensitive, CanonicalMode canonicalMode)
         : m_isCaseInsensitive(isCaseInsensitive)
         , m_hasNonBMPCharacters(false)
+        , m_anyCharacter(false)
         , m_canonicalMode(canonicalMode)
     {
     }
@@ -59,6 +60,7 @@
         m_matchesUnicode.clear();
         m_rangesUnicode.clear();
         m_hasNonBMPCharacters = false;
+        m_anyCharacter = false;
     }
 
     void append(const CharacterClass* other)
@@ -238,6 +240,8 @@
 
     std::unique_ptr<CharacterClass> charClass()
     {
+        coalesceTables();
+
         auto characterClass = std::make_unique<CharacterClass>();
 
         characterClass->m_matches.swap(m_matches);
@@ -245,7 +249,11 @@
         characterClass->m_matchesUnicode.swap(m_matchesUnicode);
         characterClass->m_rangesUnicode.swap(m_rangesUnicode);
         characterClass->m_hasNonBMPCharacters = hasNonBMPCharacters();
+        characterClass->m_anyCharacter = anyCharacter();
 
+        m_hasNonBMPCharacters = false;
+        m_anyCharacter = false;
+
         return characterClass;
     }
 
@@ -270,9 +278,31 @@
             int val = matches[pos+index] - ch;
             if (!val)
                 return;
-            else if (val > 0)
+            else if (val > 0) {
+                if (val == 1) {
+                    UChar32 lo = ch;
+                    UChar32 hi = ch + 1;
+                    matches.remove(pos + index);
+                    if (pos + index > 0 && matches[pos + index - 1] == ch - 1) {
+                        lo = ch - 1;
+                        matches.remove(pos + index - 1);
+                    }
+                    addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi);
+                    return;
+                }
                 range = index;
-            else {
+            } else {
+                if (val == -1) {
+                    UChar32 lo = ch - 1;
+                    UChar32 hi = ch;
+                    matches.remove(pos + index);
+                    if (pos + index + 1 < matches.size() && matches[pos + index + 1] == ch + 1) {
+                        hi = ch + 1;
+                        matches.remove(pos + index + 1);
+                    }
+                    addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi);
+                    return;
+                }
                 pos += (index+1);
                 range -= (index+1);
             }
@@ -286,7 +316,7 @@
 
     void addSortedRange(Vector<CharacterRange>& ranges, UChar32 lo, UChar32 hi)
     {
-        unsigned end = ranges.size();
+        size_t end = ranges.size();
 
         if (!U_IS_BMP(hi))
             m_hasNonBMPCharacters = true;
@@ -293,10 +323,10 @@
 
         // Simple linear scan - I doubt there are that many ranges anyway...
         // feel free to fix this with something faster (eg binary chop).
-        for (unsigned i = 0; i < end; ++i) {
+        for (size_t i = 0; i < end; ++i) {
             // does the new range fall before the current position in the array
             if (hi < ranges[i].begin) {
-                // optional optimization: concatenate appending ranges? - may not be worthwhile.
+                // Concatenate appending ranges.
                 if (hi == (ranges[i].begin - 1)) {
                     ranges[i].begin = lo;
                     return;
@@ -312,18 +342,7 @@
                 ranges[i].begin = std::min(ranges[i].begin, lo);
                 ranges[i].end = std::max(ranges[i].end, hi);
 
-                // now check if the new range can subsume any subsequent ranges.
-                unsigned next = i+1;
-                // each iteration of the loop we will either remove something from the list, or break the loop.
-                while (next < ranges.size()) {
-                    if (ranges[next].begin <= (ranges[i].end + 1)) {
-                        // the next entry now overlaps / concatenates this one.
-                        ranges[i].end = std::max(ranges[i].end, ranges[next].end);
-                        ranges.remove(next);
-                    } else
-                        break;
-                }
-                
+                mergeRangesFrom(ranges, i);
                 return;
             }
         }
@@ -332,13 +351,76 @@
         ranges.append(CharacterRange(lo, hi));
     }
 
+    void mergeRangesFrom(Vector<CharacterRange>& ranges, size_t index)
+    {
+        unsigned next = index + 1;
+
+        // each iteration of the loop we will either remove something from the list, or break out of the loop.
+        while (next < ranges.size()) {
+            if (ranges[next].begin <= (ranges[index].end + 1)) {
+                // the next entry now overlaps / concatenates with this one.
+                ranges[index].end = std::max(ranges[index].end, ranges[next].end);
+                ranges.remove(next);
+            } else
+                break;
+        }
+
+    }
+
+    void coalesceTables()
+    {
+        auto coalesceMatchesAndRanges = [&](Vector<UChar32>& matches, Vector<CharacterRange>& ranges) {
+
+            size_t matchesIndex = 0;
+            size_t rangesIndex = 0;
+
+            while (matchesIndex < matches.size() && rangesIndex < ranges.size()) {
+                while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].begin - 1)
+                    matchesIndex++;
+
+                if (matchesIndex < matches.size() && matches[matchesIndex] == ranges[rangesIndex].begin - 1) {
+                    ranges[rangesIndex].begin = matches[matchesIndex];
+                    matches.remove(matchesIndex);
+                }
+
+                while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].end + 1)
+                    matchesIndex++;
+
+                if (matchesIndex < matches.size()) {
+                    if (matches[matchesIndex] == ranges[rangesIndex].end + 1) {
+                        ranges[rangesIndex].end = matches[matchesIndex];
+                        matches.remove(matchesIndex);
+
+                        mergeRangesFrom(ranges, rangesIndex);
+                    } else
+                        matchesIndex++;
+                }
+            }
+        };
+
+        coalesceMatchesAndRanges(m_matches, m_ranges);
+        coalesceMatchesAndRanges(m_matchesUnicode, m_rangesUnicode);
+
+        if (!m_matches.size() && !m_matchesUnicode.size()
+            && m_ranges.size() == 1 && m_rangesUnicode.size() == 1
+            && m_ranges[0].begin == 0 && m_ranges[0].end == 0x7f
+            && m_rangesUnicode[0].begin == 0x80 && m_rangesUnicode[0].end == 0x10ffff)
+            m_anyCharacter = true;
+    }
+
     bool hasNonBMPCharacters()
     {
         return m_hasNonBMPCharacters;
     }
 
-    bool m_isCaseInsensitive;
-    bool m_hasNonBMPCharacters;
+    bool anyCharacter()
+    {
+        return m_anyCharacter;
+    }
+
+    bool m_isCaseInsensitive : 1;
+    bool m_hasNonBMPCharacters : 1;
+    bool m_anyCharacter : 1;
     CanonicalMode m_canonicalMode;
 
     Vector<UChar32> m_matches;
@@ -489,6 +571,11 @@
     void atomCharacterClassEnd()
     {
         auto newCharacterClass = m_characterClassConstructor.charClass();
+
+        if (!m_invertCharacterClass && newCharacterClass.get()->m_anyCharacter) {
+            m_alternative->m_terms.append(PatternTerm(m_pattern.anyCharacterClass(), false));
+            return;
+        }
         m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), m_invertCharacterClass));
         m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
     }
@@ -1180,7 +1267,7 @@
         break;
     case TypeCharacterClass:
         out.print("character class ");
-        if (characterClass == thisPattern->anyCharacterClass())
+        if (characterClass->m_anyCharacter)
             out.print("<any character>");
         else if (characterClass == thisPattern->newlineCharacterClass())
             out.print("<newline>");
@@ -1383,6 +1470,7 @@
     characterClass->m_ranges.append(CharacterRange(0x00, 0x7f));
     characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x10ffff));
     characterClass->m_hasNonBMPCharacters = true;
+    characterClass->m_anyCharacter = true;
     return characterClass;
 }
 

Modified: trunk/Source/_javascript_Core/yarr/YarrPattern.h (225682 => 225683)


--- trunk/Source/_javascript_Core/yarr/YarrPattern.h	2017-12-08 18:05:21 UTC (rev 225682)
+++ trunk/Source/_javascript_Core/yarr/YarrPattern.h	2017-12-08 18:27:18 UTC (rev 225683)
@@ -59,6 +59,7 @@
     CharacterClass()
         : m_table(0)
         , m_hasNonBMPCharacters(false)
+        , m_anyCharacter(false)
     {
     }
     CharacterClass(const char* table, bool inverted)
@@ -65,6 +66,7 @@
         : m_table(table)
         , m_tableInverted(inverted)
         , m_hasNonBMPCharacters(false)
+        , m_anyCharacter(false)
     {
     }
     CharacterClass(std::initializer_list<UChar32> matches, std::initializer_list<CharacterRange> ranges, std::initializer_list<UChar32> matchesUnicode, std::initializer_list<CharacterRange> rangesUnicode)
@@ -75,6 +77,7 @@
         , m_table(0)
         , m_tableInverted(false)
         , m_hasNonBMPCharacters(false)
+        , m_anyCharacter(false)
     {
     }
 
@@ -84,8 +87,9 @@
     Vector<CharacterRange> m_rangesUnicode;
 
     const char* m_table;
-    bool m_tableInverted;
-    bool m_hasNonBMPCharacters;
+    bool m_tableInverted : 1;
+    bool m_hasNonBMPCharacters : 1;
+    bool m_anyCharacter : 1;
 };
 
 enum QuantifierType {
_______________________________________________
webkit-changes mailing list
webkit-changes@lists.webkit.org
https://lists.webkit.org/mailman/listinfo/webkit-changes

Reply via email to