Title: [280744] trunk/Source/_javascript_Core
Revision
280744
Author
[email protected]
Date
2021-08-06 17:02:51 -0700 (Fri, 06 Aug 2021)

Log Message

[JSC] Yarr's character tracking for BoyerMoore search should be more precise
https://bugs.webkit.org/show_bug.cgi?id=228810

Reviewed by Saam Barati.

We should track up to 2 candidate characters without masking, so
that we can search a character without masking. To track candidates,
we introduce BoyerMooreCharacterCandidates.

We also add support for m_table because m_table can be used
for important CharacterClass like `\s`, and still that does not have
many character candidates if the mode is Char8.

To make `\s` work on Char8 case, we use Char8 / Char16 information
to filter characters that never appears in BoyreMoore search bitmap
construction.

* yarr/YarrJIT.cpp:
(JSC::Yarr::BoyerMooreInfo::BoyerMooreInfo):
(JSC::Yarr::BoyerMooreInfo::set):
(JSC::Yarr::BoyerMooreInfo::addCharacters):
(JSC::Yarr::BoyerMooreInfo::addRanges):
(JSC::Yarr::BoyerMooreInfo::create):
(JSC::Yarr::BoyerMooreInfo::createCandidateBitmap const):
* yarr/YarrJIT.h:
(JSC::Yarr::BoyerMooreCharacterCandidates::isValid const):
(JSC::Yarr::BoyerMooreCharacterCandidates::invalidate):
(JSC::Yarr::BoyerMooreCharacterCandidates::isEmpty const):
(JSC::Yarr::BoyerMooreCharacterCandidates::size const):
(JSC::Yarr::BoyerMooreCharacterCandidates::at const):
(JSC::Yarr::BoyerMooreCharacterCandidates::add):
(JSC::Yarr::BoyerMooreCharacterCandidates::merge):
(JSC::Yarr::BoyerMooreBitmap::characterCandidates const):
(JSC::Yarr::BoyerMooreBitmap::add):
(JSC::Yarr::BoyerMooreBitmap::addCharacters):
(JSC::Yarr::BoyerMooreBitmap::addRanges):
(JSC::Yarr::BoyerMooreBitmap::isMaskEffective const): Deleted.

Modified Paths

Diff

Modified: trunk/Source/_javascript_Core/ChangeLog (280743 => 280744)


--- trunk/Source/_javascript_Core/ChangeLog	2021-08-06 23:21:07 UTC (rev 280743)
+++ trunk/Source/_javascript_Core/ChangeLog	2021-08-07 00:02:51 UTC (rev 280744)
@@ -1,3 +1,43 @@
+2021-08-06  Yusuke Suzuki  <[email protected]>
+
+        [JSC] Yarr's character tracking for BoyerMoore search should be more precise
+        https://bugs.webkit.org/show_bug.cgi?id=228810
+
+        Reviewed by Saam Barati.
+
+        We should track up to 2 candidate characters without masking, so
+        that we can search a character without masking. To track candidates,
+        we introduce BoyerMooreCharacterCandidates.
+
+        We also add support for m_table because m_table can be used
+        for important CharacterClass like `\s`, and still that does not have
+        many character candidates if the mode is Char8.
+
+        To make `\s` work on Char8 case, we use Char8 / Char16 information
+        to filter characters that never appears in BoyreMoore search bitmap
+        construction.
+
+        * yarr/YarrJIT.cpp:
+        (JSC::Yarr::BoyerMooreInfo::BoyerMooreInfo):
+        (JSC::Yarr::BoyerMooreInfo::set):
+        (JSC::Yarr::BoyerMooreInfo::addCharacters):
+        (JSC::Yarr::BoyerMooreInfo::addRanges):
+        (JSC::Yarr::BoyerMooreInfo::create):
+        (JSC::Yarr::BoyerMooreInfo::createCandidateBitmap const):
+        * yarr/YarrJIT.h:
+        (JSC::Yarr::BoyerMooreCharacterCandidates::isValid const):
+        (JSC::Yarr::BoyerMooreCharacterCandidates::invalidate):
+        (JSC::Yarr::BoyerMooreCharacterCandidates::isEmpty const):
+        (JSC::Yarr::BoyerMooreCharacterCandidates::size const):
+        (JSC::Yarr::BoyerMooreCharacterCandidates::at const):
+        (JSC::Yarr::BoyerMooreCharacterCandidates::add):
+        (JSC::Yarr::BoyerMooreCharacterCandidates::merge):
+        (JSC::Yarr::BoyerMooreBitmap::characterCandidates const):
+        (JSC::Yarr::BoyerMooreBitmap::add):
+        (JSC::Yarr::BoyerMooreBitmap::addCharacters):
+        (JSC::Yarr::BoyerMooreBitmap::addRanges):
+        (JSC::Yarr::BoyerMooreBitmap::isMaskEffective const): Deleted.
+
 2021-08-05  Mikhail R. Gadelha  <[email protected]>
 
         Assertion failure when checking array in DFG (32 bits)

Modified: trunk/Source/_javascript_Core/yarr/YarrJIT.cpp (280743 => 280744)


--- trunk/Source/_javascript_Core/yarr/YarrJIT.cpp	2021-08-06 23:21:07 UTC (rev 280743)
+++ trunk/Source/_javascript_Core/yarr/YarrJIT.cpp	2021-08-07 00:02:51 UTC (rev 280744)
@@ -36,6 +36,7 @@
 #include "YarrMatchingContextHolder.h"
 #include <wtf/ASCIICType.h>
 #include <wtf/HexNumber.h>
+#include <wtf/ListDump.h>
 #include <wtf/Threading.h>
 
 
@@ -50,6 +51,15 @@
 JSC_ANNOTATE_JIT_OPERATION(_JITTarget_vmEntryToYarrJITAfter, vmEntryToYarrJITAfter);
 #endif
 
+void BoyerMooreFastCandidates::dump(PrintStream& out) const
+{
+    if (!isValid()) {
+        out.print("isValid:(false)");
+        return;
+    }
+    out.print("isValid:(true),characters:(", listDump(m_characters), ")");
+}
+
 class BoyerMooreInfo {
     WTF_MAKE_NONCOPYABLE(BoyerMooreInfo);
     WTF_MAKE_FAST_ALLOCATED(BoyerMooreInfo);
@@ -56,8 +66,9 @@
 public:
     static constexpr unsigned maxLength = 32;
 
-    explicit BoyerMooreInfo(unsigned length)
+    explicit BoyerMooreInfo(CharSize charSize, unsigned length)
         : m_characters(length)
+        , m_charSize(charSize)
     {
         ASSERT(this->length() <= maxLength);
     }
@@ -71,7 +82,7 @@
 
     void set(unsigned index, UChar32 character)
     {
-        m_characters[index].add(character);
+        m_characters[index].add(m_charSize, character);
     }
 
     void setAll(unsigned index)
@@ -81,26 +92,27 @@
 
     void addCharacters(unsigned index, const Vector<UChar32>& characters)
     {
-        m_characters[index].addCharacters(characters);
+        m_characters[index].addCharacters(m_charSize, characters);
     }
 
     void addRanges(unsigned index, const Vector<CharacterRange>& range)
     {
-        m_characters[index].addRanges(range);
+        m_characters[index].addRanges(m_charSize, range);
     }
 
-    static UniqueRef<BoyerMooreInfo> create(unsigned length)
+    static UniqueRef<BoyerMooreInfo> create(CharSize charSize, unsigned length)
     {
-        return makeUniqueRef<BoyerMooreInfo>(length);
+        return makeUniqueRef<BoyerMooreInfo>(charSize, length);
     }
 
     std::optional<std::tuple<unsigned, unsigned>> findWorthwhileCharacterSequenceForLookahead() const;
-    std::tuple<BoyerMooreBitmap::Map, bool> createCandidateBitmap(unsigned begin, unsigned end) const;
+    std::tuple<BoyerMooreBitmap::Map, BoyerMooreFastCandidates> createCandidateBitmap(unsigned begin, unsigned end) const;
 
 private:
     std::tuple<int, unsigned, unsigned> findBestCharacterSequence(unsigned numberOfCandidatesLimit) const;
 
     Vector<BoyerMooreBitmap> m_characters;
+    CharSize m_charSize;
 };
 
 std::tuple<int, unsigned, unsigned> BoyerMooreInfo::findBestCharacterSequence(unsigned numberOfCandidatesLimit) const
@@ -158,16 +170,16 @@
     return std::tuple { begin, end };
 }
 
-std::tuple<BoyerMooreBitmap::Map, bool> BoyerMooreInfo::createCandidateBitmap(unsigned begin, unsigned end) const
+std::tuple<BoyerMooreBitmap::Map, BoyerMooreFastCandidates> BoyerMooreInfo::createCandidateBitmap(unsigned begin, unsigned end) const
 {
     BoyerMooreBitmap::Map map { };
-    bool isMaskEffective = false;
+    BoyerMooreFastCandidates charactersFastPath;
     for (unsigned index = begin; index < end; ++index) {
         auto& bmBitmap = m_characters[index];
         map.merge(bmBitmap.map());
-        isMaskEffective |= bmBitmap.isMaskEffective();
+        charactersFastPath.merge(bmBitmap.charactersFastPath());
     }
-    return std::tuple { map, isMaskEffective };
+    return std::tuple { WTFMove(map), WTFMove(charactersFastPath) };
 }
 
 class YarrGenerator final : public YarrJITInfo, private MacroAssembler {
@@ -2409,7 +2421,7 @@
                         auto [beginIndex, endIndex] = *range;
                         ASSERT(endIndex <= alternative->m_minimumSize);
 
-                        auto [map, isMaskEffective] = op.m_bmInfo->createCandidateBitmap(beginIndex, endIndex);
+                        auto [map, charactersFastPath] = op.m_bmInfo->createCandidateBitmap(beginIndex, endIndex);
                         unsigned mapCount = map.count();
                         // If candiate characters are <= 2, checking each is better than using vector.
                         JumpList outOfLengthFailure;
@@ -2418,23 +2430,15 @@
                         // Patterns like /[]/ have zero candidates. Since it is rare, we do not do nothing for now.
                         if (!mapCount)
                             break;
-                        if (mapCount <= 2) {
-                            UChar32 character1 = map.findBit(0, true);
-                            ASSERT(character1 != BoyerMooreBitmap::Map::size());
-                            UChar32 character2 = 0xff;
-                            if (mapCount == 2) {
-                                character2 = map.findBit(character1 + 1, true);
-                                ASSERT(character2 != BoyerMooreBitmap::Map::size());
-                            }
-                            dataLogLnIf(Options::verboseRegExpCompilation(), "Found 1-or-2 characters lookahead character:(0x", hex(character1), "),character2:(", hex(character2), "),isMaskEffective:(", isMaskEffective,"),range:[", beginIndex, ", ", endIndex, ")");
+                        if (charactersFastPath.isValid() && !charactersFastPath.isEmpty()) {
+                            static_assert(BoyerMooreFastCandidates::maxSize == 2);
+                            dataLogLnIf(Options::verboseRegExpCompilation(), "Found characters fastpath lookahead ", charactersFastPath, " range:[", beginIndex, ", ", endIndex, ")");
 
                             auto loopHead = label();
                             readCharacter(m_checkedOffset - endIndex + 1, regT0);
-                            if (isMaskEffective)
-                                and32(TrustedImm32(BoyerMooreBitmap::mapMask), regT0, regT0);
-                            matched.append(branch32(Equal, regT0, TrustedImm32(character1)));
-                            if (mapCount == 2)
-                                matched.append(branch32(Equal, regT0, TrustedImm32(character2)));
+                            matched.append(branch32(Equal, regT0, TrustedImm32(charactersFastPath.at(0))));
+                            if (charactersFastPath.size() > 1)
+                                matched.append(branch32(Equal, regT0, TrustedImm32(charactersFastPath.at(1))));
                             outOfLengthFailure.append(jumpIfNoAvailableInput(endIndex - beginIndex));
                             jump().linkTo(loopHead, this);
                         } else {
@@ -3895,7 +3899,7 @@
         // FIXME: Support unicode flag.
         // https://bugs.webkit.org/show_bug.cgi?id=228611
         if (disjunction->m_minimumSize && !m_pattern.sticky() && !m_pattern.unicode()) {
-            auto bmInfo = BoyerMooreInfo::create(std::min<unsigned>(disjunction->m_minimumSize, BoyerMooreInfo::maxLength));
+            auto bmInfo = BoyerMooreInfo::create(m_charSize, std::min<unsigned>(disjunction->m_minimumSize, BoyerMooreInfo::maxLength));
             if (collectBoyerMooreInfo(disjunction, currentAlternativeIndex, bmInfo.get())) {
                 m_ops.last().m_bmInfo = bmInfo.ptr();
                 m_bmInfos.append(WTFMove(bmInfo));
@@ -3975,11 +3979,6 @@
                         ++cursor;
                         continue;
                     }
-                    if (characterClass.m_table) {
-                        bmInfo.setAll(cursor);
-                        ++cursor;
-                        continue;
-                    }
                     if (!characterClass.m_rangesUnicode.isEmpty())
                         bmInfo.addRanges(cursor, characterClass.m_rangesUnicode);
                     if (!characterClass.m_matchesUnicode.isEmpty())

Modified: trunk/Source/_javascript_Core/yarr/YarrJIT.h (280743 => 280744)


--- trunk/Source/_javascript_Core/yarr/YarrJIT.h	2021-08-06 23:21:07 UTC (rev 280743)
+++ trunk/Source/_javascript_Core/yarr/YarrJIT.h	2021-08-07 00:02:51 UTC (rev 280744)
@@ -60,6 +60,56 @@
     ExecutableMemoryAllocationFailure,
 };
 
+class BoyerMooreFastCandidates {
+    WTF_MAKE_FAST_ALLOCATED(BoyerMooreFastCandidates);
+public:
+    static constexpr unsigned maxSize = 2;
+    using CharacterVector = Vector<UChar32, maxSize>;
+
+    BoyerMooreFastCandidates() = default;
+
+    bool isValid() const { return m_isValid; }
+    void invalidate()
+    {
+        m_characters.clear();
+        m_isValid = false;
+    }
+
+    bool isEmpty() const { return m_characters.isEmpty(); }
+    unsigned size() const { return m_characters.size(); }
+    UChar32 at(unsigned index) const { return m_characters.at(index); }
+
+    void add(UChar32 character)
+    {
+        if (!isValid())
+            return;
+        if (!m_characters.contains(character)) {
+            if (m_characters.size() < maxSize)
+                m_characters.append(character);
+            else
+                invalidate();
+        }
+    }
+
+    void merge(const BoyerMooreFastCandidates& other)
+    {
+        if (!isValid())
+            return;
+        if (!other.isValid()) {
+            invalidate();
+            return;
+        }
+        for (unsigned index = 0; index < other.size(); ++index)
+            add(other.at(index));
+    }
+
+    void dump(PrintStream&) const;
+
+private:
+    CharacterVector m_characters;
+    bool m_isValid { true };
+};
+
 class BoyerMooreBitmap {
     WTF_MAKE_NONCOPYABLE(BoyerMooreBitmap);
     WTF_MAKE_FAST_ALLOCATED(BoyerMooreBitmap);
@@ -72,48 +122,62 @@
 
     unsigned count() const { return m_count; }
     const Map& map() const { return m_map; }
-    bool isMaskEffective() const { return m_isMaskEffective; }
+    const BoyerMooreFastCandidates& charactersFastPath() const { return m_charactersFastPath; }
 
-    void add(UChar32 character)
+    bool add(CharSize charSize, UChar32 character)
     {
         if (isAllSet())
-            return;
+            return false;
+        if (charSize == CharSize::Char8 && character > 0xff)
+            return true;
+        m_charactersFastPath.add(character);
         unsigned position = character & mapMask;
-        if (position != static_cast<unsigned>(character))
-            m_isMaskEffective = true;
         if (!m_map.get(position)) {
             m_map.set(position);
             ++m_count;
         }
+        return !isAllSet();
     }
 
-    void addCharacters(const Vector<UChar32>& characters)
+    void addCharacters(CharSize charSize, const Vector<UChar32>& characters)
     {
         if (isAllSet())
             return;
-        if (characters.size() >= mapSize) {
-            setAll();
-            return;
+        ASSERT(std::is_sorted(characters));
+        for (UChar32 character : characters) {
+            // Early return since characters are sorted.
+            if (charSize == CharSize::Char8 && character > 0xff)
+                return;
+            if (!add(charSize, character))
+                return;
         }
-        for (UChar32 character : characters)
-            add(character);
     }
 
-    void addRanges(const Vector<CharacterRange>& ranges)
+    void addRanges(CharSize charSize, const Vector<CharacterRange>& ranges)
     {
-        if (ranges.size() >= mapSize) {
-            setAll();
+        if (isAllSet())
             return;
-        }
+        ASSERT(std::is_sorted(characters, [](CharacterRange lhs, CharacterRange rhs) {
+                return lhs.begin < rhs.begin;
+            }));
         for (CharacterRange range : ranges) {
-            if (isAllSet())
-                return;
-            if (static_cast<unsigned>(range.end - range.begin + 1) >= mapSize) {
+            auto begin = range.begin;
+            auto end = range.end;
+            if (charSize == CharSize::Char8) {
+                // Early return since ranges are sorted.
+                if (begin > 0xff)
+                    return;
+                if (end > 0xff)
+                    end = 0xff;
+            }
+            if (static_cast<unsigned>(end - begin + 1) >= mapSize) {
                 setAll();
                 return;
             }
-            for (UChar32 character = range.begin; character <= range.end; ++character)
-                add(character);
+            for (UChar32 character = begin; character <= end; ++character) {
+                if (!add(charSize, character))
+                    return;
+            }
         }
     }
 
@@ -126,8 +190,8 @@
 
 private:
     Map m_map { };
+    BoyerMooreFastCandidates m_charactersFastPath;
     unsigned m_count { 0 };
-    bool m_isMaskEffective { false };
 };
 
 #if CPU(ARM64E)
_______________________________________________
webkit-changes mailing list
[email protected]
https://lists.webkit.org/mailman/listinfo/webkit-changes

Reply via email to