Diff
Modified: trunk/Source/_javascript_Core/ChangeLog (225682 => 225683)
--- trunk/Source/_javascript_Core/ChangeLog 2017-12-08 18:05:21 UTC (rev 225682)
+++ trunk/Source/_javascript_Core/ChangeLog 2017-12-08 18:27:18 UTC (rev 225683)
@@ -1,3 +1,42 @@
+2017-12-08 Michael Saboff <msab...@apple.com>
+
+ YARR: Coalesce constructed character classes
+ https://bugs.webkit.org/show_bug.cgi?id=180537
+
+ Reviewed by JF Bastien.
+
+ When adding characters or character ranges to a character class being constructed,
+ we now coalesce adjacent characters and character ranges. When we create a
+ character class after construction is complete, we do a final coalescing pass
+ across the character list and ranges to catch any remaining coalescing
+ opportunities.
+
+ Added an optimization for character classes that will match any character.
+ This is somewhat common in code created before the /s (dotAll) flag was added
+ to the engine.
+
+ * yarr/YarrInterpreter.cpp:
+ (JSC::Yarr::Interpreter::checkCharacterClass):
+ * yarr/YarrJIT.cpp:
+ (JSC::Yarr::YarrGenerator::generateCharacterClassOnce):
+ (JSC::Yarr::YarrGenerator::generateCharacterClassFixed):
+ (JSC::Yarr::YarrGenerator::generateCharacterClassGreedy):
+ (JSC::Yarr::YarrGenerator::backtrackCharacterClassNonGreedy):
+ * yarr/YarrPattern.cpp:
+ (JSC::Yarr::CharacterClassConstructor::CharacterClassConstructor):
+ (JSC::Yarr::CharacterClassConstructor::reset):
+ (JSC::Yarr::CharacterClassConstructor::charClass):
+ (JSC::Yarr::CharacterClassConstructor::addSorted):
+ (JSC::Yarr::CharacterClassConstructor::addSortedRange):
+ (JSC::Yarr::CharacterClassConstructor::mergeRangesFrom):
+ (JSC::Yarr::CharacterClassConstructor::coalesceTables):
+ (JSC::Yarr::CharacterClassConstructor::anyCharacter):
+ (JSC::Yarr::YarrPatternConstructor::atomCharacterClassEnd):
+ (JSC::Yarr::PatternTerm::dump):
+ (JSC::Yarr::anycharCreate):
+ * yarr/YarrPattern.h:
+ (JSC::Yarr::CharacterClass::CharacterClass):
+
2017-12-07 Saam Barati <sbar...@apple.com>
Modify our dollar VM clflush intrinsic to aid in some perf testing
Modified: trunk/Source/_javascript_Core/yarr/YarrInterpreter.cpp (225682 => 225683)
--- trunk/Source/_javascript_Core/yarr/YarrInterpreter.cpp 2017-12-08 18:05:21 UTC (rev 225682)
+++ trunk/Source/_javascript_Core/yarr/YarrInterpreter.cpp 2017-12-08 18:27:18 UTC (rev 225683)
@@ -408,6 +408,9 @@
bool checkCharacterClass(CharacterClass* characterClass, bool invert, unsigned negativeInputOffset)
{
+ if (characterClass->m_anyCharacter)
+ return !invert;
+
bool match = testCharacterClass(characterClass, input.readChecked(negativeInputOffset));
return invert ? !match : match;
}
Modified: trunk/Source/_javascript_Core/yarr/YarrJIT.cpp (225682 => 225683)
--- trunk/Source/_javascript_Core/yarr/YarrJIT.cpp 2017-12-08 18:05:21 UTC (rev 225682)
+++ trunk/Source/_javascript_Core/yarr/YarrJIT.cpp 2017-12-08 18:27:18 UTC (rev 225683)
@@ -1171,7 +1171,7 @@
readCharacter(m_checkedOffset - term->inputPosition, character);
// If we are matching the "any character" builtin class we only need to read the
// character and don't need to match as it will always succeed.
- if (term->invert() || term->characterClass != m_pattern.anyCharacterClass()) {
+ if (term->invert() || !term->characterClass->m_anyCharacter) {
matchCharacterClass(character, matchDest, term->characterClass);
if (term->invert())
@@ -1220,7 +1220,7 @@
readCharacter(m_checkedOffset - term->inputPosition - term->quantityMaxCount, character, countRegister);
// If we are matching the "any character" builtin class we only need to read the
// character and don't need to match as it will always succeed.
- if (term->invert() || term->characterClass != m_pattern.anyCharacterClass()) {
+ if (term->invert() || !term->characterClass->m_anyCharacter) {
matchCharacterClass(character, matchDest, term->characterClass);
if (term->invert())
@@ -1272,7 +1272,7 @@
readCharacter(m_checkedOffset - term->inputPosition, character);
// If we are matching the "any character" builtin class we only need to read the
// character and don't need to match as it will always succeed.
- if (term->characterClass != m_pattern.anyCharacterClass()) {
+ if (!term->characterClass->m_anyCharacter) {
matchCharacterClass(character, matchDest, term->characterClass);
failures.append(jump());
}
@@ -1378,7 +1378,7 @@
readCharacter(m_checkedOffset - term->inputPosition, character);
// If we are matching the "any character" builtin class we only need to read the
// character and don't need to match as it will always succeed.
- if (term->invert() || term->characterClass != m_pattern.anyCharacterClass()) {
+ if (term->invert() || !term->characterClass->m_anyCharacter) {
matchCharacterClass(character, matchDest, term->characterClass);
if (term->invert())
Modified: trunk/Source/_javascript_Core/yarr/YarrPattern.cpp (225682 => 225683)
--- trunk/Source/_javascript_Core/yarr/YarrPattern.cpp 2017-12-08 18:05:21 UTC (rev 225682)
+++ trunk/Source/_javascript_Core/yarr/YarrPattern.cpp 2017-12-08 18:27:18 UTC (rev 225683)
@@ -48,6 +48,7 @@
CharacterClassConstructor(bool isCaseInsensitive, CanonicalMode canonicalMode)
: m_isCaseInsensitive(isCaseInsensitive)
, m_hasNonBMPCharacters(false)
+ , m_anyCharacter(false)
, m_canonicalMode(canonicalMode)
{
}
@@ -59,6 +60,7 @@
m_matchesUnicode.clear();
m_rangesUnicode.clear();
m_hasNonBMPCharacters = false;
+ m_anyCharacter = false;
}
void append(const CharacterClass* other)
@@ -238,6 +240,8 @@
std::unique_ptr<CharacterClass> charClass()
{
+ coalesceTables();
+
auto characterClass = std::make_unique<CharacterClass>();
characterClass->m_matches.swap(m_matches);
@@ -245,7 +249,11 @@
characterClass->m_matchesUnicode.swap(m_matchesUnicode);
characterClass->m_rangesUnicode.swap(m_rangesUnicode);
characterClass->m_hasNonBMPCharacters = hasNonBMPCharacters();
+ characterClass->m_anyCharacter = anyCharacter();
+ m_hasNonBMPCharacters = false;
+ m_anyCharacter = false;
+
return characterClass;
}
@@ -270,9 +278,31 @@
int val = matches[pos+index] - ch;
if (!val)
return;
- else if (val > 0)
+ else if (val > 0) {
+ if (val == 1) {
+ UChar32 lo = ch;
+ UChar32 hi = ch + 1;
+ matches.remove(pos + index);
+ if (pos + index > 0 && matches[pos + index - 1] == ch - 1) {
+ lo = ch - 1;
+ matches.remove(pos + index - 1);
+ }
+ addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi);
+ return;
+ }
range = index;
- else {
+ } else {
+ if (val == -1) {
+ UChar32 lo = ch - 1;
+ UChar32 hi = ch;
+ matches.remove(pos + index);
+ if (pos + index + 1 < matches.size() && matches[pos + index + 1] == ch + 1) {
+ hi = ch + 1;
+ matches.remove(pos + index + 1);
+ }
+ addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi);
+ return;
+ }
pos += (index+1);
range -= (index+1);
}
@@ -286,7 +316,7 @@
void addSortedRange(Vector<CharacterRange>& ranges, UChar32 lo, UChar32 hi)
{
- unsigned end = ranges.size();
+ size_t end = ranges.size();
if (!U_IS_BMP(hi))
m_hasNonBMPCharacters = true;
@@ -293,10 +323,10 @@
// Simple linear scan - I doubt there are that many ranges anyway...
// feel free to fix this with something faster (eg binary chop).
- for (unsigned i = 0; i < end; ++i) {
+ for (size_t i = 0; i < end; ++i) {
// does the new range fall before the current position in the array
if (hi < ranges[i].begin) {
- // optional optimization: concatenate appending ranges? - may not be worthwhile.
+ // Concatenate appending ranges.
if (hi == (ranges[i].begin - 1)) {
ranges[i].begin = lo;
return;
@@ -312,18 +342,7 @@
ranges[i].begin = std::min(ranges[i].begin, lo);
ranges[i].end = std::max(ranges[i].end, hi);
- // now check if the new range can subsume any subsequent ranges.
- unsigned next = i+1;
- // each iteration of the loop we will either remove something from the list, or break the loop.
- while (next < ranges.size()) {
- if (ranges[next].begin <= (ranges[i].end + 1)) {
- // the next entry now overlaps / concatenates this one.
- ranges[i].end = std::max(ranges[i].end, ranges[next].end);
- ranges.remove(next);
- } else
- break;
- }
-
+ mergeRangesFrom(ranges, i);
return;
}
}
@@ -332,13 +351,76 @@
ranges.append(CharacterRange(lo, hi));
}
+ void mergeRangesFrom(Vector<CharacterRange>& ranges, size_t index)
+ {
+ unsigned next = index + 1;
+
+ // each iteration of the loop we will either remove something from the list, or break out of the loop.
+ while (next < ranges.size()) {
+ if (ranges[next].begin <= (ranges[index].end + 1)) {
+ // the next entry now overlaps / concatenates with this one.
+ ranges[index].end = std::max(ranges[index].end, ranges[next].end);
+ ranges.remove(next);
+ } else
+ break;
+ }
+
+ }
+
+ void coalesceTables()
+ {
+ auto coalesceMatchesAndRanges = [&](Vector<UChar32>& matches, Vector<CharacterRange>& ranges) {
+
+ size_t matchesIndex = 0;
+ size_t rangesIndex = 0;
+
+ while (matchesIndex < matches.size() && rangesIndex < ranges.size()) {
+ while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].begin - 1)
+ matchesIndex++;
+
+ if (matchesIndex < matches.size() && matches[matchesIndex] == ranges[rangesIndex].begin - 1) {
+ ranges[rangesIndex].begin = matches[matchesIndex];
+ matches.remove(matchesIndex);
+ }
+
+ while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].end + 1)
+ matchesIndex++;
+
+ if (matchesIndex < matches.size()) {
+ if (matches[matchesIndex] == ranges[rangesIndex].end + 1) {
+ ranges[rangesIndex].end = matches[matchesIndex];
+ matches.remove(matchesIndex);
+
+ mergeRangesFrom(ranges, rangesIndex);
+ } else
+ matchesIndex++;
+ }
+ }
+ };
+
+ coalesceMatchesAndRanges(m_matches, m_ranges);
+ coalesceMatchesAndRanges(m_matchesUnicode, m_rangesUnicode);
+
+ if (!m_matches.size() && !m_matchesUnicode.size()
+ && m_ranges.size() == 1 && m_rangesUnicode.size() == 1
+ && m_ranges[0].begin == 0 && m_ranges[0].end == 0x7f
+ && m_rangesUnicode[0].begin == 0x80 && m_rangesUnicode[0].end == 0x10ffff)
+ m_anyCharacter = true;
+ }
+
bool hasNonBMPCharacters()
{
return m_hasNonBMPCharacters;
}
- bool m_isCaseInsensitive;
- bool m_hasNonBMPCharacters;
+ bool anyCharacter()
+ {
+ return m_anyCharacter;
+ }
+
+ bool m_isCaseInsensitive : 1;
+ bool m_hasNonBMPCharacters : 1;
+ bool m_anyCharacter : 1;
CanonicalMode m_canonicalMode;
Vector<UChar32> m_matches;
@@ -489,6 +571,11 @@
void atomCharacterClassEnd()
{
auto newCharacterClass = m_characterClassConstructor.charClass();
+
+ if (!m_invertCharacterClass && newCharacterClass.get()->m_anyCharacter) {
+ m_alternative->m_terms.append(PatternTerm(m_pattern.anyCharacterClass(), false));
+ return;
+ }
m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), m_invertCharacterClass));
m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
}
@@ -1180,7 +1267,7 @@
break;
case TypeCharacterClass:
out.print("character class ");
- if (characterClass == thisPattern->anyCharacterClass())
+ if (characterClass->m_anyCharacter)
out.print("<any character>");
else if (characterClass == thisPattern->newlineCharacterClass())
out.print("<newline>");
@@ -1383,6 +1470,7 @@
characterClass->m_ranges.append(CharacterRange(0x00, 0x7f));
characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x10ffff));
characterClass->m_hasNonBMPCharacters = true;
+ characterClass->m_anyCharacter = true;
return characterClass;
}
Modified: trunk/Source/_javascript_Core/yarr/YarrPattern.h (225682 => 225683)
--- trunk/Source/_javascript_Core/yarr/YarrPattern.h 2017-12-08 18:05:21 UTC (rev 225682)
+++ trunk/Source/_javascript_Core/yarr/YarrPattern.h 2017-12-08 18:27:18 UTC (rev 225683)
@@ -59,6 +59,7 @@
CharacterClass()
: m_table(0)
, m_hasNonBMPCharacters(false)
+ , m_anyCharacter(false)
{
}
CharacterClass(const char* table, bool inverted)
@@ -65,6 +66,7 @@
: m_table(table)
, m_tableInverted(inverted)
, m_hasNonBMPCharacters(false)
+ , m_anyCharacter(false)
{
}
CharacterClass(std::initializer_list<UChar32> matches, std::initializer_list<CharacterRange> ranges, std::initializer_list<UChar32> matchesUnicode, std::initializer_list<CharacterRange> rangesUnicode)
@@ -75,6 +77,7 @@
, m_table(0)
, m_tableInverted(false)
, m_hasNonBMPCharacters(false)
+ , m_anyCharacter(false)
{
}
@@ -84,8 +87,9 @@
Vector<CharacterRange> m_rangesUnicode;
const char* m_table;
- bool m_tableInverted;
- bool m_hasNonBMPCharacters;
+ bool m_tableInverted : 1;
+ bool m_hasNonBMPCharacters : 1;
+ bool m_anyCharacter : 1;
};
enum QuantifierType {