Diff
Modified: trunk/Source/_javascript_Core/ChangeLog (243641 => 243642)
--- trunk/Source/_javascript_Core/ChangeLog 2019-03-29 05:18:47 UTC (rev 243641)
+++ trunk/Source/_javascript_Core/ChangeLog 2019-03-29 06:05:55 UTC (rev 243642)
@@ -1,3 +1,92 @@
+2019-03-28 Michael Saboff <[email protected]>
+
+ [YARR] Precompute BMP / non-BMP status when constructing character classes
+ https://bugs.webkit.org/show_bug.cgi?id=196296
+
+ Reviewed by Keith Miller.
+
+ Changed CharacterClass::m_hasNonBMPCharacters into a character width bit field which
+ indicateis if the class includes characters from either BMP, non-BMP or both ranges.
+ This allows the recognizing code to eliminate checks for the width of a matched
+ characters when the class has only one width. The character width is needed to
+ determine if we advance 1 or 2 character. Also, the pre-computed width of character
+ classes that contains either all BMP or all non-BMP characters allows the parser to
+ use fixed widths for terms using those character classes. Changed both the code gen
+ scripts and Yarr compiler to compute this bit field during the construction of
+ character classes.
+
+ For JIT'ed code of character classes that contain either all BMP or all non-BMP
+ characters, we can eliminate the generic check we were doing do compute how much
+ to advance after sucessfully matching a character in the class.
+
+ Generic isBMP check BMP only non-BMP only
+ -------------- -------------- --------------
+ inc %r9d inc %r9d add $0x2, %r9d
+ cmp $0x10000, %eax
+ jl isBMP
+ cmp %edx, %esi
+ jz atEndOfString
+ inc %r9d
+ inc %esi
+ isBMP:
+
+ For character classes that contained non-BMP characters, we were always generating
+ the code in the left column. The middle column is the code we generate for character
+ classes that contain only BMP characters. The right column is the code we now
+ generate if the character class has only non-BMP characters. In the fix width cases,
+ we can eliminate both the isBMP check as well as the atEndOfString check. The
+ atEndOfstring check is eliminated since we know how many characters this character
+ class requires and that check can be factored out to the beginning of the current
+ alternative. For character classes that contain both BMP and non-BMP characters,
+ we still generate the generic left column.
+
+ This change is a ~8% perf progression on UniPoker and a ~2% improvement on RexBench
+ as a whole.
+
+ * runtime/RegExp.cpp:
+ (JSC::RegExp::matchCompareWithInterpreter):
+ * runtime/RegExpInlines.h:
+ (JSC::RegExp::matchInline):
+ * yarr/YarrInterpreter.cpp:
+ (JSC::Yarr::Interpreter::checkCharacterClassDontAdvanceInputForNonBMP):
+ (JSC::Yarr::Interpreter::matchCharacterClass):
+ * yarr/YarrJIT.cpp:
+ (JSC::Yarr::YarrGenerator::optimizeAlternative):
+ (JSC::Yarr::YarrGenerator::matchCharacterClass):
+ (JSC::Yarr::YarrGenerator::advanceIndexAfterCharacterClassTermMatch):
+ (JSC::Yarr::YarrGenerator::tryReadUnicodeCharImpl):
+ (JSC::Yarr::YarrGenerator::generateCharacterClassOnce):
+ (JSC::Yarr::YarrGenerator::generateCharacterClassFixed):
+ (JSC::Yarr::YarrGenerator::generateCharacterClassGreedy):
+ (JSC::Yarr::YarrGenerator::backtrackCharacterClassGreedy):
+ (JSC::Yarr::YarrGenerator::generateCharacterClassNonGreedy):
+ (JSC::Yarr::YarrGenerator::backtrackCharacterClassNonGreedy):
+ (JSC::Yarr::YarrGenerator::generateEnter):
+ (JSC::Yarr::YarrGenerator::YarrGenerator):
+ (JSC::Yarr::YarrGenerator::compile):
+ * yarr/YarrPattern.cpp:
+ (JSC::Yarr::CharacterClassConstructor::CharacterClassConstructor):
+ (JSC::Yarr::CharacterClassConstructor::reset):
+ (JSC::Yarr::CharacterClassConstructor::charClass):
+ (JSC::Yarr::CharacterClassConstructor::addSorted):
+ (JSC::Yarr::CharacterClassConstructor::addSortedRange):
+ (JSC::Yarr::CharacterClassConstructor::hasNonBMPCharacters):
+ (JSC::Yarr::CharacterClassConstructor::characterWidths):
+ (JSC::Yarr::PatternTerm::dump):
+ (JSC::Yarr::anycharCreate):
+ * yarr/YarrPattern.h:
+ (JSC::Yarr::operator|):
+ (JSC::Yarr::operator&):
+ (JSC::Yarr::operator|=):
+ (JSC::Yarr::CharacterClass::CharacterClass):
+ (JSC::Yarr::CharacterClass::hasNonBMPCharacters):
+ (JSC::Yarr::CharacterClass::hasOneCharacterSize):
+ (JSC::Yarr::CharacterClass::hasOnlyNonBMPCharacters):
+ (JSC::Yarr::PatternTerm::invert const):
+ (JSC::Yarr::PatternTerm::invert): Deleted.
+ * yarr/create_regex_tables:
+ * yarr/generateYarrUnicodePropertyTables.py:
+
2019-03-28 Saam Barati <[email protected]>
BackwardsGraph needs to consider back edges as the backward's root successor
Modified: trunk/Source/_javascript_Core/runtime/RegExp.cpp (243641 => 243642)
--- trunk/Source/_javascript_Core/runtime/RegExp.cpp 2019-03-29 05:18:47 UTC (rev 243641)
+++ trunk/Source/_javascript_Core/runtime/RegExp.cpp 2019-03-29 06:05:55 UTC (rev 243642)
@@ -385,7 +385,7 @@
for (unsigned j = 0, i = 0; i < m_numSubpatterns + 1; j += 2, i++)
interpreterOffsetVector[j] = -1;
- interpreterResult = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, interpreterOffsetVector);
+ interpreterResult = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, reinterpret_cast<unsigned*>(interpreterOffsetVector));
if (jitResult != interpreterResult)
differences++;
@@ -402,7 +402,7 @@
dataLogF((segmentLen < 150) ? "\"%s\"\n" : "\"%148s...\"\n", s.utf8().data() + startOffset);
if (jitResult != interpreterResult) {
- dataLogF(" JIT result = %d, blah interpreted result = %d\n", jitResult, interpreterResult);
+ dataLogF(" JIT result = %d, interpreted result = %d\n", jitResult, interpreterResult);
differences--;
} else {
dataLogF(" Correct result = %d\n", jitResult);
Modified: trunk/Source/_javascript_Core/runtime/RegExpInlines.h (243641 => 243642)
--- trunk/Source/_javascript_Core/runtime/RegExpInlines.h 2019-03-29 05:18:47 UTC (rev 243641)
+++ trunk/Source/_javascript_Core/runtime/RegExpInlines.h 2019-03-29 06:05:55 UTC (rev 243642)
@@ -181,7 +181,10 @@
}
#if ENABLE(YARR_JIT_DEBUG)
- matchCompareWithInterpreter(s, startOffset, offsetVector, result);
+ if (m_state == JITCode) {
+ byteCodeCompileIfNecessary(&vm);
+ matchCompareWithInterpreter(s, startOffset, offsetVector, result);
+ }
#endif
} else
#endif
Modified: trunk/Source/_javascript_Core/yarr/YarrInterpreter.cpp (243641 => 243642)
--- trunk/Source/_javascript_Core/yarr/YarrInterpreter.cpp 2019-03-29 05:18:47 UTC (rev 243641)
+++ trunk/Source/_javascript_Core/yarr/YarrInterpreter.cpp 2019-03-29 06:05:55 UTC (rev 243642)
@@ -428,6 +428,12 @@
bool match = testCharacterClass(characterClass, input.readChecked(negativeInputOffset));
return invert ? !match : match;
}
+
+ bool checkCharacterClassDontAdvanceInputForNonBMP(CharacterClass* characterClass, unsigned negativeInputOffset)
+ {
+ int readCharacter = characterClass->hasOnlyNonBMPCharacters() ? input.readSurrogatePairChecked(negativeInputOffset) : input.readChecked(negativeInputOffset);
+ return testCharacterClass(characterClass, readCharacter);
+ }
bool tryConsumeBackReference(int matchBegin, int matchEnd, unsigned negativeInputOffset)
{
@@ -558,12 +564,21 @@
switch (term.atom.quantityType) {
case QuantifierFixedCount: {
if (unicode) {
+ CharacterClass* charClass = term.atom.characterClass;
backTrack->begin = input.getPos();
unsigned matchAmount = 0;
for (matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) {
- if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount)) {
- input.setPos(backTrack->begin);
- return false;
+ if (term.invert()) {
+ if (!checkCharacterClass(charClass, term.invert(), term.inputPosition - matchAmount)) {
+ input.setPos(backTrack->begin);
+ return false;
+ }
+ } else {
+ unsigned matchOffset = matchAmount * (charClass->hasOnlyNonBMPCharacters() ? 2 : 1);
+ if (!checkCharacterClassDontAdvanceInputForNonBMP(charClass, term.inputPosition - matchOffset)) {
+ input.setPos(backTrack->begin);
+ return false;
+ }
}
}
Modified: trunk/Source/_javascript_Core/yarr/YarrJIT.cpp (243641 => 243642)
--- trunk/Source/_javascript_Core/yarr/YarrJIT.cpp 2019-03-29 05:18:47 UTC (rev 243641)
+++ trunk/Source/_javascript_Core/yarr/YarrJIT.cpp 2019-03-29 06:05:55 UTC (rev 243642)
@@ -72,13 +72,14 @@
static const RegisterID regUnicodeInputAndTrail = ARM64Registers::x10;
static const RegisterID initialStart = ARM64Registers::x11;
static const RegisterID supplementaryPlanesBase = ARM64Registers::x12;
- static const RegisterID surrogateTagMask = ARM64Registers::x13;
- static const RegisterID leadingSurrogateTag = ARM64Registers::x14;
- static const RegisterID trailingSurrogateTag = ARM64Registers::x15;
+ static const RegisterID leadingSurrogateTag = ARM64Registers::x13;
+ static const RegisterID trailingSurrogateTag = ARM64Registers::x14;
+ static const RegisterID endOfStringAddress = ARM64Registers::x15;
static const RegisterID returnRegister = ARM64Registers::x0;
static const RegisterID returnRegister2 = ARM64Registers::x1;
+ const TrustedImm32 surrogateTagMask = TrustedImm32(0xfffffc00);
#define HAVE_INITIAL_START_REG
#define JIT_UNICODE_EXPRESSIONS
#elif CPU(MIPS)
@@ -143,12 +144,13 @@
#endif
static const RegisterID regUnicodeInputAndTrail = X86Registers::r13;
static const RegisterID leadingSurrogateTag = X86Registers::r14;
- static const RegisterID trailingSurrogateTag = X86Registers::r15;
+ static const RegisterID endOfStringAddress = X86Registers::r15;
static const RegisterID returnRegister = X86Registers::eax;
static const RegisterID returnRegister2 = X86Registers::edx;
const TrustedImm32 supplementaryPlanesBase = TrustedImm32(0x10000);
+ const TrustedImm32 trailingSurrogateTag = TrustedImm32(0xdc00);
const TrustedImm32 surrogateTagMask = TrustedImm32(0xfffffc00);
#define HAVE_INITIAL_START_REG
#define JIT_UNICODE_EXPRESSIONS
@@ -319,7 +321,7 @@
// We can move BMP only character classes after fixed character terms.
if ((term.type == PatternTerm::TypeCharacterClass)
&& (term.quantityType == QuantifierFixedCount)
- && (!m_decodeSurrogatePairs || (!term.characterClass->m_hasNonBMPCharacters && !term.m_invert))
+ && (!m_decodeSurrogatePairs || (term.characterClass->hasOneCharacterSize() && !term.m_invert))
&& (nextTerm.type == PatternTerm::TypePatternCharacter)
&& (nextTerm.quantityType == QuantifierFixedCount)) {
PatternTerm termCopy = term;
@@ -383,6 +385,7 @@
matchDest.append(branchTest8(charClass->m_tableInverted ? Zero : NonZero, tableEntry));
return;
}
+
JumpList unicodeFail;
if (charClass->m_matchesUnicode.size() || charClass->m_rangesUnicode.size()) {
JumpList isAscii;
@@ -448,6 +451,23 @@
unicodeFail.link(this);
}
+#ifdef JIT_UNICODE_EXPRESSIONS
+ void advanceIndexAfterCharacterClassTermMatch(const PatternTerm* term, JumpList& failures, const RegisterID character)
+ {
+ ASSERT(term->type == PatternTerm::TypeCharacterClass);
+
+ if (term->characterClass->hasOneCharacterSize() && !term->invert())
+ add32(TrustedImm32(term->characterClass->hasNonBMPCharacters() ? 2 : 1), index);
+ else {
+ add32(TrustedImm32(1), index);
+ failures.append(atEndOfInput());
+ Jump isBMPChar = branch32(LessThan, character, supplementaryPlanesBase);
+ add32(TrustedImm32(1), index);
+ isBMPChar.link(this);
+ }
+ }
+#endif
+
// Jumps if input not available; will have (incorrectly) incremented already!
Jump jumpIfNoAvailableInput(unsigned countToCheck = 0)
{
@@ -520,12 +540,12 @@
ASSERT(m_charSize == Char16);
JumpList notUnicode;
+
load16Unaligned(regUnicodeInputAndTrail, resultReg);
and32(surrogateTagMask, resultReg, regT2);
notUnicode.append(branch32(NotEqual, regT2, leadingSurrogateTag));
addPtr(TrustedImm32(2), regUnicodeInputAndTrail);
- getEffectiveAddress(BaseIndex(input, length, TimesTwo), regT2);
- notUnicode.append(branch32(AboveOrEqual, regUnicodeInputAndTrail, regT2));
+ notUnicode.append(branchPtr(AboveOrEqual, regUnicodeInputAndTrail, endOfStringAddress));
load16Unaligned(Address(regUnicodeInputAndTrail), regUnicodeInputAndTrail);
and32(surrogateTagMask, regUnicodeInputAndTrail, regT2);
notUnicode.append(branch32(NotEqual, regT2, trailingSurrogateTag));
@@ -1734,7 +1754,7 @@
}
}
#ifdef JIT_UNICODE_EXPRESSIONS
- if (m_decodeSurrogatePairs) {
+ if (m_decodeSurrogatePairs && (!term->characterClass->hasOneCharacterSize() || term->invert())) {
Jump isBMPChar = branch32(LessThan, character, supplementaryPlanesBase);
add32(TrustedImm32(1), index);
isBMPChar.link(this);
@@ -1768,11 +1788,18 @@
op.m_jumps.append(jumpIfNoAvailableInput());
move(index, countRegister);
- sub32(Imm32(term->quantityMaxCount.unsafeGet()), countRegister);
+ Checked<unsigned> scaledMaxCount = term->quantityMaxCount;
+
+#ifdef JIT_UNICODE_EXPRESSIONS
+ if (m_decodeSurrogatePairs && term->characterClass->hasOnlyNonBMPCharacters() && !term->invert())
+ scaledMaxCount *= 2;
+#endif
+ sub32(Imm32(scaledMaxCount.unsafeGet()), countRegister);
+
Label loop(this);
JumpList matchDest;
- readCharacter(m_checkedOffset - term->inputPosition - term->quantityMaxCount, character, countRegister);
+ readCharacter(m_checkedOffset - term->inputPosition - scaledMaxCount, character, countRegister);
// If we are matching the "any character" builtin class we only need to read the
// character and don't need to match as it will always succeed.
if (term->invert() || !term->characterClass->m_anyCharacter) {
@@ -1786,16 +1813,21 @@
}
}
- add32(TrustedImm32(1), countRegister);
#ifdef JIT_UNICODE_EXPRESSIONS
if (m_decodeSurrogatePairs) {
- Jump isBMPChar = branch32(LessThan, character, supplementaryPlanesBase);
- op.m_jumps.append(atEndOfInput());
+ if (term->characterClass->hasOneCharacterSize() && !term->invert())
+ add32(TrustedImm32(term->characterClass->hasNonBMPCharacters() ? 2 : 1), countRegister);
+ else {
+ add32(TrustedImm32(1), countRegister);
+ Jump isBMPChar = branch32(LessThan, character, supplementaryPlanesBase);
+ op.m_jumps.append(atEndOfInput());
+ add32(TrustedImm32(1), countRegister);
+ add32(TrustedImm32(1), index);
+ isBMPChar.link(this);
+ }
+ } else
+#endif
add32(TrustedImm32(1), countRegister);
- add32(TrustedImm32(1), index);
- isBMPChar.link(this);
- }
-#endif
branch32(NotEqual, countRegister, index).linkTo(loop, this);
}
void backtrackCharacterClassFixed(size_t opIndex)
@@ -1811,7 +1843,7 @@
const RegisterID character = regT0;
const RegisterID countRegister = regT1;
- if (m_decodeSurrogatePairs)
+ if (m_decodeSurrogatePairs && (!term->characterClass->hasOneCharacterSize() || term->invert()))
storeToFrame(index, term->frameLocation + BackTrackInfoCharacterClass::beginIndex());
move(TrustedImm32(0), countRegister);
@@ -1825,8 +1857,8 @@
} else {
JumpList matchDest;
readCharacter(m_checkedOffset - term->inputPosition, character);
- // If we are matching the "any character" builtin class we only need to read the
- // character and don't need to match as it will always succeed.
+ // If we are matching the "any character" builtin class for non-unicode patterns,
+ // we only need to read the character and don't need to match as it will always succeed.
if (!term->characterClass->m_anyCharacter) {
matchCharacterClass(character, matchDest, term->characterClass);
failures.append(jump());
@@ -1834,15 +1866,12 @@
matchDest.link(this);
}
- add32(TrustedImm32(1), index);
#ifdef JIT_UNICODE_EXPRESSIONS
- if (m_decodeSurrogatePairs) {
- failures.append(atEndOfInput());
- Jump isBMPChar = branch32(LessThan, character, supplementaryPlanesBase);
+ if (m_decodeSurrogatePairs)
+ advanceIndexAfterCharacterClassTermMatch(term, failures, character);
+ else
+#endif
add32(TrustedImm32(1), index);
- isBMPChar.link(this);
- }
-#endif
add32(TrustedImm32(1), countRegister);
if (term->quantityMaxCount != quantifyInfinite) {
@@ -1868,14 +1897,17 @@
loadFromFrame(term->frameLocation + BackTrackInfoCharacterClass::matchAmountIndex(), countRegister);
m_backtrackingState.append(branchTest32(Zero, countRegister));
sub32(TrustedImm32(1), countRegister);
+ storeToFrame(countRegister, term->frameLocation + BackTrackInfoCharacterClass::matchAmountIndex());
+
if (!m_decodeSurrogatePairs)
sub32(TrustedImm32(1), index);
+ else if (term->characterClass->hasOneCharacterSize() && !term->invert())
+ sub32(TrustedImm32(term->characterClass->hasNonBMPCharacters() ? 2 : 1), index);
else {
+ // Rematch one less
const RegisterID character = regT0;
loadFromFrame(term->frameLocation + BackTrackInfoCharacterClass::beginIndex(), index);
- // Rematch one less
- storeToFrame(countRegister, term->frameLocation + BackTrackInfoCharacterClass::matchAmountIndex());
Label rematchLoop(this);
readCharacter(m_checkedOffset - term->inputPosition, character);
@@ -1905,9 +1937,11 @@
move(TrustedImm32(0), countRegister);
op.m_reentry = label();
- if (m_decodeSurrogatePairs)
- storeToFrame(index, term->frameLocation + BackTrackInfoCharacterClass::beginIndex());
- storeToFrame(countRegister, term->frameLocation + BackTrackInfoCharacterClass::matchAmountIndex());
+ if (m_decodeSurrogatePairs) {
+ if (!term->characterClass->hasOneCharacterSize() || term->invert())
+ storeToFrame(index, term->frameLocation + BackTrackInfoCharacterClass::beginIndex());
+ storeToFrame(countRegister, term->frameLocation + BackTrackInfoCharacterClass::matchAmountIndex());
+ }
}
void backtrackCharacterClassNonGreedy(size_t opIndex)
@@ -1922,9 +1956,11 @@
m_backtrackingState.link(this);
- if (m_decodeSurrogatePairs)
- loadFromFrame(term->frameLocation + BackTrackInfoCharacterClass::beginIndex(), index);
- loadFromFrame(term->frameLocation + BackTrackInfoCharacterClass::matchAmountIndex(), countRegister);
+ if (m_decodeSurrogatePairs) {
+ if (!term->characterClass->hasOneCharacterSize() || term->invert())
+ loadFromFrame(term->frameLocation + BackTrackInfoCharacterClass::beginIndex(), index);
+ loadFromFrame(term->frameLocation + BackTrackInfoCharacterClass::matchAmountIndex(), countRegister);
+ }
nonGreedyFailures.append(atEndOfInput());
nonGreedyFailures.append(branch32(Equal, countRegister, Imm32(term->quantityMaxCount.unsafeGet())));
@@ -1931,8 +1967,8 @@
JumpList matchDest;
readCharacter(m_checkedOffset - term->inputPosition, character);
- // If we are matching the "any character" builtin class we only need to read the
- // character and don't need to match as it will always succeed.
+ // If we are matching the "any character" builtin class for non-unicode patterns,
+ // we only need to read the character and don't need to match as it will always succeed.
if (term->invert() || !term->characterClass->m_anyCharacter) {
matchCharacterClass(character, matchDest, term->characterClass);
@@ -1944,15 +1980,12 @@
}
}
- add32(TrustedImm32(1), index);
#ifdef JIT_UNICODE_EXPRESSIONS
- if (m_decodeSurrogatePairs) {
- nonGreedyFailures.append(atEndOfInput());
- Jump isBMPChar = branch32(LessThan, character, supplementaryPlanesBase);
+ if (m_decodeSurrogatePairs)
+ advanceIndexAfterCharacterClassTermMatch(term, nonGreedyFailures, character);
+ else
+#endif
add32(TrustedImm32(1), index);
- isBMPChar.link(this);
- }
-#endif
add32(TrustedImm32(1), countRegister);
jump(op.m_reentry);
@@ -3700,7 +3733,6 @@
push(X86Registers::r15);
move(TrustedImm32(0xd800), leadingSurrogateTag);
- move(TrustedImm32(0xdc00), trailingSurrogateTag);
}
// The ABI doesn't guarantee the upper bits are zero on unsigned arguments, so clear them ourselves.
zeroExtend32ToPtr(index, index);
@@ -3734,7 +3766,6 @@
if (m_decodeSurrogatePairs) {
pushPair(framePointerRegister, linkRegister);
move(TrustedImm32(0x10000), supplementaryPlanesBase);
- move(TrustedImm32(0xfffffc00), surrogateTagMask);
move(TrustedImm32(0xd800), leadingSurrogateTag);
move(TrustedImm32(0xdc00), trailingSurrogateTag);
}
@@ -3815,6 +3846,7 @@
, m_charSize(charSize)
, m_decodeSurrogatePairs(m_charSize == Char16 && m_pattern.unicode())
, m_unicodeIgnoreCase(m_pattern.unicode() && m_pattern.ignoreCase())
+ , m_fixedSizedAlternative(false)
, m_canonicalMode(m_pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2)
#if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS)
, m_containsNestedSubpatterns(false)
@@ -3869,6 +3901,11 @@
generateFailReturn();
hasInput.link(this);
+#ifdef JIT_UNICODE_EXPRESSIONS
+ if (m_decodeSurrogatePairs)
+ getEffectiveAddress(BaseIndex(input, length, TimesTwo), endOfStringAddress);
+#endif
+
#if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS)
if (m_containsNestedSubpatterns)
move(TrustedImm32(matchLimit), remainingMatchCount);
@@ -4163,6 +4200,7 @@
bool m_decodeSurrogatePairs;
bool m_unicodeIgnoreCase;
+ bool m_fixedSizedAlternative;
CanonicalMode m_canonicalMode;
#if ENABLE(YARR_JIT_ALL_PARENS_EXPRESSIONS)
bool m_containsNestedSubpatterns;
Modified: trunk/Source/_javascript_Core/yarr/YarrPattern.cpp (243641 => 243642)
--- trunk/Source/_javascript_Core/yarr/YarrPattern.cpp 2019-03-29 05:18:47 UTC (rev 243641)
+++ trunk/Source/_javascript_Core/yarr/YarrPattern.cpp 2019-03-29 06:05:55 UTC (rev 243642)
@@ -45,8 +45,8 @@
public:
CharacterClassConstructor(bool isCaseInsensitive, CanonicalMode canonicalMode)
: m_isCaseInsensitive(isCaseInsensitive)
- , m_hasNonBMPCharacters(false)
, m_anyCharacter(false)
+ , m_characterWidths(CharacterClassWidths::Unknown)
, m_canonicalMode(canonicalMode)
{
}
@@ -57,8 +57,8 @@
m_ranges.clear();
m_matchesUnicode.clear();
m_rangesUnicode.clear();
- m_hasNonBMPCharacters = false;
m_anyCharacter = false;
+ m_characterWidths = CharacterClassWidths::Unknown;
}
void append(const CharacterClass* other)
@@ -246,11 +246,11 @@
characterClass->m_ranges.swap(m_ranges);
characterClass->m_matchesUnicode.swap(m_matchesUnicode);
characterClass->m_rangesUnicode.swap(m_rangesUnicode);
- characterClass->m_hasNonBMPCharacters = hasNonBMPCharacters();
characterClass->m_anyCharacter = anyCharacter();
+ characterClass->m_characterWidths = characterWidths();
- m_hasNonBMPCharacters = false;
m_anyCharacter = false;
+ m_characterWidths = CharacterClassWidths::Unknown;
return characterClass;
}
@@ -266,8 +266,7 @@
unsigned pos = 0;
unsigned range = matches.size();
- if (!U_IS_BMP(ch))
- m_hasNonBMPCharacters = true;
+ m_characterWidths |= (U_IS_BMP(ch) ? CharacterClassWidths::HasBMPChars : CharacterClassWidths::HasNonBMPChars);
// binary chop, find position to insert char.
while (range) {
@@ -316,8 +315,10 @@
{
size_t end = ranges.size();
+ if (U_IS_BMP(lo))
+ m_characterWidths |= CharacterClassWidths::HasBMPChars;
if (!U_IS_BMP(hi))
- m_hasNonBMPCharacters = true;
+ m_characterWidths |= CharacterClassWidths::HasNonBMPChars;
// Simple linear scan - I doubt there are that many ranges anyway...
// feel free to fix this with something faster (eg binary chop).
@@ -408,9 +409,14 @@
bool hasNonBMPCharacters()
{
- return m_hasNonBMPCharacters;
+ return m_characterWidths & CharacterClassWidths::HasNonBMPChars;
}
+ CharacterClassWidths characterWidths()
+ {
+ return m_characterWidths;
+ }
+
bool anyCharacter()
{
return m_anyCharacter;
@@ -417,8 +423,9 @@
}
bool m_isCaseInsensitive : 1;
- bool m_hasNonBMPCharacters : 1;
bool m_anyCharacter : 1;
+ CharacterClassWidths m_characterWidths;
+
CanonicalMode m_canonicalMode;
Vector<UChar32> m_matches;
@@ -836,8 +843,16 @@
} else if (m_pattern.unicode()) {
term.frameLocation = currentCallFrameSize;
currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
- currentInputPosition += term.quantityMaxCount;
- alternative->m_hasFixedSize = false;
+ if (term.characterClass->hasOneCharacterSize() && !term.invert()) {
+ Checked<unsigned, RecordOverflow> tempCount = term.quantityMaxCount;
+ tempCount *= term.characterClass->hasNonBMPCharacters() ? 2 : 1;
+ if (tempCount.hasOverflowed())
+ return ErrorCode::OffsetTooLarge;
+ currentInputPosition += tempCount;
+ } else {
+ currentInputPosition += term.quantityMaxCount;
+ alternative->m_hasFixedSize = false;
+ }
} else
currentInputPosition += term.quantityMaxCount;
break;
@@ -1318,6 +1333,7 @@
break;
case TypeCharacterClass:
out.print("character class ");
+ out.printf("inputPosition %u ", inputPosition);
dumpCharacterClass(out, thisPattern, characterClass);
dumpQuantifier(out);
if (quantityType != QuantifierFixedCount || thisPattern->unicode())
@@ -1461,7 +1477,7 @@
auto characterClass = std::make_unique<CharacterClass>();
characterClass->m_ranges.append(CharacterRange(0x00, 0x7f));
characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x10ffff));
- characterClass->m_hasNonBMPCharacters = true;
+ characterClass->m_characterWidths = CharacterClassWidths::HasBothBMPAndNonBMP;
characterClass->m_anyCharacter = true;
return characterClass;
}
Modified: trunk/Source/_javascript_Core/yarr/YarrPattern.h (243641 => 243642)
--- trunk/Source/_javascript_Core/yarr/YarrPattern.h 2019-03-29 05:18:47 UTC (rev 243641)
+++ trunk/Source/_javascript_Core/yarr/YarrPattern.h 2019-03-29 06:05:55 UTC (rev 243642)
@@ -52,6 +52,29 @@
}
};
+enum struct CharacterClassWidths : unsigned char {
+ Unknown = 0x0,
+ HasBMPChars = 0x1,
+ HasNonBMPChars = 0x2,
+ HasBothBMPAndNonBMP = HasBMPChars | HasNonBMPChars
+};
+
+inline CharacterClassWidths operator|(CharacterClassWidths lhs, CharacterClassWidths rhs)
+{
+ return static_cast<CharacterClassWidths>(static_cast<unsigned>(lhs) | static_cast<unsigned>(rhs));
+}
+
+inline bool operator&(CharacterClassWidths lhs, CharacterClassWidths rhs)
+{
+ return static_cast<unsigned>(lhs) & static_cast<unsigned>(rhs);
+}
+
+inline CharacterClassWidths& operator|=(CharacterClassWidths& lhs, CharacterClassWidths rhs)
+{
+ lhs = lhs | rhs;
+ return lhs;
+}
+
struct CharacterClass {
WTF_MAKE_FAST_ALLOCATED;
public:
@@ -60,29 +83,34 @@
// specified matches and ranges)
CharacterClass()
: m_table(0)
- , m_hasNonBMPCharacters(false)
+ , m_characterWidths(CharacterClassWidths::Unknown)
, m_anyCharacter(false)
{
}
CharacterClass(const char* table, bool inverted)
: m_table(table)
+ , m_characterWidths(CharacterClassWidths::Unknown)
, m_tableInverted(inverted)
- , m_hasNonBMPCharacters(false)
, m_anyCharacter(false)
{
}
- CharacterClass(std::initializer_list<UChar32> matches, std::initializer_list<CharacterRange> ranges, std::initializer_list<UChar32> matchesUnicode, std::initializer_list<CharacterRange> rangesUnicode)
+ CharacterClass(std::initializer_list<UChar32> matches, std::initializer_list<CharacterRange> ranges, std::initializer_list<UChar32> matchesUnicode, std::initializer_list<CharacterRange> rangesUnicode, CharacterClassWidths widths)
: m_matches(matches)
, m_ranges(ranges)
, m_matchesUnicode(matchesUnicode)
, m_rangesUnicode(rangesUnicode)
, m_table(0)
+ , m_characterWidths(widths)
, m_tableInverted(false)
- , m_hasNonBMPCharacters(false)
, m_anyCharacter(false)
{
}
+ bool hasNonBMPCharacters() { return m_characterWidths & CharacterClassWidths::HasNonBMPChars; }
+
+ bool hasOneCharacterSize() { return m_characterWidths == CharacterClassWidths::HasBMPChars || m_characterWidths == CharacterClassWidths::HasNonBMPChars; }
+ bool hasOnlyNonBMPCharacters() { return m_characterWidths == CharacterClassWidths::HasNonBMPChars; }
+
Vector<UChar32> m_matches;
Vector<CharacterRange> m_ranges;
Vector<UChar32> m_matchesUnicode;
@@ -89,8 +117,8 @@
Vector<CharacterRange> m_rangesUnicode;
const char* m_table;
+ CharacterClassWidths m_characterWidths;
bool m_tableInverted : 1;
- bool m_hasNonBMPCharacters : 1;
bool m_anyCharacter : 1;
};
@@ -220,7 +248,7 @@
return PatternTerm(TypeAssertionWordBoundary, invert);
}
- bool invert()
+ bool invert() const
{
return m_invert;
}
Modified: trunk/Source/_javascript_Core/yarr/create_regex_tables (243641 => 243642)
--- trunk/Source/_javascript_Core/yarr/create_regex_tables 2019-03-29 05:18:47 UTC (rev 243641)
+++ trunk/Source/_javascript_Core/yarr/create_regex_tables 2019-03-29 06:05:55 UTC (rev 243642)
@@ -100,8 +100,13 @@
function += (" auto characterClass = std::make_unique<CharacterClass>(_%sData, false);\n" % (name))
else:
function += (" auto characterClass = std::make_unique<CharacterClass>();\n")
+ hasBMPCharacters = False
hasNonBMPCharacters = False
for (min, max) in ranges:
+ if min < 0x10000:
+ hasBMPCharacters = True
+ if max >= 0x10000:
+ hasNonBMPCharacters = True
if (min == max):
if (min > 127):
function += (" characterClass->m_matchesUnicode.append(0x%04x);\n" % min)
@@ -112,9 +117,7 @@
function += (" characterClass->m_rangesUnicode.append(CharacterRange(0x%04x, 0x%04x));\n" % (min, max))
else:
function += (" characterClass->m_ranges.append(CharacterRange(0x%02x, 0x%02x));\n" % (min, max))
- if max >= 0x10000:
- hasNonBMPCharacters = True
- function += (" characterClass->m_hasNonBMPCharacters = %s;\n" % ("true" if hasNonBMPCharacters else "false"))
+ function += (" characterClass->m_characterWidths = CharacterClassWidths::%s;\n" % (("Unknown", "HasBMPChars", "HasNonBMPChars", "HasBothBMPAndNonBMP")[int(hasNonBMPCharacters) * 2 + int(hasBMPCharacters)]))
function += (" return characterClass;\n")
function += ("}\n\n")
functions += function
Modified: trunk/Source/_javascript_Core/yarr/generateYarrUnicodePropertyTables.py (243641 => 243642)
--- trunk/Source/_javascript_Core/yarr/generateYarrUnicodePropertyTables.py 2019-03-29 05:18:47 UTC (rev 243641)
+++ trunk/Source/_javascript_Core/yarr/generateYarrUnicodePropertyTables.py 2019-03-29 06:05:55 UTC (rev 243642)
@@ -35,7 +35,7 @@
from hasher import stringHash
header = """/*
-* Copyright (C) 2017-2018 Apple Inc. All rights reserved.
+* Copyright (C) 2017-2019 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -225,6 +225,7 @@
self.name = name
self.aliases = []
self.index = len(PropertyData.allPropertyData)
+ self.hasBMPCharacters = False
self.hasNonBMPCharacters = False
self.matches = []
self.ranges = []
@@ -249,7 +250,9 @@
return "createCharacterClass{}".format(self.index)
def addMatch(self, codePoint):
- if codePoint > MaxBMP:
+ if codePoint <= MaxBMP:
+ self.hasBMPCharacters = True
+ else:
self.hasNonBMPCharacters = True
if codePoint <= lastASCIICodePoint:
if (len(self.matches) and self.matches[-1] > codePoint) or (len(self.ranges) and self.ranges[-1][1] > codePoint):
@@ -281,6 +284,8 @@
self.unicodeMatches.append(codePoint)
def addRange(self, lowCodePoint, highCodePoint):
+ if lowCodePoint <= MaxBMP:
+ self.hasBMPCharacters = True
if highCodePoint > MaxBMP:
self.hasNonBMPCharacters = True
if highCodePoint <= lastASCIICodePoint:
@@ -536,9 +541,9 @@
file.write("),\n")
file.write(" std::initializer_list<CharacterRange>(")
self.dumpMatchData(file, 4, self.unicodeRanges, lambda file, range: (file.write("{{{0:0=#6x}, {1:0=#6x}}}".format(range[0], range[1]))))
- file.write("));\n")
+ file.write("),\n")
- file.write(" characterClass->m_hasNonBMPCharacters = {};\n".format(("false", "true")[self.hasNonBMPCharacters]))
+ file.write(" CharacterClassWidths::{});\n".format(("Unknown", "HasBMPChars", "HasNonBMPChars", "HasBothBMPAndNonBMP")[int(self.hasNonBMPCharacters) * 2 + int(self.hasBMPCharacters)]))
file.write(" return characterClass;\n}\n\n")
@classmethod