Diff
Modified: trunk/JSTests/ChangeLog (255133 => 255134)
--- trunk/JSTests/ChangeLog 2020-01-26 22:47:06 UTC (rev 255133)
+++ trunk/JSTests/ChangeLog 2020-01-26 23:28:44 UTC (rev 255134)
@@ -1,3 +1,12 @@
+2020-01-26 Alexey Shvayka <[email protected]>
+
+ Invalid ranges in character classes should be banned in unicode patterns
+ https://bugs.webkit.org/show_bug.cgi?id=206768
+
+ Reviewed by Darin Adler.
+
+ * test262/expectations.yaml: Mark 18 test cases as passing.
+
2020-01-24 Mark Lam <[email protected]>
IntlObject's cached strings should be immortal and safe for concurrent access.
Modified: trunk/JSTests/test262/expectations.yaml (255133 => 255134)
--- trunk/JSTests/test262/expectations.yaml 2020-01-26 22:47:06 UTC (rev 255133)
+++ trunk/JSTests/test262/expectations.yaml 2020-01-26 23:28:44 UTC (rev 255134)
@@ -1240,18 +1240,6 @@
test/built-ins/RegExp/named-groups/unicode-property-names.js:
default: 'SyntaxError: Invalid regular _expression_: invalid group specifier name'
strict mode: 'SyntaxError: Invalid regular _expression_: invalid group specifier name'
-test/built-ins/RegExp/property-escapes/character-class-range-end.js:
- default: 'Test262: This statement should not be evaluated.'
- strict mode: 'Test262: This statement should not be evaluated.'
-test/built-ins/RegExp/property-escapes/character-class-range-no-dash-end.js:
- default: 'Test262: This statement should not be evaluated.'
- strict mode: 'Test262: This statement should not be evaluated.'
-test/built-ins/RegExp/property-escapes/character-class-range-no-dash-start.js:
- default: 'Test262: This statement should not be evaluated.'
- strict mode: 'Test262: This statement should not be evaluated.'
-test/built-ins/RegExp/property-escapes/character-class-range-start.js:
- default: 'Test262: This statement should not be evaluated.'
- strict mode: 'Test262: This statement should not be evaluated.'
test/built-ins/RegExp/property-escapes/generated/Alphabetic.js:
default: 'Test262Error: `\p{Alphabetic}` should match U+001CFA (`ᳺ`)'
strict mode: 'Test262Error: `\p{Alphabetic}` should match U+001CFA (`ᳺ`)'
@@ -1609,9 +1597,6 @@
test/built-ins/RegExp/unicode_restricted_brackets.js:
default: 'Test262Error: RegExp("]", "u"): Expected a SyntaxError to be thrown but no exception was thrown at all'
strict mode: 'Test262Error: RegExp("]", "u"): Expected a SyntaxError to be thrown but no exception was thrown at all'
-test/built-ins/RegExp/unicode_restricted_character_class_escape.js:
- default: 'Test262Error: RegExp("[\d-a]", "u"): Expected a SyntaxError to be thrown but no exception was thrown at all'
- strict mode: 'Test262Error: RegExp("[\d-a]", "u"): Expected a SyntaxError to be thrown but no exception was thrown at all'
test/built-ins/RegExp/unicode_restricted_identity_escape.js:
default: "Test262Error: Invalid IdentityEscape in AtomEscape: '\\"
strict mode: "Test262Error: Invalid IdentityEscape in AtomEscape: '\\"
@@ -3423,18 +3408,6 @@
test/language/literals/regexp/u-invalid-legacy-octal-escape.js:
default: 'Test262: This statement should not be evaluated.'
strict mode: 'Test262: This statement should not be evaluated.'
-test/language/literals/regexp/u-invalid-non-empty-class-ranges-no-dash-a.js:
- default: 'Test262: This statement should not be evaluated.'
- strict mode: 'Test262: This statement should not be evaluated.'
-test/language/literals/regexp/u-invalid-non-empty-class-ranges-no-dash-ab.js:
- default: 'Test262: This statement should not be evaluated.'
- strict mode: 'Test262: This statement should not be evaluated.'
-test/language/literals/regexp/u-invalid-non-empty-class-ranges-no-dash-b.js:
- default: 'Test262: This statement should not be evaluated.'
- strict mode: 'Test262: This statement should not be evaluated.'
-test/language/literals/regexp/u-invalid-non-empty-class-ranges.js:
- default: 'Test262: This statement should not be evaluated.'
- strict mode: 'Test262: This statement should not be evaluated.'
test/language/literals/regexp/u-invalid-oob-decimal-escape.js:
default: 'Test262: This statement should not be evaluated.'
strict mode: 'Test262: This statement should not be evaluated.'
Modified: trunk/Source/_javascript_Core/ChangeLog (255133 => 255134)
--- trunk/Source/_javascript_Core/ChangeLog 2020-01-26 22:47:06 UTC (rev 255133)
+++ trunk/Source/_javascript_Core/ChangeLog 2020-01-26 23:28:44 UTC (rev 255134)
@@ -1,3 +1,29 @@
+2020-01-26 Alexey Shvayka <[email protected]>
+
+ Invalid ranges in character classes should be banned in unicode patterns
+ https://bugs.webkit.org/show_bug.cgi?id=206768
+
+ Reviewed by Darin Adler.
+
+ In ES5, grammar of CharacterRange was ambiguous, resulting in invalid ranges
+ like /[\d-a]/ being allowed. As of ES2015, invalid ranges are SyntaxError in
+ unicode patterns, yet still allowed in regular ones to avoid breaking the web.
+ (https://tc39.es/ecma262/#sec-patterns-static-semantics-early-errors-annexb)
+
+ This change adds SyntaxError for unicode patterns and updates explanatory
+ comments. ErrorCode::CharacterClassOutOfOrder is renamed for consistency
+ with newly added error code and ErrorCode::ParenthesesTypeInvalid.
+
+ * yarr/YarrErrorCode.cpp:
+ (JSC::Yarr::errorMessage):
+ (JSC::Yarr::errorToThrow):
+ * yarr/YarrErrorCode.h:
+ * yarr/YarrParser.h:
+ (JSC::Yarr::Parser::CharacterClassParserDelegate::CharacterClassParserDelegate):
+ (JSC::Yarr::Parser::CharacterClassParserDelegate::atomPatternCharacter):
+ (JSC::Yarr::Parser::CharacterClassParserDelegate::atomBuiltInCharacterClass):
+ (JSC::Yarr::Parser::parseCharacterClass):
+
2020-01-24 Mark Lam <[email protected]>
Move singleton Intl string locales out of JSGlobalObject.
Modified: trunk/Source/_javascript_Core/yarr/YarrErrorCode.cpp (255133 => 255134)
--- trunk/Source/_javascript_Core/yarr/YarrErrorCode.cpp 2020-01-26 22:47:06 UTC (rev 255133)
+++ trunk/Source/_javascript_Core/yarr/YarrErrorCode.cpp 2020-01-26 23:28:44 UTC (rev 255134)
@@ -35,26 +35,27 @@
#define REGEXP_ERROR_PREFIX "Invalid regular _expression_: "
// The order of this array must match the ErrorCode enum.
static const char* errorMessages[] = {
- nullptr, // NoError
- REGEXP_ERROR_PREFIX "regular _expression_ too large", // PatternTooLarge
- REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier", // QuantifierOutOfOrder
- REGEXP_ERROR_PREFIX "nothing to repeat", // QuantifierWithoutAtom
- REGEXP_ERROR_PREFIX "number too large in {} quantifier", // QuantifierTooLarge
- REGEXP_ERROR_PREFIX "missing )", // MissingParentheses
- REGEXP_ERROR_PREFIX "unmatched parentheses", // ParenthesesUnmatched
- REGEXP_ERROR_PREFIX "unrecognized character after (?", // ParenthesesTypeInvalid
- REGEXP_ERROR_PREFIX "invalid group specifier name", // InvalidGroupName
- REGEXP_ERROR_PREFIX "duplicate group specifier name", // DuplicateGroupName
- REGEXP_ERROR_PREFIX "missing terminating ] for character class", // CharacterClassUnmatched
- REGEXP_ERROR_PREFIX "range out of order in character class", // CharacterClassOutOfOrder
- REGEXP_ERROR_PREFIX "\\ at end of pattern", // EscapeUnterminated
- REGEXP_ERROR_PREFIX "invalid unicode {} escape", // InvalidUnicodeEscape
- REGEXP_ERROR_PREFIX "invalid backreference for unicode pattern", // InvalidBackreference
- REGEXP_ERROR_PREFIX "invalid escaped character for unicode pattern", // InvalidIdentityEscape
- REGEXP_ERROR_PREFIX "invalid property _expression_", // InvalidUnicodePropertyExpression
- REGEXP_ERROR_PREFIX "too many nested disjunctions", // TooManyDisjunctions
- REGEXP_ERROR_PREFIX "pattern exceeds string length limits", // OffsetTooLarge
- REGEXP_ERROR_PREFIX "invalid flags" // InvalidRegularExpressionFlags
+ nullptr, // NoError
+ REGEXP_ERROR_PREFIX "regular _expression_ too large", // PatternTooLarge
+ REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier", // QuantifierOutOfOrder
+ REGEXP_ERROR_PREFIX "nothing to repeat", // QuantifierWithoutAtom
+ REGEXP_ERROR_PREFIX "number too large in {} quantifier", // QuantifierTooLarge
+ REGEXP_ERROR_PREFIX "missing )", // MissingParentheses
+ REGEXP_ERROR_PREFIX "unmatched parentheses", // ParenthesesUnmatched
+ REGEXP_ERROR_PREFIX "unrecognized character after (?", // ParenthesesTypeInvalid
+ REGEXP_ERROR_PREFIX "invalid group specifier name", // InvalidGroupName
+ REGEXP_ERROR_PREFIX "duplicate group specifier name", // DuplicateGroupName
+ REGEXP_ERROR_PREFIX "missing terminating ] for character class", // CharacterClassUnmatched
+ REGEXP_ERROR_PREFIX "range out of order in character class", // CharacterClassRangeOutOfOrder
+ REGEXP_ERROR_PREFIX "invalid range in character class for unicode pattern", // CharacterClassRangeInvalid
+ REGEXP_ERROR_PREFIX "\\ at end of pattern", // EscapeUnterminated
+ REGEXP_ERROR_PREFIX "invalid unicode {} escape", // InvalidUnicodeEscape
+ REGEXP_ERROR_PREFIX "invalid backreference for unicode pattern", // InvalidBackreference
+ REGEXP_ERROR_PREFIX "invalid escaped character for unicode pattern", // InvalidIdentityEscape
+ REGEXP_ERROR_PREFIX "invalid property _expression_", // InvalidUnicodePropertyExpression
+ REGEXP_ERROR_PREFIX "too many nested disjunctions", // TooManyDisjunctions
+ REGEXP_ERROR_PREFIX "pattern exceeds string length limits", // OffsetTooLarge
+ REGEXP_ERROR_PREFIX "invalid flags" // InvalidRegularExpressionFlags
};
return errorMessages[static_cast<unsigned>(error)];
@@ -76,7 +77,8 @@
case ErrorCode::InvalidGroupName:
case ErrorCode::DuplicateGroupName:
case ErrorCode::CharacterClassUnmatched:
- case ErrorCode::CharacterClassOutOfOrder:
+ case ErrorCode::CharacterClassRangeOutOfOrder:
+ case ErrorCode::CharacterClassRangeInvalid:
case ErrorCode::EscapeUnterminated:
case ErrorCode::InvalidUnicodeEscape:
case ErrorCode::InvalidBackreference:
Modified: trunk/Source/_javascript_Core/yarr/YarrErrorCode.h (255133 => 255134)
--- trunk/Source/_javascript_Core/yarr/YarrErrorCode.h 2020-01-26 22:47:06 UTC (rev 255133)
+++ trunk/Source/_javascript_Core/yarr/YarrErrorCode.h 2020-01-26 23:28:44 UTC (rev 255134)
@@ -45,7 +45,8 @@
InvalidGroupName,
DuplicateGroupName,
CharacterClassUnmatched,
- CharacterClassOutOfOrder,
+ CharacterClassRangeOutOfOrder,
+ CharacterClassRangeInvalid,
EscapeUnterminated,
InvalidUnicodeEscape,
InvalidBackreference,
Modified: trunk/Source/_javascript_Core/yarr/YarrParser.h (255133 => 255134)
--- trunk/Source/_javascript_Core/yarr/YarrParser.h 2020-01-26 22:47:06 UTC (rev 255133)
+++ trunk/Source/_javascript_Core/yarr/YarrParser.h 2020-01-26 23:28:44 UTC (rev 255134)
@@ -54,9 +54,10 @@
*/
class CharacterClassParserDelegate {
public:
- CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
+ CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err, bool isUnicode)
: m_delegate(delegate)
, m_errorCode(err)
+ , m_isUnicode(isUnicode)
, m_state(Empty)
, m_character(0)
{
@@ -85,13 +86,14 @@
{
switch (m_state) {
case AfterCharacterClass:
- // Following a builtin character class we need look out for a hyphen.
+ // Following a built-in character class we need look out for a hyphen.
// We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
- // If we see a hyphen following a charater class then unlike usual
+ // If we see a hyphen following a character class then unlike usual
// we'll report it to the delegate immediately, and put ourself into
- // a poisoned state. Any following calls to add another character or
- // character class will result in an error. (A hypen following a
- // character-class is itself valid, but only at the end of a regex).
+ // a poisoned state. In a unicode pattern, any following calls to add
+ // another character or character class will result in syntax error.
+ // A hypen following a character class is itself valid, but only at
+ // the end of a regex.
if (hyphenIsRange && ch == '-') {
m_delegate.atomCharacterClassAtom('-');
m_state = AfterCharacterClassHyphen;
@@ -116,7 +118,7 @@
case CachedCharacterHyphen:
if (ch < m_character) {
- m_errorCode = ErrorCode::CharacterClassOutOfOrder;
+ m_errorCode = ErrorCode::CharacterClassRangeOutOfOrder;
return;
}
m_delegate.atomCharacterClassRange(m_character, ch);
@@ -123,14 +125,13 @@
m_state = Empty;
return;
- // See coment in atomBuiltInCharacterClass below.
- // This too is technically an error, per ECMA-262, and again we
- // we chose to allow this. Note a subtlely here that while we
- // diverge from the spec's definition of CharacterRange we do
- // remain in compliance with the grammar. For example, consider
- // the _expression_ /[\d-a-z]/. We comply with the grammar in
- // this case by not allowing a-z to be matched as a range.
+ // If we hit this case, we have an invalid range like /[\d-a]/.
+ // See coment in atomBuiltInCharacterClass() below.
case AfterCharacterClassHyphen:
+ if (m_isUnicode) {
+ m_errorCode = ErrorCode::CharacterClassRangeInvalid;
+ return;
+ }
m_delegate.atomCharacterClassAtom(ch);
m_state = Empty;
return;
@@ -151,23 +152,27 @@
FALLTHROUGH;
case Empty:
case AfterCharacterClass:
+ m_delegate.atomCharacterClassBuiltIn(classID, invert);
m_state = AfterCharacterClass;
- m_delegate.atomCharacterClassBuiltIn(classID, invert);
return;
// If we hit either of these cases, we have an invalid range that
- // looks something like /[x-\d]/ or /[\d-\d]/.
- // According to ECMA-262 this should be a syntax error, but
- // empirical testing shows this to break teh webz. Instead we
- // comply with to the ECMA-262 grammar, and assume the grammar to
- // have matched the range correctly, but tweak our interpretation
- // of CharacterRange. Effectively we implicitly handle the hyphen
- // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
+ // looks something like /[a-\d]/ or /[\d-\d]/.
+ // Since ES2015, this should be syntax error in a unicode pattern,
+ // yet gracefully handled in a regular regex to avoid breaking the web.
+ // Effectively we handle the hyphen as if it was (implicitly) escaped,
+ // e.g. /[\d-a-z]/ is treated as /[\d\-a\-z]/.
+ // See usages of CharacterRangeOrUnion abstract op in
+ // https://tc39.es/ecma262/#sec-regular-_expression_-patterns-semantics
case CachedCharacterHyphen:
m_delegate.atomCharacterClassAtom(m_character);
m_delegate.atomCharacterClassAtom('-');
FALLTHROUGH;
case AfterCharacterClassHyphen:
+ if (m_isUnicode) {
+ m_errorCode = ErrorCode::CharacterClassRangeInvalid;
+ return;
+ }
m_delegate.atomCharacterClassBuiltIn(classID, invert);
m_state = Empty;
return;
@@ -201,6 +206,7 @@
private:
Delegate& m_delegate;
ErrorCode& m_errorCode;
+ bool m_isUnicode;
enum CharacterClassConstructionState {
Empty,
CachedCharacter,
@@ -595,7 +601,7 @@
ASSERT(peek() == '[');
consume();
- CharacterClassParserDelegate characterClassConstructor(m_delegate, m_errorCode);
+ CharacterClassParserDelegate characterClassConstructor(m_delegate, m_errorCode, m_isUnicode);
characterClassConstructor.begin(tryConsume('^'));