Revision: 6225
Author: [email protected]
Date: Fri Jan 7 04:35:42 2011
Log: Change interpretation of malformed \c? escapes in RegExp to match JSC.
Review URL: http://codereview.chromium.org/6171001
http://code.google.com/p/v8/source/detail?r=6225
Modified:
/branches/bleeding_edge/src/parser.cc
/branches/bleeding_edge/src/parser.h
/branches/bleeding_edge/src/scanner-base.cc
/branches/bleeding_edge/test/cctest/test-parsing.cc
/branches/bleeding_edge/test/cctest/test-regexp.cc
/branches/bleeding_edge/test/mjsunit/regexp.js
=======================================
--- /branches/bleeding_edge/src/parser.cc Thu Jan 6 06:00:50 2011
+++ /branches/bleeding_edge/src/parser.cc Fri Jan 7 04:35:42 2011
@@ -4022,9 +4022,21 @@
builder->AddCharacter('\v');
break;
case 'c': {
- Advance(2);
- uc32 control = ParseControlLetterEscape();
- builder->AddCharacter(control);
+ Advance();
+ uc32 controlLetter = Next();
+ // Special case if it is an ASCII letter.
+ // Convert lower case letters to uppercase.
+ uc32 letter = controlLetter & ~('a' ^ 'A');
+ if (letter < 'A' || 'Z' < letter) {
+ // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
+ // This is outside the specification. We match JSC in
+ // reading the backslash as a literal character instead
+ // of as starting an escape.
+ builder->AddCharacter('\\');
+ } else {
+ Advance(2);
+ builder->AddCharacter(controlLetter & 0x1f);
+ }
break;
}
case 'x': {
@@ -4297,23 +4309,6 @@
*max_out = max;
return true;
}
-
-
-// Upper and lower case letters differ by one bit.
-STATIC_CHECK(('a' ^ 'A') == 0x20);
-
-uc32 RegExpParser::ParseControlLetterEscape() {
- if (!has_more())
- return 'c';
- uc32 letter = current() & ~(0x20); // Collapse upper and lower case
letters.
- if (letter < 'A' || 'Z' < letter) {
- // Non-spec error-correction: "\c" followed by non-control letter is
- // interpreted as an IdentityEscape of 'c'.
- return 'c';
- }
- Advance();
- return letter & 0x1f; // Remainder modulo 32, per specification.
-}
uc32 RegExpParser::ParseOctalLiteral() {
@@ -4381,9 +4376,23 @@
case 'v':
Advance();
return '\v';
- case 'c':
- Advance();
- return ParseControlLetterEscape();
+ case 'c': {
+ uc32 controlLetter = Next();
+ uc32 letter = controlLetter & ~('A' ^ 'a');
+ // For compatibility with JSC, inside a character class
+ // we also accept digits and underscore as control characters.
+ if ((controlLetter >= '0' && controlLetter <= '9') ||
+ controlLetter == '_' ||
+ (letter >= 'A' && letter <= 'Z')) {
+ Advance(2);
+ // Control letters mapped to ASCII control characters in the range
+ // 0x00-0x1f.
+ return controlLetter & 0x1f;
+ }
+ // We match JSC in reading the backslash as a literal
+ // character instead of as starting an escape.
+ return '\\';
+ }
case '0': case '1': case '2': case '3': case '4': case '5':
case '6': case '7':
// For compatibility, we interpret a decimal escape that isn't
=======================================
--- /branches/bleeding_edge/src/parser.h Wed Dec 22 12:14:19 2010
+++ /branches/bleeding_edge/src/parser.h Fri Jan 7 04:35:42 2011
@@ -321,7 +321,6 @@
// and sets the value if it is.
bool ParseHexEscape(int length, uc32* value);
- uc32 ParseControlLetterEscape();
uc32 ParseOctalLiteral();
// Tries to parse the input as a back reference. If successful it
=======================================
--- /branches/bleeding_edge/src/scanner-base.cc Wed Dec 22 12:14:19 2010
+++ /branches/bleeding_edge/src/scanner-base.cc Fri Jan 7 04:35:42 2011
@@ -731,11 +731,18 @@
while (c0_ != '/' || in_character_class) {
if (ScannerConstants::kIsLineTerminator.get(c0_) || c0_ < 0) return
false;
- if (c0_ == '\\') { // escaped character
+ if (c0_ == '\\') { // Escape sequence.
AddLiteralCharAdvance();
if (ScannerConstants::kIsLineTerminator.get(c0_) || c0_ < 0) return
false;
AddLiteralCharAdvance();
- } else { // unescaped character
+ // If the escape allows more characters, i.e., \x??, \u????, or \c?,
+ // only "safe" characters are allowed (letters, digits, underscore),
+ // otherwise the escape isn't valid and the invalid character has
+ // its normal meaning. I.e., we can just continue scanning without
+ // worrying whether the following characters are part of the escape
+ // or not, since any '/', '\\' or '[' is guaranteed to not be part
+ // of the escape sequence.
+ } else { // Unescaped character.
if (c0_ == '[') in_character_class = true;
if (c0_ == ']') in_character_class = false;
AddLiteralCharAdvance();
=======================================
--- /branches/bleeding_edge/test/cctest/test-parsing.cc Wed Dec 22 12:14:19
2010
+++ /branches/bleeding_edge/test/cctest/test-parsing.cc Fri Jan 7 04:35:42
2011
@@ -645,3 +645,58 @@
TestStreamScanner(&stream3, expectations3, 1, 1 + i);
}
}
+
+
+void TestScanRegExp(const char* re_source, const char* expected) {
+ i::Utf8ToUC16CharacterStream stream(
+ reinterpret_cast<const i::byte*>(re_source),
+ static_cast<unsigned>(strlen(re_source)));
+ i::V8JavaScriptScanner scanner;
+ scanner.Initialize(&stream);
+
+ i::Token::Value start = scanner.peek();
+ CHECK(start == i::Token::DIV || start == i::Token::ASSIGN_DIV);
+ CHECK(scanner.ScanRegExpPattern(start == i::Token::ASSIGN_DIV));
+ scanner.Next(); // Current token is now the regexp literal.
+ CHECK(scanner.is_literal_ascii());
+ i::Vector<const char> actual = scanner.literal_ascii_string();
+ for (int i = 0; i < actual.length(); i++) {
+ CHECK_NE('\0', expected[i]);
+ CHECK_EQ(expected[i], actual[i]);
+ }
+}
+
+
+TEST(RegExpScanning) {
+ // RegExp token with added garbage at the end. The scanner should only
+ // scan the RegExp until the terminating slash just before "flipperwald".
+ TestScanRegExp("/b/flipperwald", "b");
+ // Incomplete escape sequences doesn't hide the terminating slash.
+ TestScanRegExp("/\\x/flipperwald", "\\x");
+ TestScanRegExp("/\\u/flipperwald", "\\u");
+ TestScanRegExp("/\\u1/flipperwald", "\\u1");
+ TestScanRegExp("/\\u12/flipperwald", "\\u12");
+ TestScanRegExp("/\\u123/flipperwald", "\\u123");
+ TestScanRegExp("/\\c/flipperwald", "\\c");
+ TestScanRegExp("/\\c//flipperwald", "\\c");
+ // Slashes inside character classes are not terminating.
+ TestScanRegExp("/[/]/flipperwald", "[/]");
+ TestScanRegExp("/[\\s-/]/flipperwald", "[\\s-/]");
+ // Incomplete escape sequences inside a character class doesn't hide
+ // the end of the character class.
+ TestScanRegExp("/[\\c/]/flipperwald", "[\\c/]");
+ TestScanRegExp("/[\\c]/flipperwald", "[\\c]");
+ TestScanRegExp("/[\\x]/flipperwald", "[\\x]");
+ TestScanRegExp("/[\\x1]/flipperwald", "[\\x1]");
+ TestScanRegExp("/[\\u]/flipperwald", "[\\u]");
+ TestScanRegExp("/[\\u1]/flipperwald", "[\\u1]");
+ TestScanRegExp("/[\\u12]/flipperwald", "[\\u12]");
+ TestScanRegExp("/[\\u123]/flipperwald", "[\\u123]");
+ // Escaped ']'s wont end the character class.
+ TestScanRegExp("/[\\]/]/flipperwald", "[\\]/]");
+ // Escaped slashes are not terminating.
+ TestScanRegExp("/\\//flipperwald", "\\/");
+ // Starting with '=' works too.
+ TestScanRegExp("/=/", "=");
+ TestScanRegExp("/=?/", "=?");
+}
=======================================
--- /branches/bleeding_edge/test/cctest/test-regexp.cc Thu Dec 9 04:07:52
2010
+++ /branches/bleeding_edge/test/cctest/test-regexp.cc Fri Jan 7 04:35:42
2011
@@ -176,11 +176,23 @@
CHECK_PARSE_EQ("[\\d-z]", "[0-9 - z]");
CHECK_PARSE_EQ("[\\d-\\d]", "[0-9 - 0-9]");
CHECK_PARSE_EQ("[z-\\d]", "[z - 0-9]");
+ // Control character outside character class.
CHECK_PARSE_EQ("\\cj\\cJ\\ci\\cI\\ck\\cK",
"'\\x0a\\x0a\\x09\\x09\\x0b\\x0b'");
- CHECK_PARSE_EQ("\\c!", "'c!'");
- CHECK_PARSE_EQ("\\c_", "'c_'");
- CHECK_PARSE_EQ("\\c~", "'c~'");
+ CHECK_PARSE_EQ("\\c!", "'\\c!'");
+ CHECK_PARSE_EQ("\\c_", "'\\c_'");
+ CHECK_PARSE_EQ("\\c~", "'\\c~'");
+ CHECK_PARSE_EQ("\\c1", "'\\c1'");
+ // Control character inside character class.
+ CHECK_PARSE_EQ("[\\c!]", "[\\ c !]");
+ CHECK_PARSE_EQ("[\\c_]", "[\\x1f]");
+ CHECK_PARSE_EQ("[\\c~]", "[\\ c ~]");
+ CHECK_PARSE_EQ("[\\ca]", "[\\x01]");
+ CHECK_PARSE_EQ("[\\cz]", "[\\x1a]");
+ CHECK_PARSE_EQ("[\\cA]", "[\\x01]");
+ CHECK_PARSE_EQ("[\\cZ]", "[\\x1a]");
+ CHECK_PARSE_EQ("[\\c1]", "[\\x11]");
+
CHECK_PARSE_EQ("[a\\]c]", "[a ] c]");
CHECK_PARSE_EQ("\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ", "'[]{}()%^# '");
CHECK_PARSE_EQ("[\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ]", "[[ ] { } ( ) % ^ #
]");
@@ -234,7 +246,7 @@
CHECK_PARSE_EQ("\\x34", "'\x34'");
CHECK_PARSE_EQ("\\x60", "'\x60'");
CHECK_PARSE_EQ("\\x3z", "'x3z'");
- CHECK_PARSE_EQ("\\c", "'c'");
+ CHECK_PARSE_EQ("\\c", "'\\c'");
CHECK_PARSE_EQ("\\u0034", "'\x34'");
CHECK_PARSE_EQ("\\u003z", "'u003z'");
CHECK_PARSE_EQ("foo[z]*", "(: 'foo' (# 0 - g [z]))");
=======================================
--- /branches/bleeding_edge/test/mjsunit/regexp.js Mon Dec 13 00:33:32 2010
+++ /branches/bleeding_edge/test/mjsunit/regexp.js Fri Jan 7 04:35:42 2011
@@ -84,15 +84,14 @@
assertEquals(result[5], 'E');
assertEquals(result[6], 'F');
-// Some tests from the Mozilla tests, where our behavior differs from
+// Some tests from the Mozilla tests, where our behavior used to differ
from
// SpiderMonkey.
// From ecma_3/RegExp/regress-334158.js
assertTrue(/\ca/.test( "\x01" ));
assertFalse(/\ca/.test( "\\ca" ));
-// Passes in KJS, fails in IrregularExpressions.
-// See http://code.google.com/p/v8/issues/detail?id=152
-//assertTrue(/\c[a/]/.test( "\x1ba/]" ));
-
+assertFalse(/\ca/.test( "ca" ));
+assertTrue(/\c[a/]/.test( "\\ca" ));
+assertTrue(/\c[a/]/.test( "\\c/" ));
// Test \c in character class
re = /^[\cM]$/;
@@ -104,11 +103,29 @@
re = /^[\c]]$/;
assertTrue(re.test("c]"));
-assertFalse(re.test("\\]"));
+assertTrue(re.test("\\]"));
assertFalse(re.test("\x1d")); // ']' & 0x1f
-assertFalse(re.test("\\]"));
assertFalse(re.test("\x03]")); // I.e., read as \cc
+re = /^[\c1]$/; // Digit control characters are masked in character
classes.
+assertTrue(re.test("\x11"));
+assertFalse(re.test("\\"));
+assertFalse(re.test("c"));
+assertFalse(re.test("1"));
+
+re = /^[\c_]$/; // Underscore control character is masked in character
classes.
+assertTrue(re.test("\x1f"));
+assertFalse(re.test("\\"));
+assertFalse(re.test("c"));
+assertFalse(re.test("_"));
+
+re = /^[\c$]$/; // Other characters are interpreted literally.
+assertFalse(re.test("\x04"));
+assertTrue(re.test("\\"));
+assertTrue(re.test("c"));
+assertTrue(re.test("$"));
+
+assertTrue(/^[Z-\c-e]*$/.test("Z[\\cde"));
// Test that we handle \s and \S correctly inside some bizarre
// character classes.
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev