Revision: 6029
Author: [email protected]
Date: Wed Dec 15 04:43:21 2010
Log: Merge 5953 and 5974 to trunk to fix RegExp parsing incompatability.
Review URL: http://codereview.chromium.org/5850002
http://code.google.com/p/v8/source/detail?r=6029
Modified:
/trunk/src/parser.cc
/trunk/src/version.cc
/trunk/test/cctest/test-regexp.cc
/trunk/test/mjsunit/regexp.js
/trunk/test/mjsunit/third_party/regexp-pcre.js
/trunk/test/mozilla/mozilla.status
=======================================
--- /trunk/src/parser.cc Wed Dec 15 04:05:09 2010
+++ /trunk/src/parser.cc Wed Dec 15 04:43:21 2010
@@ -4407,12 +4407,27 @@
return CharacterRange::Singleton(first);
}
}
+
+
+static const uc16 kNoCharClass = 0;
+
+// Adds range or pre-defined character class to character ranges.
+// If char_class is not kInvalidClass, it's interpreted as a class
+// escape (i.e., 's' means whitespace, from '\s').
+static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
+ uc16 char_class,
+ CharacterRange range) {
+ if (char_class != kNoCharClass) {
+ CharacterRange::AddClassEscape(char_class, ranges);
+ } else {
+ ranges->Add(range);
+ }
+}
RegExpTree* RegExpParser::ParseCharacterClass() {
static const char* kUnterminated = "Unterminated character class";
static const char* kRangeOutOfOrder = "Range out of order in character
class";
- static const char* kInvalidRange = "Invalid character range";
ASSERT_EQ(current(), '[');
Advance();
@@ -4421,30 +4436,10 @@
is_negated = true;
Advance();
}
- // A CharacterClass is a sequence of single characters, character class
- // escapes or ranges. Ranges are on the form "x-y" where x and y are
- // single characters (and not character class escapes like \s).
- // A "-" may occur at the start or end of the character class (just after
- // "[" or "[^", or just before "]") without being considered part of a
- // range. A "-" may also appear as the beginning or end of a range.
- // I.e., [--+] is valid, so is [!--].
-
ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);
while (has_more() && current() != ']') {
- uc16 char_class = 0;
+ uc16 char_class = kNoCharClass;
CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
- if (char_class) {
- CharacterRange::AddClassEscape(char_class, ranges);
- if (current() == '-') {
- Advance();
- ranges->Add(CharacterRange::Singleton('-'));
- if (current() != ']') {
- ReportError(CStrVector(kInvalidRange) CHECK_FAILED);
- }
- break;
- }
- continue;
- }
if (current() == '-') {
Advance();
if (current() == kEndMarker) {
@@ -4452,20 +4447,25 @@
// following code report an error.
break;
} else if (current() == ']') {
- ranges->Add(first);
+ AddRangeOrEscape(ranges, char_class, first);
ranges->Add(CharacterRange::Singleton('-'));
break;
}
- CharacterRange next = ParseClassAtom(&char_class CHECK_FAILED);
- if (char_class) {
- ReportError(CStrVector(kInvalidRange) CHECK_FAILED);
+ uc16 char_class_2 = kNoCharClass;
+ CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
+ if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
+ // Either end is an escaped character class. Treat the '-'
verbatim.
+ AddRangeOrEscape(ranges, char_class, first);
+ ranges->Add(CharacterRange::Singleton('-'));
+ AddRangeOrEscape(ranges, char_class_2, next);
+ continue;
}
if (first.from() > next.to()) {
return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED);
}
ranges->Add(CharacterRange::Range(first.from(), next.to()));
} else {
- ranges->Add(first);
+ AddRangeOrEscape(ranges, char_class, first);
}
}
if (!has_more()) {
=======================================
--- /trunk/src/version.cc Wed Dec 15 04:05:09 2010
+++ /trunk/src/version.cc Wed Dec 15 04:43:21 2010
@@ -35,7 +35,7 @@
#define MAJOR_VERSION 3
#define MINOR_VERSION 0
#define BUILD_NUMBER 2
-#define PATCH_LEVEL 0
+#define PATCH_LEVEL 1
#define CANDIDATE_VERSION false
// Define SONAME to have the SCons build the put a specific SONAME into the
=======================================
--- /trunk/test/cctest/test-regexp.cc Wed Dec 15 04:05:09 2010
+++ /trunk/test/cctest/test-regexp.cc Wed Dec 15 04:43:21 2010
@@ -173,6 +173,9 @@
CHECK_PARSE_EQ("[a-b-c]", "[a-b - c]");
CHECK_PARSE_EQ("[\\d]", "[0-9]");
CHECK_PARSE_EQ("[x\\dz]", "[x 0-9 z]");
+ CHECK_PARSE_EQ("[\\d-z]", "[0-9 - z]");
+ CHECK_PARSE_EQ("[\\d-\\d]", "[0-9 - 0-9]");
+ CHECK_PARSE_EQ("[z-\\d]", "[z - 0-9]");
CHECK_PARSE_EQ("\\cj\\cJ\\ci\\cI\\ck\\cK",
"'\\x0a\\x0a\\x09\\x09\\x0b\\x0b'");
CHECK_PARSE_EQ("\\c!", "'c!'");
=======================================
--- /trunk/test/mjsunit/regexp.js Wed Dec 15 04:05:09 2010
+++ /trunk/test/mjsunit/regexp.js Wed Dec 15 04:43:21 2010
@@ -110,6 +110,44 @@
assertFalse(re.test("\x03]")); // I.e., read as \cc
+// Test that we handle \s and \S correctly inside some bizarre
+// character classes.
+re = /[\s-:]/;
+assertTrue(re.test('-'));
+assertTrue(re.test(':'));
+assertTrue(re.test(' '));
+assertTrue(re.test('\t'));
+assertTrue(re.test('\n'));
+assertFalse(re.test('a'));
+assertFalse(re.test('Z'));
+
+re = /[\S-:]/;
+assertTrue(re.test('-'));
+assertTrue(re.test(':'));
+assertFalse(re.test(' '));
+assertFalse(re.test('\t'));
+assertFalse(re.test('\n'));
+assertTrue(re.test('a'));
+assertTrue(re.test('Z'));
+
+re = /[^\s-:]/;
+assertFalse(re.test('-'));
+assertFalse(re.test(':'));
+assertFalse(re.test(' '));
+assertFalse(re.test('\t'));
+assertFalse(re.test('\n'));
+assertTrue(re.test('a'));
+assertTrue(re.test('Z'));
+
+re = /[^\S-:]/;
+assertFalse(re.test('-'));
+assertFalse(re.test(':'));
+assertTrue(re.test(' '));
+assertTrue(re.test('\t'));
+assertTrue(re.test('\n'));
+assertFalse(re.test('a'));
+assertFalse(re.test('Z'));
+
re = /[\s]/;
assertFalse(re.test('-'));
assertFalse(re.test(':'));
@@ -164,6 +202,17 @@
assertFalse(re.test('a'));
assertFalse(re.test('Z'));
+// First - is treated as range operator, second as literal minus.
+// This follows the specification in parsing, but doesn't throw on
+// the \s at the beginning of the range.
+re = /[\s-0-9]/;
+assertTrue(re.test(' '));
+assertTrue(re.test('\xA0'));
+assertTrue(re.test('-'));
+assertTrue(re.test('0'));
+assertTrue(re.test('9'));
+assertFalse(re.test('1'));
+
// Test beginning and end of line assertions with or without the
// multiline flag.
re = /^\d+/;
@@ -610,46 +659,3 @@
assertFalse(re.test("c"));
assertFalse(re.test(""));
-
-function testInvalidRange(str) {
- try {
- RegExp(str).test("x");
- } catch (e) {
- return;
- }
- assetUnreachable("Allowed invalid range in " + str);
-}
-
-function testValidRange(str) {
- try {
- RegExp(str).test("x");
- } catch (e) {
- assertUnreachable("Shouldn't fail parsing: " + str + ", was: " + e);
- }
-}
-
-testInvalidRange("[\\d-z]");
-testInvalidRange("[z-\\d]");
-testInvalidRange("[\\d-\\d]");
-testInvalidRange("[z-x]"); // Larger value first.
-testInvalidRange("[x-\\d-\\d]");
-
-testValidRange("[x-z]");
-testValidRange("[!--\d]"); // Second "-" is end of range.
-testValidRange("[\d-]");
-testValidRange("[-\d]");
-testValidRange("[-\d-]");
-testValidRange("[^-\d-]");
-testValidRange("[^-\d-]");
-testValidRange("[0-9-\w]");
-
-// Escaped dashes do not count as range operators.
-testValidRange("[\\d\\-z]");
-testValidRange("[z\\-\\d]");
-testValidRange("[\\d\\-\\d]");
-testValidRange("[z\\-x]");
-testValidRange("[x\\-\\d\\-\\d]");
-
-
-
-
=======================================
--- /trunk/test/mjsunit/third_party/regexp-pcre.js Wed Dec 15 04:05:09 2010
+++ /trunk/test/mjsunit/third_party/regexp-pcre.js Wed Dec 15 04:43:21 2010
@@ -962,7 +962,7 @@
res[883] = /[a\-z]+/;
res[884] = /[a-z]+/;
res[885] = /[\d-]+/;
-// res[886] - Disabled after making [\d-z] invalid to be compatible with
JSC.
+res[886] = /[\d-z]+/;
res[887] = /\x5c/;
res[888] = /\x20Z/;
res[889] = /ab{3cd/;
@@ -1346,7 +1346,7 @@
res[1267] = /(Z()|A)*/;
res[1268] = /(Z(())|A)*/;
res[1269] = /a*/g;
-// res[1270] disabled after making /^[\d-a]/ invalid to be compatible with
JSC.
+res[1270] = /^[\d-a]/;
res[1271] = /[[:space:]]+/;
res[1272] = /[[:blank:]]+/;
res[1273] = /[\s]+/;
@@ -2530,7 +2530,7 @@
assertEquals(null, res[431].exec("a\x85b", 883));
assertThrows("var re = /(?-+a)/;", 884);
assertEquals(null, res[443].exec("aaaa", 885));
-// assertEquals(null, res[443].exec("bacxxx", 886));
+assertEquals(null, res[443].exec("bacxxx", 886));
assertEquals(null, res[443].exec("bbaccxxx ", 887));
assertEquals(null, res[443].exec("bbbacccxx", 888));
assertEquals(null, res[443].exec("aaaa", 889));
@@ -4391,10 +4391,9 @@
assertEquals("12-34", res[885].exec("12-34"), 2744);
assertEquals(null, res[885].exec("*** Failers", 2745));
assertEquals(null, res[885].exec("aaa", 2746));
-// Disabled. To be compatible with JSC, the regexp is no longer valid.
-// assertEquals("12-34z", res[886].exec("12-34z"), 2747);
-// assertEquals(null, res[886].exec("*** Failers", 2748));
-// assertEquals(null, res[886].exec("aaa", 2749));
+assertEquals("12-34z", res[886].exec("12-34z"), 2747);
+assertEquals(null, res[886].exec("*** Failers", 2748));
+assertEquals(null, res[886].exec("aaa", 2749));
assertEquals("\\", res[887].exec("\\\\"), 2750);
assertEquals(" Z", res[888].exec("the Zoo"), 2751);
assertEquals(null, res[888].exec("*** Failers", 2752));
@@ -5356,12 +5355,11 @@
assertEquals("", res[1269].exec("0digit"), 3708);
assertEquals("", res[1269].exec("*** Failers"), 3709);
assertEquals("", res[1269].exec("bcdef "), 3710);
-// Disabled. To be compatible with JSC, the RegExp is no longer valid.
-// assertEquals("a", res[1270].exec("abcde"), 3711);
-// assertEquals("-", res[1270].exec("-things"), 3712);
-// assertEquals("0", res[1270].exec("0digit"), 3713);
-// assertEquals(null, res[1270].exec("*** Failers", 3714));
-// assertEquals(null, res[1270].exec("bcdef ", 3715));
+assertEquals("a", res[1270].exec("abcde"), 3711);
+assertEquals("-", res[1270].exec("-things"), 3712);
+assertEquals("0", res[1270].exec("0digit"), 3713);
+assertEquals(null, res[1270].exec("*** Failers", 3714));
+assertEquals(null, res[1270].exec("bcdef ", 3715));
assertEquals(null, res[1271].exec("> \x09\n\x0c\x0d\x0b<", 3716));
assertEquals(null, res[1271].exec(" ", 3717));
assertEquals(null, res[1272].exec("> \x09\n\x0c\x0d\x0b<", 3718));
=======================================
--- /trunk/test/mozilla/mozilla.status Wed Dec 15 04:05:09 2010
+++ /trunk/test/mozilla/mozilla.status Wed Dec 15 04:43:21 2010
@@ -289,10 +289,6 @@
js1_2/regexp/beginLine: FAIL_OK
js1_2/regexp/endLine: FAIL_OK
-# To be compatible with JSC, we no longer accept [\d-x], [x-\d] or
-# [\d-\d] as valid ranges.
-ecma_3/RegExp/regress-375715-02: FAIL
-js1_5/extensions/regress-351463-01: FAIL
# To be compatible with safari typeof a regexp yields 'function';
# in firefox it yields 'object'.
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev