Author: [EMAIL PROTECTED]
Date: Wed Nov 19 22:29:34 2008
New Revision: 796
Modified:
branches/experimental/regexp2000/src/jsregexp.cc
branches/experimental/regexp2000/src/parser.cc
branches/experimental/regexp2000/test/cctest/test-regexp.cc
Log:
Fixed issues 156, 157, 158 and disabled error reporting on malformed
quantifier prefixes ({...}).
Modified: branches/experimental/regexp2000/src/jsregexp.cc
==============================================================================
--- branches/experimental/regexp2000/src/jsregexp.cc (original)
+++ branches/experimental/regexp2000/src/jsregexp.cc Wed Nov 19 22:29:34 2008
@@ -1746,6 +1746,12 @@
};
+static const int kLineTerminatorRangeCount = 6;
+static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = {
+ 0x000A, 0x000A, 0x000D, 0x000D, 0x2028, 0x2029
+};
+
+
static void AddClass(const uc16* elmv,
int elmc,
ZoneList<CharacterRange>* ranges) {
@@ -1794,6 +1800,14 @@
AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
break;
case '.':
+ AddClassNegated(kLineTerminatorRanges,
+ kLineTerminatorRangeCount,
+ ranges);
+ break;
+ // This is not a character range as defined by the spec but a
+ // convenient shorthand for a character class that matches any
+ // character.
+ case '*':
ranges->Add(CharacterRange::Everything());
break;
default:
@@ -2211,7 +2225,7 @@
RegExpNode* node = RegExpQuantifier::ToNode(0,
RegExpQuantifier::kInfinity,
false,
- new RegExpCharacterClass('.'),
+ new RegExpCharacterClass('*'),
&compiler,
captured_body,
compiler.backtrack());
Modified: branches/experimental/regexp2000/src/parser.cc
==============================================================================
--- branches/experimental/regexp2000/src/parser.cc (original)
+++ branches/experimental/regexp2000/src/parser.cc Wed Nov 19 22:29:34 2008
@@ -320,7 +320,7 @@
private:
void FlushCharacters();
void FlushText();
- bool FlushTerms();
+ void FlushTerms();
bool pending_empty_;
ZoneList<uc16>* characters_;
BufferedZoneList<RegExpTree, 2> terms_;
@@ -410,20 +410,17 @@
void RegExpBuilder::NewAlternative() {
- if (!FlushTerms()) {
- alternatives_.Add(RegExpEmpty::GetInstance());
- }
+ FlushTerms();
}
-bool RegExpBuilder::FlushTerms() {
+void RegExpBuilder::FlushTerms() {
FlushText();
int num_terms = terms_.length();
- if (num_terms == 0) {
- return false;
- }
RegExpTree* alternative;
- if (num_terms == 1) {
+ if (num_terms == 0) {
+ alternative = RegExpEmpty::GetInstance();
+ } else if (num_terms == 1) {
alternative = terms_.last();
} else {
alternative = new RegExpAlternative(terms_.GetList());
@@ -431,7 +428,6 @@
alternatives_.Add(alternative);
terms_.Clear();
LAST(ADD_NONE);
- return true;
}
@@ -506,7 +502,7 @@
// Parses a {...,...} quantifier and stores the range in the given
// out parameters.
- void* ParseIntervalQuantifier(int* min_out, int* max_out, bool* ok);
+ bool ParseIntervalQuantifier(int* min_out, int* max_out);
// Parses and returns a single escaped character. The character
// must not be 'b' or 'B' since they are usually handled specially.
@@ -538,7 +534,7 @@
int captures_started() { return captures_ == NULL ? 0 :
captures_->length(); }
int position() { return next_pos_ - 1; }
- static const uc32 kEndMarker = unibrow::Utf8::kBadChar;
+ static const uc32 kEndMarker = (1 << 21);
private:
uc32 current() { return current_; }
bool has_more() { return has_more_; }
@@ -3602,8 +3598,7 @@
case '*':
case '+':
case '?':
- case '{':
- ReportError(CStrVector("Nothing to repeat."), CHECK_OK);
+ ReportError(CStrVector("Nothing to repeat"), CHECK_OK);
case '^': {
Advance();
RegExpAssertion::Type type =
@@ -3754,13 +3749,20 @@
}
has_character_escapes_ = true;
break;
+ case '{': {
+ int dummy;
+ if (ParseIntervalQuantifier(&dummy, &dummy)) {
+ ReportError(CStrVector("Nothing to repeat"), CHECK_OK);
+ }
+ // fallthrough
+ }
default:
builder.AddCharacter(current());
Advance();
break;
} // end switch(current())
- has_read_atom:
+ has_read_atom:
int min;
int max;
switch (current()) {
@@ -3785,8 +3787,11 @@
Advance();
break;
case '{':
- ParseIntervalQuantifier(&min, &max, CHECK_OK);
- break;
+ if (ParseIntervalQuantifier(&min, &max)) {
+ break;
+ } else {
+ continue;
+ }
default:
continue;
}
@@ -3876,19 +3881,14 @@
// { DecimalDigits }
// { DecimalDigits , }
// { DecimalDigits , DecimalDigits }
-void* RegExpParser::ParseIntervalQuantifier(int* min_out,
- int* max_out,
- bool* ok) {
+bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
ASSERT_EQ(current(), '{');
- static const char* kInvalidQuantifier = "Invalid quantifier";
+ int start = position();
Advance();
int min = 0;
if (!IsDecimalDigit(current())) {
- // JSC allows {} and {,} as quantifiers (and { and } and all
- // sorts of crazy stuff) but my puny human brain has been unable
- // to figure out what they mean exactly, if anything. For now
- // we follow the spec and report a syntax error.
- ReportError(CStrVector(kInvalidQuantifier), CHECK_OK);
+ Reset(start);
+ return false;
}
while (IsDecimalDigit(current())) {
min = 10 * min + (current() - '0');
@@ -3909,16 +3909,18 @@
Advance();
}
if (current() != '}') {
- ReportError(CStrVector(kInvalidQuantifier), CHECK_OK);
+ Reset(start);
+ return false;
}
Advance();
}
} else {
- ReportError(CStrVector(kInvalidQuantifier), CHECK_OK);
+ Reset(start);
+ return false;
}
*min_out = min;
*max_out = max;
- return NULL;
+ return true;
}
@@ -4152,7 +4154,11 @@
if (!is_char_class) {
if (current() == '-') {
Advance();
- if (current() == ']') {
+ if (current() == kEndMarker) {
+ // If we reach the end we break out of the loop and let the
+ // following code report an error.
+ break;
+ } else if (current() == ']') {
ranges->Add(first);
ranges->Add(CharacterRange::Singleton('-'));
break;
Modified: branches/experimental/regexp2000/test/cctest/test-regexp.cc
==============================================================================
--- branches/experimental/regexp2000/test/cctest/test-regexp.cc (original)
+++ branches/experimental/regexp2000/test/cctest/test-regexp.cc Wed Nov 19 22:29:34 2008
@@ -219,10 +219,30 @@
CHECK_ESCAPES("\\1112", true);
CHECK_ESCAPES("\\0", true);
CHECK_ESCAPES("(a)\\1", false);
+
+ CHECK_PARSE_EQ("a{}", "'a{}'");
+ CHECK_PARSE_EQ("a{,}", "'a{,}'");
+ CHECK_PARSE_EQ("a{", "'a{'");
+ CHECK_PARSE_EQ("a{z}", "'a{z}'");
+ CHECK_PARSE_EQ("a{1z}", "'a{1z}'");
+ CHECK_PARSE_EQ("a{12z}", "'a{12z}'");
+ CHECK_PARSE_EQ("a{12,", "'a{12,'");
+ CHECK_PARSE_EQ("a{12,3b", "'a{12,3b'");
+ CHECK_PARSE_EQ("{}", "'{}'");
+ CHECK_PARSE_EQ("{,}", "'{,}'");
+ CHECK_PARSE_EQ("{", "'{'");
+ CHECK_PARSE_EQ("{z}", "'{z}'");
+ CHECK_PARSE_EQ("{1z}", "'{1z}'");
+ CHECK_PARSE_EQ("{12z}", "'{12z}'");
+ CHECK_PARSE_EQ("{12,", "'{12,'");
+ CHECK_PARSE_EQ("{12,3b", "'{12,3b'");
}
TEST(ParserRegression) {
CHECK_PARSE_EQ("[A-Z$-][x]", "(! [A-Z $ -] [x])");
+ CHECK_PARSE_EQ("a{3,4*}", "(: 'a{3,' (# 0 - g '4') '}')");
+ CHECK_PARSE_EQ("{", "'{'");
+ CHECK_PARSE_EQ("a|", "(| 'a' %)");
}
static void ExpectError(const char* input,
@@ -243,15 +263,6 @@
V8::Initialize(NULL);
const char* kEndBackslash = "\\ at end of pattern";
ExpectError("\\", kEndBackslash);
- const char* kInvalidQuantifier = "Invalid quantifier";
- ExpectError("a{}", kInvalidQuantifier);
- ExpectError("a{,}", kInvalidQuantifier);
- ExpectError("a{", kInvalidQuantifier);
- ExpectError("a{z}", kInvalidQuantifier);
- ExpectError("a{1z}", kInvalidQuantifier);
- ExpectError("a{12z}", kInvalidQuantifier);
- ExpectError("a{12,", kInvalidQuantifier);
- ExpectError("a{12,3b", kInvalidQuantifier);
const char* kUnterminatedGroup = "Unterminated group";
ExpectError("(foo", kUnterminatedGroup);
const char* kInvalidGroup = "Invalid group";
@@ -263,6 +274,13 @@
ExpectError("[a-\\w]", kIllegalCharacterClass);
const char* kEndControl = "\\c at end of pattern";
ExpectError("\\c", kEndControl);
+ static char* kNothingToRepeat = "Nothing to repeat";
+ ExpectError("*", kNothingToRepeat);
+ ExpectError("?", kNothingToRepeat);
+ ExpectError("+", kNothingToRepeat);
+ ExpectError("{1}", kNothingToRepeat);
+ ExpectError("{1,2}", kNothingToRepeat);
+ ExpectError("{1,}", kNothingToRepeat);
}
@@ -313,7 +331,13 @@
static bool Dot(uc16 c) {
- return true;
+ switch (c) {
+ // LF CR LS PS
+ case 0x000A: case 0x000D: case 0x2028: case 0x2029:
+ return false;
+ default:
+ return true;
+ }
}
--~--~---------~--~----~------------~-------~--~----~
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
-~----------~----~----~----~------~----~------~--~---