Author: [EMAIL PROTECTED]
Date: Tue Oct 28 04:18:25 2008
New Revision: 623
Modified:
branches/experimental/regexp2000/src/ast.cc
branches/experimental/regexp2000/src/ast.h
branches/experimental/regexp2000/src/jsregexp.cc
branches/experimental/regexp2000/src/parser.cc
branches/experimental/regexp2000/test/cctest/test-regexp.cc
Log:
Added parsing of integer escapes as backreferences.
Modified: branches/experimental/regexp2000/src/ast.cc
==============================================================================
--- branches/experimental/regexp2000/src/ast.cc (original)
+++ branches/experimental/regexp2000/src/ast.cc Tue Oct 28 04:18:25 2008
@@ -328,6 +328,13 @@
}
+// Writes a backreference node to the unparser's stream in the form
+// "(<- N)", where N is the node's index.  The data argument is not used
+// and the result is always NULL.
+void* RegExpUnparser::VisitBackreference(RegExpBackreference* that,
+                                         void* data) {
+  const int backreference_index = that->index();
+  stream()->Add("(<- %i)", backreference_index);
+  return NULL;
+}
+
+
void* RegExpUnparser::VisitEmpty(RegExpEmpty* that, void* data) {
stream()->Put('%');
return NULL;
Modified: branches/experimental/regexp2000/src/ast.h
==============================================================================
--- branches/experimental/regexp2000/src/ast.h (original)
+++ branches/experimental/regexp2000/src/ast.h Tue Oct 28 04:18:25 2008
@@ -1193,6 +1193,7 @@
VISIT(Quantifier) \
VISIT(Capture) \
VISIT(Lookahead) \
+ VISIT(Backreference) \
VISIT(Empty)
@@ -1354,6 +1355,16 @@
private:
RegExpTree* body_;
bool is_positive_;
+};
+
+
+class RegExpBackreference: public RegExpTree {  // Regexp tree node for a backreference.
+ public:
+ explicit RegExpBackreference(int index) : index_(index) { }  // Stores the given index.
+ virtual void* Accept(RegExpVisitor* visitor, void* data);  // Defined elsewhere; presumably dispatches to the visitor's VisitBackreference -- TODO confirm.
+ int index() { return index_; }  // Returns the index given at construction.
+ private:
+ int index_;  // Set once by the constructor.
};
Modified: branches/experimental/regexp2000/src/jsregexp.cc
==============================================================================
--- branches/experimental/regexp2000/src/jsregexp.cc (original)
+++ branches/experimental/regexp2000/src/jsregexp.cc Tue Oct 28 04:18:25
2008
@@ -795,6 +795,14 @@
template <typename Char>
+void* RegExpCompiler<Char>::VisitBackreference(RegExpBackreference* that,
+ void* rest) {
+ UNIMPLEMENTED();  // Placeholder: the compiler has no backreference support yet.
+ return NULL;  // Neither 'that' nor 'rest' is used by this placeholder.
+}
+
+
+template <typename Char>
void* RegExpCompiler<Char>::VisitEmpty(RegExpEmpty* that, void* rest) {
return rest;
}
Modified: branches/experimental/regexp2000/src/parser.cc
==============================================================================
--- branches/experimental/regexp2000/src/parser.cc (original)
+++ branches/experimental/regexp2000/src/parser.cc Tue Oct 28 04:18:25 2008
@@ -252,6 +252,12 @@
uc32 ParseControlEscape(bool* ok);
uc32 ParseOctalLiteral(bool* ok);
+ // Tries to parse the input as a backreference. If successful it
+ // stores the result in the output parameter and returns true. If
+ // it fails it will push back the characters read so the same characters
+ // can be reparsed.
+ bool ParseBackreferenceIndex(int* index_out);
+
CharacterRange ParseClassAtom(bool* ok);
RegExpTree* ReportError(Vector<const char> message, bool* ok);
void Advance();
@@ -270,6 +276,9 @@
int captures_seen_;
unibrow::CharacterStream* in_;
Handle<String>* error_;
+ static const int kMaxPushback = 5;
+ int pushback_count_;
+ uc32 pushback_buffer_[kMaxPushback];
};
@@ -3208,7 +3217,8 @@
//
----------------------------------------------------------------------------
-// Regular expressions.
+// Regular expressions
+
RegExpParser::RegExpParser(unibrow::CharacterStream* in, Handle<String>*
error)
: current_(kEndMarker),
@@ -3217,14 +3227,20 @@
has_next_(true),
captures_seen_(0),
in_(in),
- error_(error) {
+ error_(error),
+ pushback_count_(0) {
Advance(2);
}
+
void RegExpParser::Advance() {
current_ = next_;
has_more_ = has_next_;
- if (in()->has_more()) {
+ if (pushback_count_ > 0) {
+ pushback_count_--;
+ next_ = pushback_buffer_[pushback_count_];
+ has_next_ = true;
+ } else if (in()->has_more()) {
next_ = in()->GetNext();
} else {
next_ = kEndMarker;
@@ -3232,23 +3248,27 @@
}
}
+
void RegExpParser::Advance(int dist) {
for (int i = 0; i < dist; i++)
Advance();
}
+
RegExpTree* RegExpParser::ReportError(Vector<const char> message, bool*
ok) {
*ok = false;
*error_ = Factory::NewStringFromAscii(message, NOT_TENURED);
return NULL;
}
+
// Pattern ::
// Disjunction
RegExpTree* RegExpParser::ParsePattern(bool* ok) {
return ParseDisjunction(ok);
}
+
// Disjunction ::
// Alternative
// Alternative | Disjunction
@@ -3268,10 +3288,12 @@
}
}
+
static bool IsAlternativeTerminator(uc32 c) {
return c == '|' || c == ')' || c == RegExpParser::kEndMarker;
}
+
// Alternative ::
// [empty]
// Alternative Term
@@ -3332,6 +3354,52 @@
}
+// Tries to parse the escape at the current position as a decimal
+// backreference index.  On entry current() must be '\\', next() must be a
+// digit in the range '1'..'9' and the pushback buffer must be empty.  The
+// escape is only accepted as a backreference if its value does not exceed
+// the number of captures seen so far.
+//
+// On success the index is stored in *index_out, the digits have been
+// consumed and true is returned.  On failure false is returned,
+// *index_out is left untouched and the parser is positioned on the
+// backslash again so the same characters can be reparsed.
+bool RegExpParser::ParseBackreferenceIndex(int* index_out) {
+  ASSERT_EQ('\\', current());
+  ASSERT('1' <= next() && next() <= '9');
+  ASSERT_EQ(0, pushback_count_);
+  // Nothing has been consumed at this point so the two early exits below
+  // do not need to push anything back.
+  if (captures_seen_ == 0)
+    return false;
+  int value = next() - '0';
+  if (value > captures_seen_)
+    return false;
+  // When we give up, the first digit goes straight back into the
+  // lookahead and everything else is pushed back: the remaining digits
+  // read, the rejected digit and the lookahead character behind it.  That
+  // is at most kMaxChars + 1 characters, which fits in the pushback
+  // buffer.
+  static const int kMaxChars = kMaxPushback - 2;
+  EmbeddedVector<uc32, kMaxChars> chars_seen;
+  chars_seen[0] = next();
+  int char_count = 1;
+  Advance(2);
+  while (true) {
+    uc32 c = current();
+    if (!IsDecimalDigit(c)) {
+      // The first non-digit ends the index; it stays in current().
+      *index_out = value;
+      return true;
+    }
+    int next_value = 10 * value + (c - '0');
+    // Give up if the index would exceed the number of captures or if
+    // chars_seen is full.  The test has to be >= since chars_seen only has
+    // room for kMaxChars characters.
+    if (next_value > captures_seen_ || char_count >= kMaxChars) {
+      // Fill the pushback buffer in reverse reading order since Advance()
+      // pops from the end.  The character already in the lookahead goes in
+      // first, otherwise it would be dropped from the input.
+      int count = 0;
+      if (has_next_)
+        pushback_buffer_[count++] = next();
+      pushback_buffer_[count++] = c;
+      for (int i = char_count - 1; i > 0; i--)
+        pushback_buffer_[count++] = chars_seen[i];
+      ASSERT(count <= kMaxPushback);
+      pushback_count_ = count;
+      // Restore the state we were called in: the backslash as the current
+      // character and the first digit as the lookahead.
+      current_ = '\\';
+      has_more_ = true;
+      next_ = chars_seen[0];
+      has_next_ = true;
+      return false;
+    }
+    value = next_value;
+    chars_seen[char_count++] = c;
+    Advance();
+  }
+}
+
+
// Term ::
// Assertion
// Atom
@@ -3384,15 +3452,27 @@
atom = new
RegExpCharacterClass(CharacterRange::CharacterClass(c));
goto has_read_atom;
}
- // Todo backreferences
+ case '1': case '2': case '3': case '4': case '5': case '6':
+ case '7': case '8': case '9': {
+ int index = 0;
+ if (ParseBackreferenceIndex(&index)) {
+ atom = new RegExpBackreference(index);
+ goto has_read_atom;
+ } else {
+ // If this is not a backreference we go to the atom parser
+ // which will read it as an octal escape.
+ goto parse_atom;
+ }
+ }
default:
- break;
+ goto parse_atom;
}
}
// All other escapes fall through to the default case since
// they correspond to single characters that can be
// represented within atoms.
default: {
+ parse_atom:
atom = ParseAtom(CHECK_OK);
break;
}
@@ -3405,6 +3485,7 @@
// *
// +
// ?
+ // {
case '*':
min = 0;
max = RegExpQuantifier::kInfinity;
Modified: branches/experimental/regexp2000/test/cctest/test-regexp.cc
==============================================================================
--- branches/experimental/regexp2000/test/cctest/test-regexp.cc (original)
+++ branches/experimental/regexp2000/test/cctest/test-regexp.cc Tue Oct 28
04:18:25 2008
@@ -140,13 +140,33 @@
ExpectParse("\\11", "'\t'");
ExpectParse("\\11a", "'\ta'");
ExpectParse("\\011", "'\t'");
+ ExpectParse("\\00011", "'\t'");
ExpectParse("\\118", "'\t8'");
ExpectParse("\\111", "'I'");
ExpectParse("\\1111", "'I1'");
+ ExpectParse("(.)(.)(.)\\1", "(: (^ [&.]) (^ [&.]) (^ [&.]) (<- 1))");
+ ExpectParse("(.)(.)(.)\\2", "(: (^ [&.]) (^ [&.]) (^ [&.]) (<- 2))");
+ ExpectParse("(.)(.)(.)\\3", "(: (^ [&.]) (^ [&.]) (^ [&.]) (<- 3))");
+ ExpectParse("(.)(.)(.)\\4", "(: (^ [&.]) (^ [&.]) (^ [&.]) '\x04')");
+ ExpectParse("(.)(.)(.)\\1*", "(: (^ [&.]) (^ [&.]) (^ [&.])"
+ " (# 0 - g (<- 1)))");
+ ExpectParse("(.)(.)(.)\\2*", "(: (^ [&.]) (^ [&.]) (^ [&.])"
+ " (# 0 - g (<- 2)))");
+ ExpectParse("(.)(.)(.)\\3*", "(: (^ [&.]) (^ [&.]) (^ [&.])"
+ " (# 0 - g (<- 3)))");
+ ExpectParse("(.)(.)(.)\\4*", "(: (^ [&.]) (^ [&.]) (^ [&.])"
+ " (# 0 - g '\x04'))");
+ ExpectParse("(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)\\10",
+ "(: (^ [&.]) (^ [&.]) (^ [&.]) (^ [&.]) (^ [&.]) (^ [&.])"
+ " (^ [&.]) (^ [&.]) (^ [&.]) (^ [&.]) (<- 10))");
+ ExpectParse("(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)\\11",
+ "(: (^ [&.]) (^ [&.]) (^ [&.]) (^ [&.]) (^ [&.]) (^ [&.])"
+ " (^ [&.]) (^ [&.]) (^ [&.]) (^ [&.]) '\x09')");
ExpectParse("[\\0]", "[\0]");
ExpectParse("[\\11]", "[\t]");
ExpectParse("[\\11a]", "[\t a]");
ExpectParse("[\\011]", "[\t]");
+ ExpectParse("[\\00011]", "[\t]");
ExpectParse("[\\118]", "[\t 8]");
ExpectParse("[\\111]", "[I]");
ExpectParse("[\\1111]", "[I 1]");
--~--~---------~--~----~------------~-------~--~----~
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
-~----------~----~----~----~------~----~------~--~---