Author: [EMAIL PROTECTED]
Date: Fri Nov 21 05:08:23 2008
New Revision: 820
Modified:
branches/experimental/regexp2000/src/assembler-re2k.cc
branches/experimental/regexp2000/src/assembler-re2k.h
branches/experimental/regexp2000/src/ast.cc
branches/experimental/regexp2000/src/ast.h
branches/experimental/regexp2000/src/bytecodes-re2k.h
branches/experimental/regexp2000/src/interpreter-re2k.cc
branches/experimental/regexp2000/src/jsregexp.cc
branches/experimental/regexp2000/src/jsregexp.h
branches/experimental/regexp2000/src/parser.cc
branches/experimental/regexp2000/src/regexp-macro-assembler-re2k.cc
branches/experimental/regexp2000/src/regexp-macro-assembler-re2k.h
branches/experimental/regexp2000/src/regexp-macro-assembler.h
branches/experimental/regexp2000/test/cctest/test-regexp.cc
Log:
Add back references. Since "back references" is two words
the CamelCasing was fixed in various places.
Review URL: http://codereview.chromium.org/11350
Modified: branches/experimental/regexp2000/src/assembler-re2k.cc
==============================================================================
--- branches/experimental/regexp2000/src/assembler-re2k.cc (original)
+++ branches/experimental/regexp2000/src/assembler-re2k.cc Fri Nov 21
05:08:23 2008
@@ -211,10 +211,9 @@
}
-void Re2kAssembler::CheckBackref(int capture_index,
- Label* on_mismatch) {
- Emit(BC_CHECK_BACKREF);
- Emit32(0);
+void Re2kAssembler::CheckNotBackReference(int capture_index,
+ Label* on_mismatch) {
+ Emit(BC_CHECK_NOT_BACK_REF);
Emit(capture_index);
EmitOrLink(on_mismatch);
}
Modified: branches/experimental/regexp2000/src/assembler-re2k.h
==============================================================================
--- branches/experimental/regexp2000/src/assembler-re2k.h (original)
+++ branches/experimental/regexp2000/src/assembler-re2k.h Fri Nov 21
05:08:23 2008
@@ -71,8 +71,8 @@
// previous capture. Advances current position by the length of the
capture
// iff it matches. The capture is stored in a given register and the
// the register after. If a register contains -1 then the other register
- // mush always contain -1 and the on_mismatch label will never be called.
- void CheckBackref(int capture_index, Label* on_mismatch);
+ // must always contain -1 and the on_mismatch label will never be called.
+ void CheckNotBackReference(int capture_index, Label* on_mismatch);
// Checks a register for strictly-less-than or greater-than-or-equal.
void CheckRegisterLT(int reg_index, uint16_t vs, Label* on_less_than);
Modified: branches/experimental/regexp2000/src/ast.cc
==============================================================================
--- branches/experimental/regexp2000/src/ast.cc (original)
+++ branches/experimental/regexp2000/src/ast.cc Fri Nov 21 05:08:23 2008
@@ -360,7 +360,7 @@
}
-void* RegExpUnparser::VisitBackreference(RegExpBackreference* that,
+void* RegExpUnparser::VisitBackReference(RegExpBackReference* that,
void* data) {
stream()->Add("(<- %i)", that->index());
return NULL;
Modified: branches/experimental/regexp2000/src/ast.h
==============================================================================
--- branches/experimental/regexp2000/src/ast.h (original)
+++ branches/experimental/regexp2000/src/ast.h Fri Nov 21 05:08:23 2008
@@ -1412,16 +1412,16 @@
};
-class RegExpBackreference: public RegExpTree {
+class RegExpBackReference: public RegExpTree {
public:
- explicit RegExpBackreference(RegExpCapture* capture)
+ explicit RegExpBackReference(RegExpCapture* capture)
: capture_(capture) { }
virtual void* Accept(RegExpVisitor* visitor, void* data);
virtual RegExpNode* ToNode(RegExpCompiler* compiler,
RegExpNode* on_success,
RegExpNode* on_failure);
- virtual RegExpBackreference* AsBackreference();
- virtual bool IsBackreference();
+ virtual RegExpBackReference* AsBackReference();
+ virtual bool IsBackReference();
int index() { return capture_->index(); }
RegExpCapture* capture() { return capture_; }
private:
Modified: branches/experimental/regexp2000/src/bytecodes-re2k.h
==============================================================================
--- branches/experimental/regexp2000/src/bytecodes-re2k.h (original)
+++ branches/experimental/regexp2000/src/bytecodes-re2k.h Fri Nov 21
05:08:23 2008
@@ -54,14 +54,13 @@
V(CHECK_NOT_CHAR, 19, 7) /* check_not_char uc16
addr32 */ \
V(CHECK_LT, 20, 7) /* check_lt uc16
addr32 */ \
V(CHECK_GT, 21, 7) /* check_gr uc16
addr32 */ \
-V(CHECK_BACKREF, 22, 9) /* check_backref offset32 capture_idx
addr32 */ \
-V(CHECK_NOT_BACKREF, 23, 9) /* check_not_backref offset32 capture_idx
addr32*/ \
-V(LOOKUP_MAP1, 24, 11) /* l_map1 start16 bit_map_addr32
addr32 */ \
-V(LOOKUP_MAP2, 25, 99) /* l_map2 start16
half_nibble_map_addr32* */ \
-V(LOOKUP_MAP8, 26, 99) /* l_map8 start16 byte_map
addr32* */ \
-V(LOOKUP_HI_MAP8, 27, 99) /* l_himap8 start8 byte_map_addr32
addr32* */ \
-V(CHECK_REGISTER_LT, 28, 8) /* check_reg_lt register_index value16
addr32 */ \
-V(CHECK_REGISTER_GE, 29, 8) /* check_reg_ge register_index value16
addr32 */ \
+V(CHECK_NOT_BACK_REF, 22, 6) /* check_not_back_ref capture_idx
addr32 */ \
+V(LOOKUP_MAP1, 23, 11) /* l_map1 start16 bit_map_addr32
addr32 */ \
+V(LOOKUP_MAP2, 24, 99) /* l_map2 start16
half_nibble_map_addr32* */ \
+V(LOOKUP_MAP8, 25, 99) /* l_map8 start16 byte_map
addr32* */ \
+V(LOOKUP_HI_MAP8, 26, 99) /* l_himap8 start8 byte_map_addr32
addr32* */ \
+V(CHECK_REGISTER_LT, 27, 8) /* check_reg_lt register_index value16
addr32 */ \
+V(CHECK_REGISTER_GE, 28, 8) /* check_reg_ge register_index value16
addr32 */ \
#define DECLARE_BYTECODES(name, code, length) \
static const int BC_##name = code;
Modified: branches/experimental/regexp2000/src/interpreter-re2k.cc
==============================================================================
--- branches/experimental/regexp2000/src/interpreter-re2k.cc (original)
+++ branches/experimental/regexp2000/src/interpreter-re2k.cc Fri Nov 21
05:08:23 2008
@@ -280,12 +280,26 @@
pc = code_base + Load32(new_pc);
break;
}
- BYTECODE(CHECK_BACKREF)
- UNREACHABLE();
- break;
- BYTECODE(CHECK_NOT_BACKREF)
- UNREACHABLE();
+ BYTECODE(CHECK_NOT_BACK_REF) {
+ int from = registers[pc[1]];
+ int len = registers[pc[1] + 1] - from;
+ if (current + len > subject.length()) {
+ pc = code_base + Load32(pc + 2);
+ break;
+ } else {
+ int i;
+ for (i = 0; i < len; i++) {
+ if (subject[from + i] != subject[current + i]) {
+ pc = code_base + Load32(pc + 2);
+ break;
+ }
+ }
+ if (i < len) break;
+ current += len;
+ }
+ pc += BC_CHECK_NOT_BACK_REF_LENGTH;
break;
+ }
default:
UNREACHABLE();
break;
Modified: branches/experimental/regexp2000/src/jsregexp.cc
==============================================================================
--- branches/experimental/regexp2000/src/jsregexp.cc (original)
+++ branches/experimental/regexp2000/src/jsregexp.cc Fri Nov 21 05:08:23
2008
@@ -1282,6 +1282,19 @@
}
+bool BackReferenceNode::Emit(RegExpCompiler* compiler) {
+ RegExpMacroAssembler* macro = compiler->macro_assembler();
+ Bind(macro);
+ // Check whether the registers are uninitialized and always
+ // succeed if they are.
+ macro->IfRegisterLT(start_reg_, 0, on_success()->label());
+ macro->IfRegisterLT(end_reg_, 0, on_success()->label());
+ ASSERT_EQ(start_reg_ + 1, end_reg_);
+ macro->CheckNotBackReference(start_reg_, on_failure_->label());
+ return on_success()->GoTo(compiler);
+}
+
+
// -------------------------------------------------------------------
// Dot/dotty output
@@ -1445,7 +1458,7 @@
}
-void DotPrinter::VisitBackreference(BackreferenceNode* that) {
+void DotPrinter::VisitBackReference(BackReferenceNode* that) {
stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
that,
that->start_register(),
@@ -1678,10 +1691,10 @@
}
-RegExpNode* RegExpBackreference::ToNode(RegExpCompiler* compiler,
+RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success,
RegExpNode* on_failure) {
- return new BackreferenceNode(RegExpCapture::StartRegister(index()),
+ return new BackReferenceNode(RegExpCapture::StartRegister(index()),
RegExpCapture::EndRegister(index()),
on_success,
on_failure);
@@ -2020,7 +2033,7 @@
}
-RegExpNode* BackreferenceNode::PropagateInterest(NodeInfo* info) {
+RegExpNode* BackReferenceNode::PropagateInterest(NodeInfo* info) {
return PropagateToEndpoint(this, info);
}
@@ -2230,7 +2243,7 @@
}
-void Analysis::VisitBackreference(BackreferenceNode* that) {
+void Analysis::VisitBackReference(BackReferenceNode* that) {
EnsureAnalyzed(that->on_success());
EnsureAnalyzed(that->on_failure());
}
@@ -2287,8 +2300,10 @@
}
-void DispatchTableConstructor::VisitBackreference(BackreferenceNode* that)
{
- // TODO(plesner): What should this do?
+void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that)
{
+ // TODO(160): Find the node that we refer back to and propagate its start
+ // set back to here. For now we just accept anything.
+ AddRange(CharacterRange::Everything());
}
Modified: branches/experimental/regexp2000/src/jsregexp.h
==============================================================================
--- branches/experimental/regexp2000/src/jsregexp.h (original)
+++ branches/experimental/regexp2000/src/jsregexp.h Fri Nov 21 05:08:23 2008
@@ -402,7 +402,7 @@
VISIT(End) \
VISIT(Action) \
VISIT(Choice) \
- VISIT(Backreference) \
+ VISIT(BackReference) \
VISIT(Text)
@@ -415,7 +415,7 @@
VISIT(Quantifier) \
VISIT(Capture) \
VISIT(Lookahead) \
- VISIT(Backreference) \
+ VISIT(BackReference) \
VISIT(Empty) \
VISIT(Text)
@@ -606,9 +606,9 @@
};
-class BackreferenceNode: public SeqRegExpNode {
+class BackReferenceNode: public SeqRegExpNode {
public:
- BackreferenceNode(int start_reg,
+ BackReferenceNode(int start_reg,
int end_reg,
RegExpNode* on_success,
RegExpNode* on_failure)
@@ -620,7 +620,7 @@
RegExpNode* on_failure() { return on_failure_; }
int start_register() { return start_reg_; }
int end_register() { return end_reg_; }
- virtual bool Emit(RegExpCompiler* compiler) { return false; }
+ virtual bool Emit(RegExpCompiler* compiler);
virtual RegExpNode* PropagateInterest(NodeInfo* info);
private:
RegExpNode* on_failure_;
Modified: branches/experimental/regexp2000/src/parser.cc
==============================================================================
--- branches/experimental/regexp2000/src/parser.cc (original)
+++ branches/experimental/regexp2000/src/parser.cc Fri Nov 21 05:08:23 2008
@@ -515,11 +515,11 @@
uc32 ParseControlLetterEscape(bool* ok);
uc32 ParseOctalLiteral();
- // Tries to parse the input as a backreference. If successful it
+ // Tries to parse the input as a back reference. If successful it
// stores the result in the output parameter and returns true. If
// it fails it will push back the characters read so the same characters
// can be reparsed.
- bool ParseBackreferenceIndex(int* index_out);
+ bool ParseBackReferenceIndex(int* index_out);
CharacterRange ParseClassAtom(bool* is_char_class,
ZoneList<CharacterRange>* ranges,
@@ -541,6 +541,8 @@
bool has_next() { return next_pos_ < in()->length(); }
uc32 Next();
FlatStringReader* in() { return in_; }
+ void ScanForCaptures();
+ bool CaptureAvailable(int index);
uc32 current_;
bool has_more_;
bool multiline_mode_;
@@ -548,7 +550,9 @@
FlatStringReader* in_;
Handle<String>* error_;
bool has_character_escapes_;
+ bool is_scanned_for_captures_;
ZoneList<RegExpCapture*>* captures_;
+ int capture_count_;
};
@@ -3502,7 +3506,9 @@
in_(in),
error_(error),
has_character_escapes_(false),
- captures_(NULL) {
+ is_scanned_for_captures_(false),
+ captures_(NULL),
+ capture_count_(0) {
Advance(1);
}
@@ -3564,6 +3570,14 @@
}
+bool RegExpParser::CaptureAvailable(int index) {
+ if (captures_ == NULL) return false;
+ if (index >= captures_->length()) return false;
+ RegExpCapture* capture = captures_->at(index);
+ return capture != NULL && capture->available() == CAPTURE_AVAILABLE;
+}
+
+
// Disjunction ::
// Alternative
// Alternative | Disjunction
@@ -3667,14 +3681,14 @@
case '1': case '2': case '3': case '4': case '5': case '6':
case '7': case '8': case '9': {
int index = 0;
- if (ParseBackreferenceIndex(&index)) {
- RegExpCapture* capture = captures_->at(index - 1);
- if (capture == NULL || capture->available() !=
CAPTURE_AVAILABLE) {
+ if (ParseBackReferenceIndex(&index)) {
+ if (!CaptureAvailable(index - 1)) {
// Prepare to ignore a following quantifier
builder.AddEmpty();
goto has_read_atom;
}
- RegExpTree* atom = new RegExpBackreference(capture);
+ RegExpCapture* capture = captures_->at(index - 1);
+ RegExpTree* atom = new RegExpBackReference(capture);
builder.AddAtom(atom);
goto has_read_atom; // Avoid setting has_character_escapes_.
}
@@ -3844,7 +3858,42 @@
#endif
-bool RegExpParser::ParseBackreferenceIndex(int* index_out) {
+// In order to know whether an escape is a backreference or not we have to
scan
+// the entire regexp and find the number of capturing parentheses.
However we
+// don't want to scan the regexp twice unless it is necessary. This
mini-parser
+// is called when needed. It can see the difference between capturing and
+// noncapturing parentheses and can skip character classes and
backslash-escaped
+// characters.
+void RegExpParser::ScanForCaptures() {
+ int n;
+ while ((n = current()) != kEndMarker) {
+ Advance();
+ switch (n) {
+ case '\\':
+ Advance();
+ break;
+ case '[': {
+ int c;
+ while ((c = current()) != kEndMarker) {
+ Advance();
+ if (c == '\\') {
+ Advance();
+ } else {
+ if (c == ']') break;
+ }
+ }
+ break;
+ }
+ case '(':
+ if (current() != '?') capture_count_++;
+ break;
+ }
+ }
+ is_scanned_for_captures_ = true;
+}
+
+
+bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
ASSERT_EQ('\\', current());
ASSERT('1' <= Next() && Next() <= '9');
// Try to parse a decimal literal that is no greater than the number
@@ -3852,18 +3901,21 @@
// This is a not according the the ECMAScript specification. According to
// that, one must accept values up to the total number of left capturing
// parentheses in the entire input, even if they are meaningless.
- if (captures_ == NULL)
- return false;
+ if (!is_scanned_for_captures_) {
+ int saved_position = position();
+ ScanForCaptures();
+ Reset(saved_position);
+ }
+ if (capture_count_ == 0) return false;
int start = position();
int value = Next() - '0';
- if (value > captures_->length())
- return false;
+ if (value > capture_count_) return false;
Advance(2);
while (true) {
uc32 c = current();
if (IsDecimalDigit(c)) {
value = 10 * value + (c - '0');
- if (value > captures_->length()) {
+ if (value > capture_count_) {
Reset(start);
return false;
}
@@ -4068,6 +4120,7 @@
captures_ = new ZoneList<RegExpCapture*>(2);
}
captures_->Add(NULL);
+ if (!is_scanned_for_captures_) capture_count_++;
}
int capture_index = captures_started();
RegExpTree* body = ParseDisjunction(CHECK_OK);
Modified:
branches/experimental/regexp2000/src/regexp-macro-assembler-re2k.cc
==============================================================================
--- branches/experimental/regexp2000/src/regexp-macro-assembler-re2k.cc
(original)
+++ branches/experimental/regexp2000/src/regexp-macro-assembler-re2k.cc Fri
Nov 21 05:08:23 2008
@@ -174,6 +174,12 @@
}
+void RegExpMacroAssemblerRe2k::CheckNotBackReference(int start_reg,
+ Label* on_not_equal) {
+ assembler_->CheckNotBackReference(start_reg, on_not_equal);
+}
+
+
void RegExpMacroAssemblerRe2k::CheckBitmap(uc16 start,
Label* bitmap,
Label* on_zero) {
Modified: branches/experimental/regexp2000/src/regexp-macro-assembler-re2k.h
==============================================================================
--- branches/experimental/regexp2000/src/regexp-macro-assembler-re2k.h
(original)
+++ branches/experimental/regexp2000/src/regexp-macro-assembler-re2k.h Fri
Nov 21 05:08:23 2008
@@ -60,6 +60,7 @@
virtual void CheckCharacterGT(uc16 limit, Label* on_greater);
virtual void CheckCharacter(uc16 c, Label* on_equal);
virtual void CheckNotCharacter(uc16 c, Label* on_not_equal);
+ virtual void CheckNotBackReference(int start_reg, Label* on_no_match);
virtual void CheckCharacters(Vector<const uc16> str,
int cp_offset,
Label* on_failure);
Modified: branches/experimental/regexp2000/src/regexp-macro-assembler.h
==============================================================================
--- branches/experimental/regexp2000/src/regexp-macro-assembler.h
(original)
+++ branches/experimental/regexp2000/src/regexp-macro-assembler.h Fri Nov
21 05:08:23 2008
@@ -64,6 +64,7 @@
virtual void CheckCharacterGT(uc16 limit, Label* on_greater) = 0;
virtual void CheckCharacter(uc16 c, Label* on_equal) = 0;
virtual void CheckNotCharacter(uc16 c, Label* on_not_equal) = 0;
+ virtual void CheckNotBackReference(int start_reg, Label* on_no_match) =
0;
// Check the current character for a match with a literal string. If we
// fail to match then goto the on_failure label. End of input always
// matches. If the label is NULL then we should pop a backtrack address
off
Modified: branches/experimental/regexp2000/test/cctest/test-regexp.cc
==============================================================================
--- branches/experimental/regexp2000/test/cctest/test-regexp.cc (original)
+++ branches/experimental/regexp2000/test/cctest/test-regexp.cc Fri Nov 21
05:08:23 2008
@@ -160,9 +160,9 @@
CHECK_PARSE_EQ("(a)\\1", "(: (^ 'a') (<- 1))");
CHECK_PARSE_EQ("(a\\1)", "(^ 'a')");
CHECK_PARSE_EQ("(\\1a)", "(^ 'a')");
- CHECK_PARSE_EQ("\\1(a)", "(: '\\x01' (^ 'a'))");
+ CHECK_PARSE_EQ("\\1(a)", "(^ 'a')");
CHECK_PARSE_EQ("(?!(a))\\1", "(-> - (^ 'a'))");
- CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(-> - (: '\\x01' (^ 'a') (<-
1)))");
+ CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(-> - (: (^ 'a') (<- 1)))");
CHECK_PARSE_EQ("[\\0]", "[\\x00]");
CHECK_PARSE_EQ("[\\11]", "[\\x09]");
CHECK_PARSE_EQ("[\\11a]", "[\\x09 a]");
@@ -196,7 +196,7 @@
CHECK_ESCAPES("(a)", false);
CHECK_ESCAPES("(a)\\1", false);
CHECK_ESCAPES("(\\1a)", false);
- CHECK_ESCAPES("\\1(a)", true);
+ CHECK_ESCAPES("\\1(a)", false);
CHECK_ESCAPES("a\\s", false);
CHECK_ESCAPES("a\\S", false);
CHECK_ESCAPES("a\\d", false);
@@ -923,5 +923,5 @@
TEST(Graph) {
V8::Initialize(NULL);
- Execute("(a|^b|c)", "", false);
+ Execute("(x)?\\1y", "", true);
}
--~--~---------~--~----~------------~-------~--~----~
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
-~----------~----~----~----~------~----~------~--~---