Author: [EMAIL PROTECTED]
Date: Fri Nov 14 06:06:29 2008
New Revision: 759
Modified:
branches/experimental/regexp2000/src/ast.h
branches/experimental/regexp2000/src/jsregexp.cc
branches/experimental/regexp2000/src/parser.cc
branches/experimental/regexp2000/src/regexp-macro-assembler-ia32.cc
branches/experimental/regexp2000/src/regexp-macro-assembler-ia32.h
branches/experimental/regexp2000/src/regexp-macro-assembler.cc
branches/experimental/regexp2000/src/regexp-macro-assembler.h
branches/experimental/regexp2000/test/cctest/test-regexp.cc
Log:
Better handling of captures.
Ignores back-references to captures that can never be captured at that
point.
RegExpBackReference keeps link to the RegExpCapture.
Fix for bad formatting.
Modified: branches/experimental/regexp2000/src/ast.h
==============================================================================
--- branches/experimental/regexp2000/src/ast.h (original)
+++ branches/experimental/regexp2000/src/ast.h Fri Nov 14 06:06:29 2008
@@ -1341,10 +1341,13 @@
};
+enum CaptureAvailability {
+ CAPTURE_AVAILABLE, CAPTURE_UNREACHABLE,
CAPTURE_PERMANENTLY_UNREACHABLE };
+
class RegExpCapture: public RegExpTree {
public:
explicit RegExpCapture(RegExpTree* body, int index)
- : body_(body), index_(index) { }
+ : body_(body), index_(index), available_(CAPTURE_AVAILABLE) { }
virtual void* Accept(RegExpVisitor* visitor, void* data);
virtual RegExpNode* ToNode(RegExpCompiler* compiler,
RegExpNode* on_success,
@@ -1357,11 +1360,16 @@
virtual RegExpCapture* AsCapture();
RegExpTree* body() { return body_; }
int index() { return index_; }
+ inline CaptureAvailability available() { return available_; }
+ inline void set_available(CaptureAvailability availability) {
+ available_ = availability;
+ }
static int StartRegister(int index) { return index * 2; }
static int EndRegister(int index) { return index * 2 + 1; }
private:
RegExpTree* body_;
int index_;
+ CaptureAvailability available_;
};
@@ -1385,15 +1393,17 @@
class RegExpBackreference: public RegExpTree {
public:
- explicit RegExpBackreference(int index) : index_(index) { }
+ explicit RegExpBackreference(RegExpCapture* capture)
+ : capture_(capture) { }
virtual void* Accept(RegExpVisitor* visitor, void* data);
virtual RegExpNode* ToNode(RegExpCompiler* compiler,
RegExpNode* on_success,
RegExpNode* on_failure);
virtual RegExpBackreference* AsBackreference();
- int index() { return index_; }
+ int index() { return capture_->index(); }
+ RegExpCapture* capture() { return capture_; }
private:
- int index_;
+ RegExpCapture* capture_;
};
Modified: branches/experimental/regexp2000/src/jsregexp.cc
==============================================================================
--- branches/experimental/regexp2000/src/jsregexp.cc (original)
+++ branches/experimental/regexp2000/src/jsregexp.cc Fri Nov 14 06:06:29
2008
@@ -1876,15 +1876,16 @@
byte codes[10240];
Re2kAssembler assembler(Vector<byte>(codes, 1024));
RegExpMacroAssemblerRe2k macro_assembler(&assembler);
- return compiler.Assemble(¯o_assembler, node, input->capture_count,
ignore_case);
+ return compiler.Assemble(¯o_assembler,
+ node,
+ input->capture_count,
+ ignore_case);
}
RegExpMacroAssembler::RegExpMacroAssembler() {
-
}
RegExpMacroAssembler::~RegExpMacroAssembler() {
-
}
Modified: branches/experimental/regexp2000/src/parser.cc
==============================================================================
--- branches/experimental/regexp2000/src/parser.cc (original)
+++ branches/experimental/regexp2000/src/parser.cc Fri Nov 14 06:06:29 2008
@@ -291,6 +291,9 @@
public:
RegExpBuilder();
void AddCharacter(uc16 character);
+ // "Adds" an empty expression. Does nothing except consume a
+ // following quantifier
+ void AddEmpty();
void AddAtom(RegExpTree* tree);
void AddAssertion(RegExpTree* tree);
void NewAlternative(); // '|'
@@ -299,6 +302,7 @@
private:
void FlushCharacters();
bool FlushTerms();
+ bool pending_empty_;
ZoneList<uc16>* characters_;
BufferedZoneList<RegExpTree, 2> terms_;
BufferedZoneList<RegExpTree, 2> alternatives_;
@@ -311,7 +315,8 @@
};
-RegExpBuilder::RegExpBuilder() : characters_(NULL), terms_(),
alternatives_()
+RegExpBuilder::RegExpBuilder()
+ : pending_empty_(false), characters_(NULL), terms_(), alternatives_()
#ifdef DEBUG
, last_added_(ADD_NONE)
#endif
@@ -319,6 +324,7 @@
void RegExpBuilder::FlushCharacters() {
+ pending_empty_ = false;
if (characters_ != NULL) {
RegExpTree* atom = new RegExpAtom(characters_->ToConstVector());
characters_ = NULL;
@@ -329,6 +335,7 @@
void RegExpBuilder::AddCharacter(uc16 c) {
+ pending_empty_ = false;
if (characters_ == NULL) {
characters_ = new ZoneList<uc16>(4);
}
@@ -337,6 +344,11 @@
}
+void RegExpBuilder::AddEmpty() {
+ pending_empty_ = true;
+}
+
+
void RegExpBuilder::AddAtom(RegExpTree* atom) {
FlushCharacters();
terms_.Add(atom);
@@ -391,6 +403,10 @@
void RegExpBuilder::AddQuantifierToAtom(int min, int max, bool is_greedy) {
+ if (pending_empty_) {
+ pending_empty_ = false;
+ return;
+ }
RegExpTree* atom;
if (characters_ != NULL) {
ASSERT(last_added_ == ADD_CHAR);
@@ -465,7 +481,7 @@
bool HasCharacterEscapes();
- int captures_started() { return captures_started_; }
+ int captures_started() { return captures_ == NULL ? 0 :
captures_->length(); }
static const uc32 kEndMarker = unibrow::Utf8::kBadChar;
private:
@@ -479,13 +495,13 @@
bool has_more_;
bool has_next_;
bool multiline_mode_;
- int captures_started_;
unibrow::CharacterStream* in_;
Handle<String>* error_;
static const int kMaxPushback = 5;
int pushback_count_;
uc32 pushback_buffer_[kMaxPushback];
bool has_character_escapes_;
+ ZoneList<RegExpCapture*>* captures_;
};
@@ -3437,11 +3453,11 @@
has_more_(true),
has_next_(true),
multiline_mode_(multiline_mode),
- captures_started_(0),
in_(in),
error_(error),
pushback_count_(0),
- has_character_escapes_(false) {
+ has_character_escapes_(false),
+ captures_(NULL) {
Advance(2);
}
@@ -3523,15 +3539,25 @@
// Atom Quantifier
RegExpTree* RegExpParser::ParseDisjunction(bool* ok) {
RegExpBuilder builder;
+ int capture_start_index = captures_started();
while (true) {
switch (current()) {
case kEndMarker:
case ')':
return builder.ToRegExp();
- case '|':
+ case '|': {
Advance();
builder.NewAlternative();
+ int capture_new_alt_start_index = captures_started();
+ for (int i = capture_start_index; i < capture_new_alt_start_index;
i++) {
+ RegExpCapture* capture = captures_->at(i);
+ if (capture->available() == CAPTURE_AVAILABLE) {
+ capture->set_available(CAPTURE_UNREACHABLE);
+ }
+ }
+ capture_start_index = capture_new_alt_start_index;
continue;
+ }
case '*':
case '+':
case '?':
@@ -3606,7 +3632,13 @@
case '7': case '8': case '9': {
int index = 0;
if (ParseBackreferenceIndex(&index)) {
- RegExpTree* atom = new RegExpBackreference(index);
+ RegExpCapture* capture = captures_->at(index - 1);
+ if (capture == NULL || capture->available() !=
CAPTURE_AVAILABLE) {
+ // Prepare to ignore a following quantifier
+ builder.AddEmpty();
+ goto has_read_atom;
+ }
+ RegExpTree* atom = new RegExpBackreference(capture);
builder.AddAtom(atom);
goto has_read_atom; // Avoid setting has_character_escapes_.
}
@@ -3775,10 +3807,10 @@
// This is a not according the the ECMAScript specification. According to
// that, one must accept values up to the total number of left capturing
// parentheses in the entire input, even if they are meaningless.
- if (captures_started_ == 0)
+ if (captures_ == NULL)
return false;
int value = next() - '0';
- if (value > captures_started_)
+ if (value > captures_->length())
return false;
static const int kMaxChars = kMaxPushback - 2;
EmbeddedVector<uc32, kMaxChars> chars_seen;
@@ -3791,7 +3823,7 @@
value = 10 * value + (c - '0');
// To avoid reading past the end of the stack-allocated pushback
// buffers we only read kMaxChars before giving up.
- if (value > captures_started_ || char_count > kMaxChars) {
+ if (value > captures_->length() || char_count > kMaxChars) {
// If we give up we have to push the characters we read back
// onto the pushback buffer in the reverse order.
for (int i = 0; i < char_count; i++) {
@@ -4005,16 +4037,42 @@
break;
}
} else {
- captures_started_++;
+ if (captures_ == NULL) {
+ captures_ = new ZoneList<RegExpCapture*>(2);
+ }
+ captures_->Add(NULL);
}
- int capture_index = captures_started_;
+ int capture_index = captures_started();
RegExpTree* body = ParseDisjunction(CHECK_OK);
if (current() != ')') {
ReportError(CStrVector("Unterminated group"), CHECK_OK);
}
Advance();
+
+ int end_capture_index = captures_started();
+ if (type == '!') {
+ // Captures inside a negative lookahead are never available outside it.
+ for (int i = capture_index; i < end_capture_index; i++) {
+ RegExpCapture* capture = captures_->at(i);
+ ASSERT(capture != NULL);
+ capture->set_available(CAPTURE_PERMANENTLY_UNREACHABLE);
+ }
+ } else {
+ // Captures temporarily unavailable because they are in different
+ // alternatives are all available after the disjunction.
+ for (int i = capture_index; i < end_capture_index; i++) {
+ RegExpCapture* capture = captures_->at(i);
+ ASSERT(capture != NULL);
+ if (capture->available() == CAPTURE_UNREACHABLE) {
+ capture->set_available(CAPTURE_AVAILABLE);
+ }
+ }
+ }
+
if (type == '(') {
- return new RegExpCapture(body, capture_index);
+ RegExpCapture* capture = new RegExpCapture(body, capture_index);
+ captures_->at(capture_index - 1) = capture;
+ return capture;
} else if (type == ':') {
return body;
} else {
@@ -4093,10 +4151,10 @@
}
Advance();
if (ranges->length() == 0) {
- return RegExpEmpty::GetInstance();
- } else {
- return new RegExpCharacterClass(ranges, is_negated);
+ ranges->Add(CharacterRange::Range(0, 0xffff));
+ is_negated = !is_negated;
}
+ return new RegExpCharacterClass(ranges, is_negated);
}
Modified:
branches/experimental/regexp2000/src/regexp-macro-assembler-ia32.cc
==============================================================================
--- branches/experimental/regexp2000/src/regexp-macro-assembler-ia32.cc
(original)
+++ branches/experimental/regexp2000/src/regexp-macro-assembler-ia32.cc Fri
Nov 14 06:06:29 2008
@@ -62,10 +62,10 @@
*/
RegExpMacroAssemblerIA32::RegExpMacroAssemblerIA32()
- : masm_(new MacroAssembler(NULL, kRegExpCodeSize)),
- constants_(kRegExpConstantsSize),
- num_registers_(0),
- ignore_case(false) {}
+ : masm_(new MacroAssembler(NULL, kRegExpCodeSize)),
+ constants_(kRegExpConstantsSize),
+ num_registers_(0),
+ ignore_case(false) {}
RegExpMacroAssemblerIA32::~RegExpMacroAssemblerIA32() {
@@ -108,11 +108,11 @@
Label* on_zero) {
ReadCurrentChar(eax);
__ sub(eax, start);
- __ cmp(eax, 64); // FIXME: 64 = length_of_bitmap_in_bits.
+ __ cmp(eax, 64); // FIXME: 64 = length_of_bitmap_in_bits.
BranchOrBacktrack(greater_equal, on_zero);
__ mov(ebx, eax);
__ shr(ebx, 3);
- // TODO: Where is the bitmap stored? Pass the bitmap as argument instead.
+ // TODO(lrn): Where is the bitmap stored? Pass the bitmap as argument
instead.
// __ mov(ecx, position_of_bitmap);
__ movzx_b(ebx, Operand(ecx, ebx, times_1, 0));
__ and_(eax, (1<<3)-1);
@@ -144,8 +144,8 @@
BranchOrBacktrack(greater_equal, on_failure);
if (str.length() <= kMaxInlineStringTests || ignore_case()) {
- // TODO: make proper loop if str.length is large but ignore_case is
true;
- for(int i = 0; i < str.length(); i++) {
+ // TODO(lrn): make loop if str.length is large but ignore_case is true;
+ for (int i = 0; i < str.length(); i++) {
ReadChar(eax, i);
if (ignore_case()) {
Canonicalize(eax);
@@ -167,7 +167,7 @@
if (sizeof(SubjectChar) == 1) {
__ rep_cmpsb();
} else {
- ASSERT(sizeof(SubjectChar)==2);
+ ASSERT(sizeof(SubjectChar) == 2);
__ rep_cmpsw();
}
__ mov(esi, ebx);
@@ -201,7 +201,7 @@
__ mov(ebx, eax);
__ shr(eax, 2);
- __ movzx_b(eax, Operand(ecx, eax)); // FIXME: ecx holds address of map
+ __ movzx_b(eax, Operand(ecx, eax)); // FIXME: ecx holds address of map
Label got_nybble;
Label high_bits;
__ and_(ebx, 0x03);
@@ -263,7 +263,7 @@
__ cmp(eax, destinations.length() - start);
__ j(greater_equal, &fallthrough);
- // TODO jumptable: jump to destinations[eax]
+ // TODO(lrn) jumptable: jump to destinations[eax]
__ bind(&fallthrough);
}
@@ -283,7 +283,7 @@
}
-void RegExpMacroAssemblerIA32::GoTo(Label &to) {
+void RegExpMacroAssemblerIA32::GoTo(Label* to) {
__ jmp(to);
}
@@ -398,7 +398,7 @@
return;
}
ASSERT(sizeof(SubjectChar) == 2);
- // TODO: Use some tables.
+ // TODO(lrn): Use some tables.
}
@@ -427,10 +427,10 @@
}
-template <typename T>
-void LoadConstantBufferAddress(Register reg, ArraySlice<T>& buffer) {
- __ mov(reg, buffer.array());
- __ add(reg, buffer.base_offset());
+void RegExpMacroAssemblerIA32::LoadConstantBufferAddress(
+ Register reg, ArraySlice<T>* buffer) {
+ __ mov(reg, buffer->array());
+ __ add(reg, buffer->base_offset());
}
#undef __
Modified: branches/experimental/regexp2000/src/regexp-macro-assembler-ia32.h
==============================================================================
--- branches/experimental/regexp2000/src/regexp-macro-assembler-ia32.h
(original)
+++ branches/experimental/regexp2000/src/regexp-macro-assembler-ia32.h Fri
Nov 14 06:06:29 2008
@@ -133,6 +133,10 @@
// Read a character from input at the given offset from the current
// position.
void ReadChar(Register destination, int offset);
+
+ template <typename T>
+ void LoadConstantBufferAddress(Register reg, ArraySlice<T>* buffer);
+
// Read the current character into the destination register.
void ReadCurrentChar(Register destination);
@@ -145,7 +149,6 @@
int num_registers_;
bool ignore_case_;
};
-
}}
#endif /* REGEXP_MACRO_ASSEMBLER_IA32_H_ */
Modified: branches/experimental/regexp2000/src/regexp-macro-assembler.cc
==============================================================================
--- branches/experimental/regexp2000/src/regexp-macro-assembler.cc
(original)
+++ branches/experimental/regexp2000/src/regexp-macro-assembler.cc Fri Nov
14 06:06:29 2008
@@ -32,9 +32,9 @@
ByteArrayProvider::ByteArrayProvider(int initial_size)
- : byte_array_size_(initial_size),
- current_byte_array_(NULL),
- current_byte_array_free_offset_(initial_size) {}
+ : byte_array_size_(initial_size),
+ current_byte_array_(NULL),
+ current_byte_array_free_offset_(initial_size) {}
template <typename T>
@@ -58,5 +58,4 @@
current_byte_array_free_offset_ = free_offset + size;
return ArraySlice<T>(current_byte_array_, free_offset);
}
-
}}
Modified: branches/experimental/regexp2000/src/regexp-macro-assembler.h
==============================================================================
--- branches/experimental/regexp2000/src/regexp-macro-assembler.h
(original)
+++ branches/experimental/regexp2000/src/regexp-macro-assembler.h Fri Nov
14 06:06:29 2008
@@ -123,7 +123,7 @@
template <typename T>
struct ArraySlice {
-public:
+ public:
ArraySlice(Handle<ByteArray> array, size_t offset)
: array_(array), offset_(offset) {}
Handle<ByteArray> array() { return array_; }
@@ -139,15 +139,15 @@
T& operator[](int idx) {
return reinterpret_cast<T*>(array_->GetDataStartAddress() +
offset)[idx];
}
-private:
- const Handle<ByteArray> array_;
- const size_t offset_;
+ private:
+ Handle<ByteArray> array_;
+ size_t offset_;
};
class ByteArrayProvider {
public:
- ByteArrayProvider(int initial_size);
+ explicit ByteArrayProvider(int initial_size);
// Provides a place to put "size" elements of type T. The information
// can be stored in the provided ByteArray at the "offset". The offset is
// aligned to an "align"-boundary
Modified: branches/experimental/regexp2000/test/cctest/test-regexp.cc
==============================================================================
--- branches/experimental/regexp2000/test/cctest/test-regexp.cc (original)
+++ branches/experimental/regexp2000/test/cctest/test-regexp.cc Fri Nov 14
06:06:29 2008
@@ -106,7 +106,8 @@
CHECK_PARSE_EQ("foo(?!bar)baz", "(: 'foo' (-> - 'bar') 'baz')");
CHECK_PARSE_EQ("()", "(^ %)");
CHECK_PARSE_EQ("(?=)", "(-> + %)");
- CHECK_PARSE_EQ("[]", "%");
+ CHECK_PARSE_EQ("[]", "^[\x00-\uffff]");
+ CHECK_PARSE_EQ("[^]", "[\x00-\uffff]");
CHECK_PARSE_EQ("[x]", "[x]");
CHECK_PARSE_EQ("[xyz]", "[x y z]");
CHECK_PARSE_EQ("[a-zA-Z0-9]", "[a-z A-Z 0-9]");
@@ -155,9 +156,11 @@
"(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')"
" (^ 'x') (^ 'x') (^ 'x') (^ 'x') '\x09')");
CHECK_PARSE_EQ("(a)\\1", "(: (^ 'a') (<- 1))");
- CHECK_PARSE_EQ("(a\\1)", "(^ (: 'a' (<- 1)))");
- CHECK_PARSE_EQ("(\\1a)", "(^ (: (<- 1) 'a'))");
+ CHECK_PARSE_EQ("(a\\1)", "(^ 'a')");
+ CHECK_PARSE_EQ("(\\1a)", "(^ 'a')");
CHECK_PARSE_EQ("\\1(a)", "(: '\x01' (^ 'a'))");
+ CHECK_PARSE_EQ("(?!(a))\\1", "(-> - (^ 'a'))");
+ CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(-> - (: '\x01' (^ 'a') (<-
1)))");
CHECK_PARSE_EQ("[\\0]", "[\0]");
CHECK_PARSE_EQ("[\\11]", "[\t]");
CHECK_PARSE_EQ("[\\11a]", "[\t a]");
--~--~---------~--~----~------------~-------~--~----~
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
-~----------~----~----~----~------~----~------~--~---