Revision: 5657
Author: [email protected]
Date: Tue Oct 19 02:42:40 2010
Log: Use finite-length end-anchored regexps to reduce part of regexp that is searched.

Review URL: http://codereview.chromium.org/3850005
http://code.google.com/p/v8/source/detail?r=5657

Modified:
 /branches/bleeding_edge/src/arm/regexp-macro-assembler-arm.cc
 /branches/bleeding_edge/src/arm/regexp-macro-assembler-arm.h
 /branches/bleeding_edge/src/ast.cc
 /branches/bleeding_edge/src/ast.h
 /branches/bleeding_edge/src/bytecodes-irregexp.h
 /branches/bleeding_edge/src/ia32/regexp-macro-assembler-ia32.cc
 /branches/bleeding_edge/src/ia32/regexp-macro-assembler-ia32.h
 /branches/bleeding_edge/src/interpreter-irregexp.cc
 /branches/bleeding_edge/src/jsregexp.cc
 /branches/bleeding_edge/src/regexp-macro-assembler-irregexp.cc
 /branches/bleeding_edge/src/regexp-macro-assembler-irregexp.h
 /branches/bleeding_edge/src/regexp-macro-assembler-tracer.cc
 /branches/bleeding_edge/src/regexp-macro-assembler-tracer.h
 /branches/bleeding_edge/src/regexp-macro-assembler.h
 /branches/bleeding_edge/src/x64/regexp-macro-assembler-x64.cc
 /branches/bleeding_edge/src/x64/regexp-macro-assembler-x64.h
 /branches/bleeding_edge/test/mjsunit/regexp.js

=======================================
--- /branches/bleeding_edge/src/arm/regexp-macro-assembler-arm.cc Tue Sep 7 04:09:45 2010 +++ /branches/bleeding_edge/src/arm/regexp-macro-assembler-arm.cc Tue Oct 19 02:42:40 2010
@@ -142,7 +142,6 @@

 void RegExpMacroAssemblerARM::AdvanceCurrentPosition(int by) {
   if (by != 0) {
-    Label inside_string;
     __ add(current_input_offset(),
            current_input_offset(), Operand(by * char_size()));
   }
@@ -925,6 +924,19 @@
   __ ldr(r0, MemOperand(frame_pointer(), kStackHighEnd));
   __ add(backtrack_stackpointer(), backtrack_stackpointer(), Operand(r0));
 }
+
+
+void RegExpMacroAssemblerARM::SetCurrentPositionFromEnd(int by) {
+  NearLabel after_position;
+  __ cmp(current_input_offset(), Operand(by * char_size()));
+  __ b(hs, &after_position);
+  __ mov(current_input_offset(), Operand(by * char_size()));
+ // On RegExp code entry (where this operation is used), the character before
+  // the current position is expected to be already loaded.
+  // We have advenced the position, so it's safe to read backwards.
+  LoadCurrentCharacterUnchecked(-1, 1);
+  __ bind(&after_position);
+}


 void RegExpMacroAssemblerARM::SetRegister(int register_index, int to) {
=======================================
--- /branches/bleeding_edge/src/arm/regexp-macro-assembler-arm.h Mon Aug 30 04:48:07 2010 +++ /branches/bleeding_edge/src/arm/regexp-macro-assembler-arm.h Tue Oct 19 02:42:40 2010
@@ -100,6 +100,7 @@
                             StackCheckFlag check_stack_limit);
   virtual void ReadCurrentPositionFromRegister(int reg);
   virtual void ReadStackPointerFromRegister(int reg);
+  virtual void SetCurrentPositionFromEnd(int by);
   virtual void SetRegister(int register_index, int to);
   virtual void Succeed();
   virtual void WriteCurrentPositionToRegister(int reg, int cp_offset);
=======================================
--- /branches/bleeding_edge/src/ast.cc  Fri Sep 24 00:53:59 2010
+++ /branches/bleeding_edge/src/ast.cc  Tue Oct 19 02:42:40 2010
@@ -398,39 +398,70 @@
 }


-bool RegExpAssertion::IsAnchored() {
+bool RegExpAssertion::IsAnchoredAtStart() {
   return type() == RegExpAssertion::START_OF_INPUT;
 }


-bool RegExpAlternative::IsAnchored() {
+bool RegExpAssertion::IsAnchoredAtEnd() {
+  return type() == RegExpAssertion::END_OF_INPUT;
+}
+
+
+bool RegExpAlternative::IsAnchoredAtStart() {
   ZoneList<RegExpTree*>* nodes = this->nodes();
   for (int i = 0; i < nodes->length(); i++) {
     RegExpTree* node = nodes->at(i);
-    if (node->IsAnchored()) { return true; }
+    if (node->IsAnchoredAtStart()) { return true; }
     if (node->max_match() > 0) { return false; }
   }
   return false;
 }


-bool RegExpDisjunction::IsAnchored() {
+bool RegExpAlternative::IsAnchoredAtEnd() {
+  ZoneList<RegExpTree*>* nodes = this->nodes();
+  for (int i = nodes->length() - 1; i >= 0; i--) {
+    RegExpTree* node = nodes->at(i);
+    if (node->IsAnchoredAtEnd()) { return true; }
+    if (node->max_match() > 0) { return false; }
+  }
+  return false;
+}
+
+
+bool RegExpDisjunction::IsAnchoredAtStart() {
   ZoneList<RegExpTree*>* alternatives = this->alternatives();
   for (int i = 0; i < alternatives->length(); i++) {
-    if (!alternatives->at(i)->IsAnchored())
+    if (!alternatives->at(i)->IsAnchoredAtStart())
       return false;
   }
   return true;
 }


-bool RegExpLookahead::IsAnchored() {
-  return is_positive() && body()->IsAnchored();
+bool RegExpDisjunction::IsAnchoredAtEnd() {
+  ZoneList<RegExpTree*>* alternatives = this->alternatives();
+  for (int i = 0; i < alternatives->length(); i++) {
+    if (!alternatives->at(i)->IsAnchoredAtEnd())
+      return false;
+  }
+  return true;
 }


-bool RegExpCapture::IsAnchored() {
-  return body()->IsAnchored();
+bool RegExpLookahead::IsAnchoredAtStart() {
+  return is_positive() && body()->IsAnchoredAtStart();
+}
+
+
+bool RegExpCapture::IsAnchoredAtStart() {
+  return body()->IsAnchoredAtStart();
+}
+
+
+bool RegExpCapture::IsAnchoredAtEnd() {
+  return body()->IsAnchoredAtEnd();
 }


=======================================
--- /branches/bleeding_edge/src/ast.h   Mon Oct  4 07:30:43 2010
+++ /branches/bleeding_edge/src/ast.h   Tue Oct 19 02:42:40 2010
@@ -1523,7 +1523,8 @@
   virtual RegExpNode* ToNode(RegExpCompiler* compiler,
                              RegExpNode* on_success) = 0;
   virtual bool IsTextElement() { return false; }
-  virtual bool IsAnchored() { return false; }
+  virtual bool IsAnchoredAtStart() { return false; }
+  virtual bool IsAnchoredAtEnd() { return false; }
   virtual int min_match() = 0;
   virtual int max_match() = 0;
   // Returns the interval of registers used for captures within this
@@ -1548,7 +1549,8 @@
   virtual RegExpDisjunction* AsDisjunction();
   virtual Interval CaptureRegisters();
   virtual bool IsDisjunction();
-  virtual bool IsAnchored();
+  virtual bool IsAnchoredAtStart();
+  virtual bool IsAnchoredAtEnd();
   virtual int min_match() { return min_match_; }
   virtual int max_match() { return max_match_; }
   ZoneList<RegExpTree*>* alternatives() { return alternatives_; }
@@ -1568,7 +1570,8 @@
   virtual RegExpAlternative* AsAlternative();
   virtual Interval CaptureRegisters();
   virtual bool IsAlternative();
-  virtual bool IsAnchored();
+  virtual bool IsAnchoredAtStart();
+  virtual bool IsAnchoredAtEnd();
   virtual int min_match() { return min_match_; }
   virtual int max_match() { return max_match_; }
   ZoneList<RegExpTree*>* nodes() { return nodes_; }
@@ -1595,7 +1598,8 @@
                              RegExpNode* on_success);
   virtual RegExpAssertion* AsAssertion();
   virtual bool IsAssertion();
-  virtual bool IsAnchored();
+  virtual bool IsAnchoredAtStart();
+  virtual bool IsAnchoredAtEnd();
   virtual int min_match() { return 0; }
   virtual int max_match() { return 0; }
   Type type() { return type_; }
@@ -1768,7 +1772,8 @@
                             RegExpCompiler* compiler,
                             RegExpNode* on_success);
   virtual RegExpCapture* AsCapture();
-  virtual bool IsAnchored();
+  virtual bool IsAnchoredAtStart();
+  virtual bool IsAnchoredAtEnd();
   virtual Interval CaptureRegisters();
   virtual bool IsCapture();
   virtual int min_match() { return body_->min_match(); }
@@ -1800,7 +1805,7 @@
   virtual RegExpLookahead* AsLookahead();
   virtual Interval CaptureRegisters();
   virtual bool IsLookahead();
-  virtual bool IsAnchored();
+  virtual bool IsAnchoredAtStart();
   virtual int min_match() { return 0; }
   virtual int max_match() { return 0; }
   RegExpTree* body() { return body_; }
=======================================
--- /branches/bleeding_edge/src/bytecodes-irregexp.h Wed Sep 9 03:49:40 2009 +++ /branches/bleeding_edge/src/bytecodes-irregexp.h Tue Oct 19 02:42:40 2010
@@ -88,7 +88,8 @@
V(CHECK_AT_START, 44, 8) /* bc8 pad24 addr32 */ \ V(CHECK_NOT_AT_START, 45, 8) /* bc8 pad24 addr32 */ \ V(CHECK_GREEDY, 46, 8) /* bc8 pad24 addr32 */ \ -V(ADVANCE_CP_AND_GOTO, 47, 8) /* bc8 offset24 addr32 */ +V(ADVANCE_CP_AND_GOTO, 47, 8) /* bc8 offset24 addr32 */ \ +V(SET_CURRENT_POSITION_FROM_END, 48, 4) /* bc8 idx24 */

 #define DECLARE_BYTECODES(name, code, length) \
   static const int BC_##name = code;
=======================================
--- /branches/bleeding_edge/src/ia32/regexp-macro-assembler-ia32.cc Mon Aug 30 04:48:07 2010 +++ /branches/bleeding_edge/src/ia32/regexp-macro-assembler-ia32.cc Tue Oct 19 02:42:40 2010
@@ -133,7 +133,6 @@

 void RegExpMacroAssemblerIA32::AdvanceCurrentPosition(int by) {
   if (by != 0) {
-    Label inside_string;
     __ add(Operand(edi), Immediate(by * char_size()));
   }
 }
@@ -964,6 +963,17 @@
   __ add(backtrack_stackpointer(), Operand(ebp, kStackHighEnd));
 }

+void RegExpMacroAssemblerIA32::SetCurrentPositionFromEnd(int by)  {
+  NearLabel after_position;
+  __ cmp(edi, -by * char_size());
+  __ j(above_equal, &after_position);
+  __ mov(edi, -by * char_size());
+ // On RegExp code entry (where this operation is used), the character before
+  // the current position is expected to be already loaded.
+  // We have advenced the position, so it's safe to read backwards.
+  LoadCurrentCharacterUnchecked(-1, 1);
+  __ bind(&after_position);
+}

 void RegExpMacroAssemblerIA32::SetRegister(int register_index, int to) {
ASSERT(register_index >= num_saved_registers_); // Reserved for positions!
=======================================
--- /branches/bleeding_edge/src/ia32/regexp-macro-assembler-ia32.h Tue May 11 00:29:10 2010 +++ /branches/bleeding_edge/src/ia32/regexp-macro-assembler-ia32.h Tue Oct 19 02:42:40 2010
@@ -98,6 +98,7 @@
                             StackCheckFlag check_stack_limit);
   virtual void ReadCurrentPositionFromRegister(int reg);
   virtual void ReadStackPointerFromRegister(int reg);
+  virtual void SetCurrentPositionFromEnd(int by);
   virtual void SetRegister(int register_index, int to);
   virtual void Succeed();
   virtual void WriteCurrentPositionToRegister(int reg, int cp_offset);
=======================================
--- /branches/bleeding_edge/src/interpreter-irregexp.cc Wed Nov 11 01:50:06 2009 +++ /branches/bleeding_edge/src/interpreter-irregexp.cc Tue Oct 19 02:42:40 2010
@@ -607,6 +607,15 @@
           pc = code_base + Load32Aligned(pc + 4);
         }
         break;
+      BYTECODE(SET_CURRENT_POSITION_FROM_END) {
+        int by = static_cast<uint32_t>(insn) >> BYTECODE_SHIFT;
+        if (subject.length() - current > by) {
+          current = subject.length() - by;
+          current_char = subject[current - 1];
+        }
+        pc += BC_SET_CURRENT_POSITION_FROM_END_LENGTH;
+        break;
+      }
       default:
         UNREACHABLE();
         break;
=======================================
--- /branches/bleeding_edge/src/jsregexp.cc     Fri Oct  1 07:10:47 2010
+++ /branches/bleeding_edge/src/jsregexp.cc     Tue Oct 19 02:42:40 2010
@@ -5180,7 +5180,10 @@
                                                     &compiler,
                                                     compiler.accept());
   RegExpNode* node = captured_body;
-  if (!data->tree->IsAnchored()) {
+  bool is_end_anchored = data->tree->IsAnchoredAtEnd();
+  bool is_start_anchored = data->tree->IsAnchoredAtStart();
+  int max_length = data->tree->max_match();
+  if (!is_start_anchored) {
     // Add a .*? at the beginning, outside the body capture, unless
     // this expression is anchored at the beginning.
     RegExpNode* loop_node =
@@ -5235,6 +5238,15 @@
   EmbeddedVector<byte, 1024> codes;
   RegExpMacroAssemblerIrregexp macro_assembler(codes);
 #endif  // V8_INTERPRETED_REGEXP
+
+ // Inserted here, instead of in Assembler, because it depends on information
+  // in the AST that isn't replicated in the Node structure.
+  static const int kMaxBacksearchLimit = 1024;
+  if (is_end_anchored &&
+      !is_start_anchored &&
+      max_length < kMaxBacksearchLimit) {
+    macro_assembler.SetCurrentPositionFromEnd(max_length);
+  }

   return compiler.Assemble(&macro_assembler,
                            node,
=======================================
--- /branches/bleeding_edge/src/regexp-macro-assembler-irregexp.cc Mon Apr 19 12:30:11 2010 +++ /branches/bleeding_edge/src/regexp-macro-assembler-irregexp.cc Tue Oct 19 02:42:40 2010
@@ -143,6 +143,12 @@
   ASSERT(register_index <= kMaxRegister);
   Emit(BC_SET_SP_TO_REGISTER, register_index);
 }
+
+
+void RegExpMacroAssemblerIrregexp::SetCurrentPositionFromEnd(int by) {
+  ASSERT(is_uint24(by));
+  Emit(BC_SET_CURRENT_POSITION_FROM_END, by);
+}


void RegExpMacroAssemblerIrregexp::SetRegister(int register_index, int to) {
=======================================
--- /branches/bleeding_edge/src/regexp-macro-assembler-irregexp.h Mon Apr 19 12:30:11 2010 +++ /branches/bleeding_edge/src/regexp-macro-assembler-irregexp.h Tue Oct 19 02:42:40 2010
@@ -65,6 +65,7 @@
   virtual void PushRegister(int register_index,
                             StackCheckFlag check_stack_limit);
   virtual void AdvanceRegister(int reg, int by);  // r[reg] += by.
+  virtual void SetCurrentPositionFromEnd(int by);
   virtual void SetRegister(int register_index, int to);
   virtual void WriteCurrentPositionToRegister(int reg, int cp_offset);
   virtual void ClearRegisters(int reg_from, int reg_to);
=======================================
--- /branches/bleeding_edge/src/regexp-macro-assembler-tracer.cc Thu Sep 30 00:22:53 2010 +++ /branches/bleeding_edge/src/regexp-macro-assembler-tracer.cc Tue Oct 19 02:42:40 2010
@@ -134,6 +134,12 @@
   PrintF(" AdvanceRegister(register=%d, by=%d);\n", reg, by);
   assembler_->AdvanceRegister(reg, by);
 }
+
+
+void RegExpMacroAssemblerTracer::SetCurrentPositionFromEnd(int by) {
+  PrintF(" SetCurrentPositionFromEnd(by=%d);\n", by);
+  assembler_->SetCurrentPositionFromEnd(by);
+}


 void RegExpMacroAssemblerTracer::SetRegister(int register_index, int to) {
=======================================
--- /branches/bleeding_edge/src/regexp-macro-assembler-tracer.h Thu Jan 7 11:01:23 2010 +++ /branches/bleeding_edge/src/regexp-macro-assembler-tracer.h Tue Oct 19 02:42:40 2010
@@ -89,6 +89,7 @@
                             StackCheckFlag check_stack_limit);
   virtual void ReadCurrentPositionFromRegister(int reg);
   virtual void ReadStackPointerFromRegister(int reg);
+  virtual void SetCurrentPositionFromEnd(int by);
   virtual void SetRegister(int register_index, int to);
   virtual void Succeed();
   virtual void WriteCurrentPositionToRegister(int reg, int cp_offset);
=======================================
--- /branches/bleeding_edge/src/regexp-macro-assembler.h Mon Aug 30 04:48:07 2010 +++ /branches/bleeding_edge/src/regexp-macro-assembler.h Tue Oct 19 02:42:40 2010
@@ -155,6 +155,7 @@
                             StackCheckFlag check_stack_limit) = 0;
   virtual void ReadCurrentPositionFromRegister(int reg) = 0;
   virtual void ReadStackPointerFromRegister(int reg) = 0;
+  virtual void SetCurrentPositionFromEnd(int by) = 0;
   virtual void SetRegister(int register_index, int to) = 0;
   virtual void Succeed() = 0;
   virtual void WriteCurrentPositionToRegister(int reg, int cp_offset) = 0;
=======================================
--- /branches/bleeding_edge/src/x64/regexp-macro-assembler-x64.cc Mon Aug 30 04:48:07 2010 +++ /branches/bleeding_edge/src/x64/regexp-macro-assembler-x64.cc Tue Oct 19 02:42:40 2010
@@ -145,7 +145,6 @@

 void RegExpMacroAssemblerX64::AdvanceCurrentPosition(int by) {
   if (by != 0) {
-    Label inside_string;
     __ addq(rdi, Immediate(by * char_size()));
   }
 }
@@ -1051,6 +1050,19 @@
   __ movq(backtrack_stackpointer(), register_location(reg));
   __ addq(backtrack_stackpointer(), Operand(rbp, kStackHighEnd));
 }
+
+
+void RegExpMacroAssemblerX64::SetCurrentPositionFromEnd(int by) {
+  NearLabel after_position;
+  __ cmpq(rdi, Immediate(-by * char_size()));
+  __ j(above_equal, &after_position);
+  __ movq(rdi, Immediate(-by * char_size()));
+ // On RegExp code entry (where this operation is used), the character before
+  // the current position is expected to be already loaded.
+  // We have advenced the position, so it's safe to read backwards.
+  LoadCurrentCharacterUnchecked(-1, 1);
+  __ bind(&after_position);
+}


 void RegExpMacroAssemblerX64::SetRegister(int register_index, int to) {
=======================================
--- /branches/bleeding_edge/src/x64/regexp-macro-assembler-x64.h Tue May 11 00:29:10 2010 +++ /branches/bleeding_edge/src/x64/regexp-macro-assembler-x64.h Tue Oct 19 02:42:40 2010
@@ -93,6 +93,7 @@
                             StackCheckFlag check_stack_limit);
   virtual void ReadCurrentPositionFromRegister(int reg);
   virtual void ReadStackPointerFromRegister(int reg);
+  virtual void SetCurrentPositionFromEnd(int by);
   virtual void SetRegister(int register_index, int to);
   virtual void Succeed();
   virtual void WriteCurrentPositionToRegister(int reg, int cp_offset);
=======================================
--- /branches/bleeding_edge/test/mjsunit/regexp.js      Thu Oct 14 05:54:00 2010
+++ /branches/bleeding_edge/test/mjsunit/regexp.js      Tue Oct 19 02:42:40 2010
@@ -589,3 +589,59 @@
 assertEquals(false, desc.configurable);
 assertEquals(false, desc.enumerable);
 assertEquals(true, desc.writable);
+
+
+// Check that end-anchored regexps are optimized correctly.
+var re = /(?:a|bc)g$/;
+assertTrue(re.test("ag"));
+assertTrue(re.test("bcg"));
+assertTrue(re.test("abcg"));
+assertTrue(re.test("zimbag"));
+assertTrue(re.test("zimbcg"));
+
+assertFalse(re.test("g"));
+assertFalse(re.test(""));
+
+// Global regexp (non-zero start).
+var re = /(?:a|bc)g$/g;
+assertTrue(re.test("ag"));
+re.lastIndex = 1;  // Near start of string.
+assertTrue(re.test("zimbag"));
+re.lastIndex = 5;  // Near end of string.
+assertFalse(re.test("zimbag"));
+re.lastIndex = 4;
+assertTrue(re.test("zimbag"));
+
+// Anchored at both ends.
+var re = /^(?:a|bc)g$/g;
+assertTrue(re.test("ag"));
+re.lastIndex = 1;
+assertFalse(re.test("ag"));
+re.lastIndex = 1;
+assertFalse(re.test("zag"));
+
+// Long max_length of RegExp.
+var re = /VeryLongRegExp!{1,1000}$/;
+assertTrue(re.test("BahoolaVeryLongRegExp!!!!!!"));
+assertFalse(re.test("VeryLongRegExp"));
+assertFalse(re.test("!"));
+
+// End anchor inside disjunction.
+var re = /(?:a$|bc$)/;
+assertTrue(re.test("a"));
+assertTrue(re.test("bc"));
+assertTrue(re.test("abc"));
+assertTrue(re.test("zimzamzumba"));
+assertTrue(re.test("zimzamzumbc"));
+assertFalse(re.test("c"));
+assertFalse(re.test(""));
+
+// Only partially anchored.
+var re = /(?:a|bc$)/;
+assertTrue(re.test("a"));
+assertTrue(re.test("bc"));
+assertEquals(["a"], re.exec("abc"));
+assertEquals(4, re.exec("zimzamzumba").index);
+assertEquals(["bc"], re.exec("zimzomzumbc"));
+assertFalse(re.test("c"));
+assertFalse(re.test(""));

--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev

Reply via email to