Author: [EMAIL PROTECTED]
Date: Fri Nov 21 05:08:23 2008
New Revision: 820

Modified:
    branches/experimental/regexp2000/src/assembler-re2k.cc
    branches/experimental/regexp2000/src/assembler-re2k.h
    branches/experimental/regexp2000/src/ast.cc
    branches/experimental/regexp2000/src/ast.h
    branches/experimental/regexp2000/src/bytecodes-re2k.h
    branches/experimental/regexp2000/src/interpreter-re2k.cc
    branches/experimental/regexp2000/src/jsregexp.cc
    branches/experimental/regexp2000/src/jsregexp.h
    branches/experimental/regexp2000/src/parser.cc
    branches/experimental/regexp2000/src/regexp-macro-assembler-re2k.cc
    branches/experimental/regexp2000/src/regexp-macro-assembler-re2k.h
    branches/experimental/regexp2000/src/regexp-macro-assembler.h
    branches/experimental/regexp2000/test/cctest/test-regexp.cc

Log:
Add back references.  Since "back references" is two words
the CamelCasing was fixed in various places.
Review URL: http://codereview.chromium.org/11350

Modified: branches/experimental/regexp2000/src/assembler-re2k.cc
==============================================================================
--- branches/experimental/regexp2000/src/assembler-re2k.cc      (original)
+++ branches/experimental/regexp2000/src/assembler-re2k.cc      Fri Nov 21  
05:08:23 2008
@@ -211,10 +211,9 @@
  }


-void Re2kAssembler::CheckBackref(int capture_index,
-                                 Label* on_mismatch) {
-  Emit(BC_CHECK_BACKREF);
-  Emit32(0);
+void Re2kAssembler::CheckNotBackReference(int capture_index,
+                                          Label* on_mismatch) {
+  Emit(BC_CHECK_NOT_BACK_REF);
    Emit(capture_index);
    EmitOrLink(on_mismatch);
  }

Modified: branches/experimental/regexp2000/src/assembler-re2k.h
==============================================================================
--- branches/experimental/regexp2000/src/assembler-re2k.h       (original)
+++ branches/experimental/regexp2000/src/assembler-re2k.h       Fri Nov 21  
05:08:23 2008
@@ -71,8 +71,8 @@
    // previous capture.  Advances current position by the length of the  
capture
    // iff it matches.  The capture is stored in a given register and the
    // the register after.  If a register contains -1 then the other register
-  // mush always contain -1 and the on_mismatch label will never be called.
-  void CheckBackref(int capture_index, Label* on_mismatch);
+  // must always contain -1 and the on_mismatch label will never be called.
+  void CheckNotBackReference(int capture_index, Label* on_mismatch);

    // Checks a register for strictly-less-than or greater-than-or-equal.
    void CheckRegisterLT(int reg_index, uint16_t vs, Label* on_less_than);

Modified: branches/experimental/regexp2000/src/ast.cc
==============================================================================
--- branches/experimental/regexp2000/src/ast.cc (original)
+++ branches/experimental/regexp2000/src/ast.cc Fri Nov 21 05:08:23 2008
@@ -360,7 +360,7 @@
  }


-void* RegExpUnparser::VisitBackreference(RegExpBackreference* that,
+void* RegExpUnparser::VisitBackReference(RegExpBackReference* that,
                                           void* data) {
    stream()->Add("(<- %i)", that->index());
    return NULL;

Modified: branches/experimental/regexp2000/src/ast.h
==============================================================================
--- branches/experimental/regexp2000/src/ast.h  (original)
+++ branches/experimental/regexp2000/src/ast.h  Fri Nov 21 05:08:23 2008
@@ -1412,16 +1412,16 @@
  };


-class RegExpBackreference: public RegExpTree {
+class RegExpBackReference: public RegExpTree {
   public:
-  explicit RegExpBackreference(RegExpCapture* capture)
+  explicit RegExpBackReference(RegExpCapture* capture)
      : capture_(capture) { }
    virtual void* Accept(RegExpVisitor* visitor, void* data);
    virtual RegExpNode* ToNode(RegExpCompiler* compiler,
                               RegExpNode* on_success,
                               RegExpNode* on_failure);
-  virtual RegExpBackreference* AsBackreference();
-  virtual bool IsBackreference();
+  virtual RegExpBackReference* AsBackReference();
+  virtual bool IsBackReference();
    int index() { return capture_->index(); }
    RegExpCapture* capture() { return capture_; }
   private:

Modified: branches/experimental/regexp2000/src/bytecodes-re2k.h
==============================================================================
--- branches/experimental/regexp2000/src/bytecodes-re2k.h       (original)
+++ branches/experimental/regexp2000/src/bytecodes-re2k.h       Fri Nov 21  
05:08:23 2008
@@ -54,14 +54,13 @@
  V(CHECK_NOT_CHAR,    19, 7) /* check_not_char uc16  
addr32                   */ \
  V(CHECK_LT,          20, 7) /* check_lt uc16  
addr32                         */ \
  V(CHECK_GT,          21, 7) /* check_gr uc16  
addr32                         */ \
-V(CHECK_BACKREF,     22, 9) /* check_backref offset32 capture_idx  
addr32    */ \
-V(CHECK_NOT_BACKREF, 23, 9) /* check_not_backref offset32 capture_idx  
addr32*/ \
-V(LOOKUP_MAP1,       24, 11) /* l_map1 start16 bit_map_addr32  
addr32        */ \
-V(LOOKUP_MAP2,       25, 99) /* l_map2 start16  
half_nibble_map_addr32*      */ \
-V(LOOKUP_MAP8,       26, 99) /* l_map8 start16 byte_map  
addr32*             */ \
-V(LOOKUP_HI_MAP8,    27, 99) /* l_himap8 start8 byte_map_addr32  
addr32*     */ \
-V(CHECK_REGISTER_LT, 28, 8) /* check_reg_lt register_index value16  
addr32   */ \
-V(CHECK_REGISTER_GE, 29, 8) /* check_reg_ge register_index value16  
addr32   */ \
+V(CHECK_NOT_BACK_REF, 22, 6) /* check_not_back_ref capture_idx  
addr32       */ \
+V(LOOKUP_MAP1,       23, 11) /* l_map1 start16 bit_map_addr32  
addr32        */ \
+V(LOOKUP_MAP2,       24, 99) /* l_map2 start16  
half_nibble_map_addr32*      */ \
+V(LOOKUP_MAP8,       25, 99) /* l_map8 start16 byte_map  
addr32*             */ \
+V(LOOKUP_HI_MAP8,    26, 99) /* l_himap8 start8 byte_map_addr32  
addr32*     */ \
+V(CHECK_REGISTER_LT, 27, 8) /* check_reg_lt register_index value16  
addr32   */ \
+V(CHECK_REGISTER_GE, 28, 8) /* check_reg_ge register_index value16  
addr32   */ \

  #define DECLARE_BYTECODES(name, code, length) \
    static const int BC_##name = code;

Modified: branches/experimental/regexp2000/src/interpreter-re2k.cc
==============================================================================
--- branches/experimental/regexp2000/src/interpreter-re2k.cc    (original)
+++ branches/experimental/regexp2000/src/interpreter-re2k.cc    Fri Nov 21  
05:08:23 2008
@@ -280,12 +280,26 @@
          pc = code_base + Load32(new_pc);
          break;
        }
-      BYTECODE(CHECK_BACKREF)
-        UNREACHABLE();
-        break;
-      BYTECODE(CHECK_NOT_BACKREF)
-        UNREACHABLE();
+      BYTECODE(CHECK_NOT_BACK_REF) {
+        int from = registers[pc[1]];
+        int len = registers[pc[1] + 1] - from;
+        if (current + len > subject.length()) {
+          pc = code_base + Load32(pc + 2);
+          break;
+        } else {
+          int i;
+          for (i = 0; i < len; i++) {
+            if (subject[from + i] != subject[current + i]) {
+              pc = code_base + Load32(pc + 2);
+              break;
+            }
+          }
+          if (i < len) break;
+          current += len;
+        }
+        pc += BC_CHECK_NOT_BACK_REF_LENGTH;
          break;
+      }
        default:
          UNREACHABLE();
          break;

Modified: branches/experimental/regexp2000/src/jsregexp.cc
==============================================================================
--- branches/experimental/regexp2000/src/jsregexp.cc    (original)
+++ branches/experimental/regexp2000/src/jsregexp.cc    Fri Nov 21 05:08:23  
2008
@@ -1282,6 +1282,19 @@
  }


+bool BackReferenceNode::Emit(RegExpCompiler* compiler) {
+  RegExpMacroAssembler* macro = compiler->macro_assembler();
+  Bind(macro);
+  // Check whether the registers are uninitialized and always
+  // succeed if they are.
+  macro->IfRegisterLT(start_reg_, 0, on_success()->label());
+  macro->IfRegisterLT(end_reg_, 0, on_success()->label());
+  ASSERT_EQ(start_reg_ + 1, end_reg_);
+  macro->CheckNotBackReference(start_reg_, on_failure_->label());
+  return on_success()->GoTo(compiler);
+}
+
+
  // -------------------------------------------------------------------
  // Dot/dotty output

@@ -1445,7 +1458,7 @@
  }


-void DotPrinter::VisitBackreference(BackreferenceNode* that) {
+void DotPrinter::VisitBackReference(BackReferenceNode* that) {
    stream()->Add("  n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
                  that,
                  that->start_register(),
@@ -1678,10 +1691,10 @@
  }


-RegExpNode* RegExpBackreference::ToNode(RegExpCompiler* compiler,
+RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
                                          RegExpNode* on_success,
                                          RegExpNode* on_failure) {
-  return new BackreferenceNode(RegExpCapture::StartRegister(index()),
+  return new BackReferenceNode(RegExpCapture::StartRegister(index()),
                                 RegExpCapture::EndRegister(index()),
                                 on_success,
                                 on_failure);
@@ -2020,7 +2033,7 @@
  }


-RegExpNode* BackreferenceNode::PropagateInterest(NodeInfo* info) {
+RegExpNode* BackReferenceNode::PropagateInterest(NodeInfo* info) {
    return PropagateToEndpoint(this, info);
  }

@@ -2230,7 +2243,7 @@
  }


-void Analysis::VisitBackreference(BackreferenceNode* that) {
+void Analysis::VisitBackReference(BackReferenceNode* that) {
    EnsureAnalyzed(that->on_success());
    EnsureAnalyzed(that->on_failure());
  }
@@ -2287,8 +2300,10 @@
  }


-void DispatchTableConstructor::VisitBackreference(BackreferenceNode* that)  
{
-  // TODO(plesner): What should this do?
+void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that)  
{
+  // TODO(160): Find the node that we refer back to and propagate its start
+  // set back to here.  For now we just accept anything.
+  AddRange(CharacterRange::Everything());
  }



Modified: branches/experimental/regexp2000/src/jsregexp.h
==============================================================================
--- branches/experimental/regexp2000/src/jsregexp.h     (original)
+++ branches/experimental/regexp2000/src/jsregexp.h     Fri Nov 21 05:08:23 2008
@@ -402,7 +402,7 @@
    VISIT(End)                                                         \
    VISIT(Action)                                                      \
    VISIT(Choice)                                                      \
-  VISIT(Backreference)                                               \
+  VISIT(BackReference)                                               \
    VISIT(Text)


@@ -415,7 +415,7 @@
    VISIT(Quantifier)                                                  \
    VISIT(Capture)                                                     \
    VISIT(Lookahead)                                                   \
-  VISIT(Backreference)                                               \
+  VISIT(BackReference)                                               \
    VISIT(Empty)                                                       \
    VISIT(Text)

@@ -606,9 +606,9 @@
  };


-class BackreferenceNode: public SeqRegExpNode {
+class BackReferenceNode: public SeqRegExpNode {
   public:
-  BackreferenceNode(int start_reg,
+  BackReferenceNode(int start_reg,
                      int end_reg,
                      RegExpNode* on_success,
                      RegExpNode* on_failure)
@@ -620,7 +620,7 @@
    RegExpNode* on_failure() { return on_failure_; }
    int start_register() { return start_reg_; }
    int end_register() { return end_reg_; }
-  virtual bool Emit(RegExpCompiler* compiler) { return false; }
+  virtual bool Emit(RegExpCompiler* compiler);
    virtual RegExpNode* PropagateInterest(NodeInfo* info);
   private:
    RegExpNode* on_failure_;

Modified: branches/experimental/regexp2000/src/parser.cc
==============================================================================
--- branches/experimental/regexp2000/src/parser.cc      (original)
+++ branches/experimental/regexp2000/src/parser.cc      Fri Nov 21 05:08:23 2008
@@ -515,11 +515,11 @@
    uc32 ParseControlLetterEscape(bool* ok);
    uc32 ParseOctalLiteral();

-  // Tries to parse the input as a backreference.  If successful it
+  // Tries to parse the input as a back reference.  If successful it
    // stores the result in the output parameter and returns true.  If
    // it fails it will push back the characters read so the same characters
    // can be reparsed.
-  bool ParseBackreferenceIndex(int* index_out);
+  bool ParseBackReferenceIndex(int* index_out);

    CharacterRange ParseClassAtom(bool* is_char_class,
                                  ZoneList<CharacterRange>* ranges,
@@ -541,6 +541,8 @@
    bool has_next() { return next_pos_ < in()->length(); }
    uc32 Next();
    FlatStringReader* in() { return in_; }
+  void ScanForCaptures();
+  bool CaptureAvailable(int index);
    uc32 current_;
    bool has_more_;
    bool multiline_mode_;
@@ -548,7 +550,9 @@
    FlatStringReader* in_;
    Handle<String>* error_;
    bool has_character_escapes_;
+  bool is_scanned_for_captures_;
    ZoneList<RegExpCapture*>* captures_;
+  int capture_count_;
  };


@@ -3502,7 +3506,9 @@
      in_(in),
      error_(error),
      has_character_escapes_(false),
-    captures_(NULL) {
+    is_scanned_for_captures_(false),
+    captures_(NULL),
+    capture_count_(0) {
    Advance(1);
  }

@@ -3564,6 +3570,14 @@
  }


+bool RegExpParser::CaptureAvailable(int index) {
+  if (captures_ == NULL) return false;
+  if (index >= captures_->length()) return false;
+  RegExpCapture* capture = captures_->at(index);
+  return capture != NULL && capture->available() == CAPTURE_AVAILABLE;
+}
+
+
  // Disjunction ::
  //   Alternative
  //   Alternative | Disjunction
@@ -3667,14 +3681,14 @@
        case '1': case '2': case '3': case '4': case '5': case '6':
        case '7': case '8': case '9': {
          int index = 0;
-        if (ParseBackreferenceIndex(&index)) {
-          RegExpCapture* capture = captures_->at(index - 1);
-          if (capture == NULL || capture->available() !=  
CAPTURE_AVAILABLE) {
+        if (ParseBackReferenceIndex(&index)) {
+          if (!CaptureAvailable(index - 1)) {
              // Prepare to ignore a following quantifier
              builder.AddEmpty();
              goto has_read_atom;
            }
-          RegExpTree* atom = new RegExpBackreference(capture);
+          RegExpCapture* capture = captures_->at(index - 1);
+          RegExpTree* atom = new RegExpBackReference(capture);
            builder.AddAtom(atom);
            goto has_read_atom;  // Avoid setting has_character_escapes_.
          }
@@ -3844,7 +3858,42 @@
  #endif


-bool RegExpParser::ParseBackreferenceIndex(int* index_out) {
+// In order to know whether an escape is a backreference or not we have to  
scan
+// the entire regexp and find the number of capturing parentheses.   
However we
+// don't want to scan the regexp twice unless it is necessary.  This  
mini-parser
+// is called when needed.  It can see the difference between capturing and
+// noncapturing parentheses and can skip character classes and  
backslash-escaped
+// characters.
+void RegExpParser::ScanForCaptures() {
+  int n;
+  while ((n = current()) != kEndMarker) {
+    Advance();
+    switch (n) {
+      case '\\':
+        Advance();
+        break;
+      case '[': {
+        int c;
+        while ((c = current()) != kEndMarker) {
+          Advance();
+          if (c == '\\') {
+            Advance();
+          } else {
+            if (c == ']') break;
+          }
+        }
+        break;
+      }
+      case '(':
+        if (current() != '?') capture_count_++;
+        break;
+    }
+  }
+  is_scanned_for_captures_ = true;
+}
+
+
+bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
    ASSERT_EQ('\\', current());
    ASSERT('1' <= Next() && Next() <= '9');
    // Try to parse a decimal literal that is no greater than the number
@@ -3852,18 +3901,21 @@
    // This is a not according the the ECMAScript specification. According to
    // that, one must accept values up to the total number of left capturing
    // parentheses in the entire input, even if they are meaningless.
-  if (captures_ == NULL)
-    return false;
+  if (!is_scanned_for_captures_) {
+    int saved_position = position();
+    ScanForCaptures();
+    Reset(saved_position);
+  }
+  if (capture_count_ == 0) return false;
    int start = position();
    int value = Next() - '0';
-  if (value > captures_->length())
-    return false;
+  if (value > capture_count_) return false;
    Advance(2);
    while (true) {
      uc32 c = current();
      if (IsDecimalDigit(c)) {
        value = 10 * value + (c - '0');
-      if (value > captures_->length()) {
+      if (value > capture_count_) {
          Reset(start);
          return false;
        }
@@ -4068,6 +4120,7 @@
        captures_ = new ZoneList<RegExpCapture*>(2);
      }
      captures_->Add(NULL);
+    if (!is_scanned_for_captures_) capture_count_++;
    }
    int capture_index = captures_started();
    RegExpTree* body = ParseDisjunction(CHECK_OK);

Modified:  
branches/experimental/regexp2000/src/regexp-macro-assembler-re2k.cc
==============================================================================
--- branches/experimental/regexp2000/src/regexp-macro-assembler-re2k.cc  
(original)
+++ branches/experimental/regexp2000/src/regexp-macro-assembler-re2k.cc Fri  
Nov 21 05:08:23 2008
@@ -174,6 +174,12 @@
  }


+void RegExpMacroAssemblerRe2k::CheckNotBackReference(int start_reg,
+                                                     Label* on_not_equal) {
+  assembler_->CheckNotBackReference(start_reg, on_not_equal);
+}
+
+
  void RegExpMacroAssemblerRe2k::CheckBitmap(uc16 start,
                                             Label* bitmap,
                                             Label* on_zero) {

Modified: branches/experimental/regexp2000/src/regexp-macro-assembler-re2k.h
==============================================================================
--- branches/experimental/regexp2000/src/regexp-macro-assembler-re2k.h   
(original)
+++ branches/experimental/regexp2000/src/regexp-macro-assembler-re2k.h  Fri  
Nov 21 05:08:23 2008
@@ -60,6 +60,7 @@
    virtual void CheckCharacterGT(uc16 limit, Label* on_greater);
    virtual void CheckCharacter(uc16 c, Label* on_equal);
    virtual void CheckNotCharacter(uc16 c, Label* on_not_equal);
+  virtual void CheckNotBackReference(int start_reg, Label* on_no_match);
    virtual void CheckCharacters(Vector<const uc16> str,
                                 int cp_offset,
                                 Label* on_failure);

Modified: branches/experimental/regexp2000/src/regexp-macro-assembler.h
==============================================================================
--- branches/experimental/regexp2000/src/regexp-macro-assembler.h       
(original)
+++ branches/experimental/regexp2000/src/regexp-macro-assembler.h       Fri Nov 
 
21 05:08:23 2008
@@ -64,6 +64,7 @@
    virtual void CheckCharacterGT(uc16 limit, Label* on_greater) = 0;
    virtual void CheckCharacter(uc16 c, Label* on_equal) = 0;
    virtual void CheckNotCharacter(uc16 c, Label* on_not_equal) = 0;
+  virtual void CheckNotBackReference(int start_reg, Label* on_no_match) =  
0;
    // Check the current character for a match with a literal string.  If we
    // fail to match then goto the on_failure label.  End of input always
    // matches.  If the label is NULL then we should pop a backtrack address  
off

Modified: branches/experimental/regexp2000/test/cctest/test-regexp.cc
==============================================================================
--- branches/experimental/regexp2000/test/cctest/test-regexp.cc (original)
+++ branches/experimental/regexp2000/test/cctest/test-regexp.cc Fri Nov 21  
05:08:23 2008
@@ -160,9 +160,9 @@
    CHECK_PARSE_EQ("(a)\\1", "(: (^ 'a') (<- 1))");
    CHECK_PARSE_EQ("(a\\1)", "(^ 'a')");
    CHECK_PARSE_EQ("(\\1a)", "(^ 'a')");
-  CHECK_PARSE_EQ("\\1(a)", "(: '\\x01' (^ 'a'))");
+  CHECK_PARSE_EQ("\\1(a)", "(^ 'a')");
    CHECK_PARSE_EQ("(?!(a))\\1", "(-> - (^ 'a'))");
-  CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(-> - (: '\\x01' (^ 'a') (<-  
1)))");
+  CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(-> - (: (^ 'a') (<- 1)))");
    CHECK_PARSE_EQ("[\\0]", "[\\x00]");
    CHECK_PARSE_EQ("[\\11]", "[\\x09]");
    CHECK_PARSE_EQ("[\\11a]", "[\\x09 a]");
@@ -196,7 +196,7 @@
    CHECK_ESCAPES("(a)", false);
    CHECK_ESCAPES("(a)\\1", false);
    CHECK_ESCAPES("(\\1a)", false);
-  CHECK_ESCAPES("\\1(a)", true);
+  CHECK_ESCAPES("\\1(a)", false);
    CHECK_ESCAPES("a\\s", false);
    CHECK_ESCAPES("a\\S", false);
    CHECK_ESCAPES("a\\d", false);
@@ -923,5 +923,5 @@

  TEST(Graph) {
    V8::Initialize(NULL);
-  Execute("(a|^b|c)", "", false);
+  Execute("(x)?\\1y", "", true);
  }

--~--~---------~--~----~------------~-------~--~----~
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
-~----------~----~----~----~------~----~------~--~---

Reply via email to