Author: [EMAIL PROTECTED]
Date: Fri Nov 14 06:06:29 2008
New Revision: 759

Modified:
    branches/experimental/regexp2000/src/ast.h
    branches/experimental/regexp2000/src/jsregexp.cc
    branches/experimental/regexp2000/src/parser.cc
    branches/experimental/regexp2000/src/regexp-macro-assembler-ia32.cc
    branches/experimental/regexp2000/src/regexp-macro-assembler-ia32.h
    branches/experimental/regexp2000/src/regexp-macro-assembler.cc
    branches/experimental/regexp2000/src/regexp-macro-assembler.h
    branches/experimental/regexp2000/test/cctest/test-regexp.cc

Log:
Better handling of captures.
Ignores back-references to captures that can never be captured at that  
point.
RegExpBackReference keeps link to the RegExpCapture.
Fix for bad formatting.


Modified: branches/experimental/regexp2000/src/ast.h
==============================================================================
--- branches/experimental/regexp2000/src/ast.h  (original)
+++ branches/experimental/regexp2000/src/ast.h  Fri Nov 14 06:06:29 2008
@@ -1341,10 +1341,13 @@
  };


+enum CaptureAvailability {
+    CAPTURE_AVAILABLE, CAPTURE_UNREACHABLE,  
CAPTURE_PERMANENTLY_UNREACHABLE };
+
  class RegExpCapture: public RegExpTree {
   public:
    explicit RegExpCapture(RegExpTree* body, int index)
-    : body_(body), index_(index) { }
+    : body_(body), index_(index), available_(CAPTURE_AVAILABLE) { }
    virtual void* Accept(RegExpVisitor* visitor, void* data);
    virtual RegExpNode* ToNode(RegExpCompiler* compiler,
                               RegExpNode* on_success,
@@ -1357,11 +1360,16 @@
    virtual RegExpCapture* AsCapture();
    RegExpTree* body() { return body_; }
    int index() { return index_; }
+  inline CaptureAvailability available() { return available_; }
+  inline void set_available(CaptureAvailability availability) {
+    available_ = availability;
+  }
    static int StartRegister(int index) { return index * 2; }
    static int EndRegister(int index) { return index * 2 + 1; }
   private:
    RegExpTree* body_;
    int index_;
+  CaptureAvailability available_;
  };


@@ -1385,15 +1393,17 @@

  class RegExpBackreference: public RegExpTree {
   public:
-  explicit RegExpBackreference(int index) : index_(index) { }
+  explicit RegExpBackreference(RegExpCapture* capture)
+    : capture_(capture) { }
    virtual void* Accept(RegExpVisitor* visitor, void* data);
    virtual RegExpNode* ToNode(RegExpCompiler* compiler,
                               RegExpNode* on_success,
                               RegExpNode* on_failure);
    virtual RegExpBackreference* AsBackreference();
-  int index() { return index_; }
+  int index() { return capture_->index(); }
+  RegExpCapture* capture() { return capture_; }
   private:
-  int index_;
+  RegExpCapture* capture_;
  };



Modified: branches/experimental/regexp2000/src/jsregexp.cc
==============================================================================
--- branches/experimental/regexp2000/src/jsregexp.cc    (original)
+++ branches/experimental/regexp2000/src/jsregexp.cc    Fri Nov 14 06:06:29  
2008
@@ -1876,15 +1876,16 @@
    byte codes[10240];
    Re2kAssembler assembler(Vector<byte>(codes, 1024));
    RegExpMacroAssemblerRe2k macro_assembler(&assembler);
-  return compiler.Assemble(&macro_assembler, node, input->capture_count,  
ignore_case);
+  return compiler.Assemble(&macro_assembler,
+                           node,
+                           input->capture_count,
+                           ignore_case);
  }

  RegExpMacroAssembler::RegExpMacroAssembler() {
-
  }

  RegExpMacroAssembler::~RegExpMacroAssembler() {
-
  }



Modified: branches/experimental/regexp2000/src/parser.cc
==============================================================================
--- branches/experimental/regexp2000/src/parser.cc      (original)
+++ branches/experimental/regexp2000/src/parser.cc      Fri Nov 14 06:06:29 2008
@@ -291,6 +291,9 @@
   public:
    RegExpBuilder();
    void AddCharacter(uc16 character);
+  // "Adds" an empty expression. Does nothing except consume a
+  // following quantifier
+  void AddEmpty();
    void AddAtom(RegExpTree* tree);
    void AddAssertion(RegExpTree* tree);
    void NewAlternative();  // '|'
@@ -299,6 +302,7 @@
   private:
    void FlushCharacters();
    bool FlushTerms();
+  bool pending_empty_;
    ZoneList<uc16>* characters_;
    BufferedZoneList<RegExpTree, 2> terms_;
    BufferedZoneList<RegExpTree, 2> alternatives_;
@@ -311,7 +315,8 @@
  };


-RegExpBuilder::RegExpBuilder() : characters_(NULL), terms_(),  
alternatives_()
+RegExpBuilder::RegExpBuilder()
+  : pending_empty_(false), characters_(NULL), terms_(), alternatives_()
  #ifdef DEBUG
    , last_added_(ADD_NONE)
  #endif
@@ -319,6 +324,7 @@


  void RegExpBuilder::FlushCharacters() {
+  pending_empty_ = false;
    if (characters_ != NULL) {
      RegExpTree* atom = new RegExpAtom(characters_->ToConstVector());
      characters_ = NULL;
@@ -329,6 +335,7 @@


  void RegExpBuilder::AddCharacter(uc16 c) {
+  pending_empty_ = false;
    if (characters_ == NULL) {
      characters_ = new ZoneList<uc16>(4);
    }
@@ -337,6 +344,11 @@
  }


+void RegExpBuilder::AddEmpty() {
+  pending_empty_ = true;
+}
+
+
  void RegExpBuilder::AddAtom(RegExpTree* atom) {
    FlushCharacters();
    terms_.Add(atom);
@@ -391,6 +403,10 @@


  void RegExpBuilder::AddQuantifierToAtom(int min, int max, bool is_greedy) {
+  if (pending_empty_) {
+    pending_empty_ = false;
+    return;
+  }
    RegExpTree* atom;
    if (characters_ != NULL) {
      ASSERT(last_added_ == ADD_CHAR);
@@ -465,7 +481,7 @@

    bool HasCharacterEscapes();

-  int captures_started() { return captures_started_; }
+  int captures_started() { return captures_ == NULL ? 0 :  
captures_->length(); }

    static const uc32 kEndMarker = unibrow::Utf8::kBadChar;
   private:
@@ -479,13 +495,13 @@
    bool has_more_;
    bool has_next_;
    bool multiline_mode_;
-  int captures_started_;
    unibrow::CharacterStream* in_;
    Handle<String>* error_;
    static const int kMaxPushback = 5;
    int pushback_count_;
    uc32 pushback_buffer_[kMaxPushback];
    bool has_character_escapes_;
+  ZoneList<RegExpCapture*>* captures_;
  };


@@ -3437,11 +3453,11 @@
      has_more_(true),
      has_next_(true),
      multiline_mode_(multiline_mode),
-    captures_started_(0),
      in_(in),
      error_(error),
      pushback_count_(0),
-    has_character_escapes_(false) {
+    has_character_escapes_(false),
+    captures_(NULL) {
    Advance(2);
  }

@@ -3523,15 +3539,25 @@
  //   Atom Quantifier
  RegExpTree* RegExpParser::ParseDisjunction(bool* ok) {
    RegExpBuilder builder;
+  int capture_start_index = captures_started();
    while (true) {
      switch (current()) {
      case kEndMarker:
      case ')':
        return builder.ToRegExp();
-    case '|':
+    case '|': {
        Advance();
        builder.NewAlternative();
+      int capture_new_alt_start_index = captures_started();
+      for (int i = capture_start_index; i < capture_new_alt_start_index;  
i++) {
+        RegExpCapture* capture = captures_->at(i);
+        if (capture->available() == CAPTURE_AVAILABLE) {
+          capture->set_available(CAPTURE_UNREACHABLE);
+        }
+      }
+      capture_start_index = capture_new_alt_start_index;
        continue;
+    }
      case '*':
      case '+':
      case '?':
@@ -3606,7 +3632,13 @@
        case '7': case '8': case '9': {
          int index = 0;
          if (ParseBackreferenceIndex(&index)) {
-          RegExpTree* atom = new RegExpBackreference(index);
+          RegExpCapture* capture = captures_->at(index - 1);
+          if (capture == NULL || capture->available() !=  
CAPTURE_AVAILABLE) {
+            // Prepare to ignore a following quantifier
+            builder.AddEmpty();
+            goto has_read_atom;
+          }
+          RegExpTree* atom = new RegExpBackreference(capture);
            builder.AddAtom(atom);
            goto has_read_atom;  // Avoid setting has_character_escapes_.
          }
@@ -3775,10 +3807,10 @@
    // This is a not according the the ECMAScript specification. According to
    // that, one must accept values up to the total number of left capturing
    // parentheses in the entire input, even if they are meaningless.
-  if (captures_started_ == 0)
+  if (captures_ == NULL)
      return false;
    int value = next() - '0';
-  if (value > captures_started_)
+  if (value > captures_->length())
      return false;
    static const int kMaxChars = kMaxPushback - 2;
    EmbeddedVector<uc32, kMaxChars> chars_seen;
@@ -3791,7 +3823,7 @@
        value = 10 * value + (c - '0');
        // To avoid reading past the end of the stack-allocated pushback
        // buffers we only read kMaxChars before giving up.
-      if (value > captures_started_ || char_count > kMaxChars) {
+      if (value > captures_->length() || char_count > kMaxChars) {
          // If we give up we have to push the characters we read back
          // onto the pushback buffer in the reverse order.
          for (int i = 0; i < char_count; i++) {
@@ -4005,16 +4037,42 @@
          break;
      }
    } else {
-    captures_started_++;
+    if (captures_ == NULL) {
+      captures_ = new ZoneList<RegExpCapture*>(2);
+    }
+    captures_->Add(NULL);
    }
-  int capture_index = captures_started_;
+  int capture_index = captures_started();
    RegExpTree* body = ParseDisjunction(CHECK_OK);
    if (current() != ')') {
      ReportError(CStrVector("Unterminated group"), CHECK_OK);
    }
    Advance();
+
+  int end_capture_index = captures_started();
+  if (type == '!') {
+    // Captures inside a negative lookahead are never available outside it.
+    for (int i = capture_index; i < end_capture_index; i++) {
+      RegExpCapture* capture = captures_->at(i);
+      ASSERT(capture != NULL);
+      capture->set_available(CAPTURE_PERMANENTLY_UNREACHABLE);
+    }
+  } else {
+    // Captures temporarily unavailable because they are in different
+    // alternatives are all available after the disjunction.
+    for (int i = capture_index; i < end_capture_index; i++) {
+      RegExpCapture* capture = captures_->at(i);
+      ASSERT(capture != NULL);
+      if (capture->available() == CAPTURE_UNREACHABLE) {
+        capture->set_available(CAPTURE_AVAILABLE);
+      }
+    }
+  }
+
    if (type == '(') {
-    return new RegExpCapture(body, capture_index);
+    RegExpCapture* capture = new RegExpCapture(body, capture_index);
+    captures_->at(capture_index - 1) = capture;
+    return capture;
    } else if (type == ':') {
      return body;
    } else {
@@ -4093,10 +4151,10 @@
    }
    Advance();
    if (ranges->length() == 0) {
-    return RegExpEmpty::GetInstance();
-  } else {
-    return new RegExpCharacterClass(ranges, is_negated);
+    ranges->Add(CharacterRange::Range(0, 0xffff));
+    is_negated = !is_negated;
    }
+  return new RegExpCharacterClass(ranges, is_negated);
  }



Modified:  
branches/experimental/regexp2000/src/regexp-macro-assembler-ia32.cc
==============================================================================
--- branches/experimental/regexp2000/src/regexp-macro-assembler-ia32.cc  
(original)
+++ branches/experimental/regexp2000/src/regexp-macro-assembler-ia32.cc Fri  
Nov 14 06:06:29 2008
@@ -62,10 +62,10 @@
   */

  RegExpMacroAssemblerIA32::RegExpMacroAssemblerIA32()
- : masm_(new MacroAssembler(NULL, kRegExpCodeSize)),
-   constants_(kRegExpConstantsSize),
-   num_registers_(0),
-   ignore_case(false) {}
+  : masm_(new MacroAssembler(NULL, kRegExpCodeSize)),
+    constants_(kRegExpConstantsSize),
+    num_registers_(0),
+    ignore_case(false) {}


  RegExpMacroAssemblerIA32::~RegExpMacroAssemblerIA32() {
@@ -108,11 +108,11 @@
                                             Label* on_zero) {
    ReadCurrentChar(eax);
    __ sub(eax, start);
-  __ cmp(eax, 64); // FIXME: 64 = length_of_bitmap_in_bits.
+  __ cmp(eax, 64);  // FIXME: 64 = length_of_bitmap_in_bits.
    BranchOrBacktrack(greater_equal, on_zero);
    __ mov(ebx, eax);
    __ shr(ebx, 3);
-  // TODO: Where is the bitmap stored? Pass the bitmap as argument instead.
+  // TODO(lrn): Where is the bitmap stored? Pass the bitmap as argument  
instead.
    // __ mov(ecx, position_of_bitmap);
    __ movzx_b(ebx, Operand(ecx, ebx, times_1, 0));
    __ and_(eax, (1<<3)-1);
@@ -144,8 +144,8 @@
    BranchOrBacktrack(greater_equal, on_failure);

    if (str.length() <= kMaxInlineStringTests || ignore_case()) {
-    // TODO: make proper loop if str.length is large but ignore_case is  
true;
-    for(int i = 0; i < str.length(); i++) {
+    // TODO(lrn): make loop if str.length is large but ignore_case is true;
+    for (int i = 0; i < str.length(); i++) {
        ReadChar(eax, i);
        if (ignore_case()) {
          Canonicalize(eax);
@@ -167,7 +167,7 @@
      if (sizeof(SubjectChar) == 1) {
        __ rep_cmpsb();
      } else {
-      ASSERT(sizeof(SubjectChar)==2);
+      ASSERT(sizeof(SubjectChar) == 2);
        __ rep_cmpsw();
      }
      __ mov(esi, ebx);
@@ -201,7 +201,7 @@

    __ mov(ebx, eax);
    __ shr(eax, 2);
-  __ movzx_b(eax, Operand(ecx, eax)); // FIXME: ecx holds address of map
+  __ movzx_b(eax, Operand(ecx, eax));  // FIXME: ecx holds address of map
    Label got_nybble;
    Label high_bits;
    __ and_(ebx, 0x03);
@@ -263,7 +263,7 @@
    __ cmp(eax, destinations.length() - start);
    __ j(greater_equal, &fallthrough);

-  // TODO jumptable: jump to destinations[eax]
+  // TODO(lrn) jumptable: jump to destinations[eax]
    __ bind(&fallthrough);
  }

@@ -283,7 +283,7 @@
  }


-void RegExpMacroAssemblerIA32::GoTo(Label &to) {
+void RegExpMacroAssemblerIA32::GoTo(Label* to) {
    __ jmp(to);
  }

@@ -398,7 +398,7 @@
      return;
    }
    ASSERT(sizeof(SubjectChar) == 2);
-  // TODO: Use some tables.
+  // TODO(lrn): Use some tables.
  }


@@ -427,10 +427,10 @@
  }


-template <typename T>
-void LoadConstantBufferAddress(Register reg, ArraySlice<T>& buffer) {
-  __ mov(reg, buffer.array());
-  __ add(reg, buffer.base_offset());
+void RegExpMacroAssemblerIA32::LoadConstantBufferAddress(
+    Register reg, ArraySlice<T>* buffer) {
+  __ mov(reg, buffer->array());
+  __ add(reg, buffer->base_offset());
  }

  #undef __

Modified: branches/experimental/regexp2000/src/regexp-macro-assembler-ia32.h
==============================================================================
--- branches/experimental/regexp2000/src/regexp-macro-assembler-ia32.h   
(original)
+++ branches/experimental/regexp2000/src/regexp-macro-assembler-ia32.h  Fri  
Nov 14 06:06:29 2008
@@ -133,6 +133,10 @@
    // Read a character from input at the given offset from the current
    // position.
    void ReadChar(Register destination, int offset);
+
+  template <typename T>
+  void LoadConstantBufferAddress(Register reg, ArraySlice<T>* buffer);
+
    // Read the current character into the destination register.
    void ReadCurrentChar(Register destination);

@@ -145,7 +149,6 @@
    int num_registers_;
    bool ignore_case_;
  };
-
  }}

  #endif /* REGEXP_MACRO_ASSEMBLER_IA32_H_ */

Modified: branches/experimental/regexp2000/src/regexp-macro-assembler.cc
==============================================================================
--- branches/experimental/regexp2000/src/regexp-macro-assembler.cc       
(original)
+++ branches/experimental/regexp2000/src/regexp-macro-assembler.cc      Fri Nov 
 
14 06:06:29 2008
@@ -32,9 +32,9 @@


  ByteArrayProvider::ByteArrayProvider(int initial_size)
- : byte_array_size_(initial_size),
-   current_byte_array_(NULL),
-   current_byte_array_free_offset_(initial_size) {}
+  : byte_array_size_(initial_size),
+    current_byte_array_(NULL),
+    current_byte_array_free_offset_(initial_size) {}


  template <typename T>
@@ -58,5 +58,4 @@
    current_byte_array_free_offset_ = free_offset + size;
    return ArraySlice<T>(current_byte_array_, free_offset);
  }
-
  }}

Modified: branches/experimental/regexp2000/src/regexp-macro-assembler.h
==============================================================================
--- branches/experimental/regexp2000/src/regexp-macro-assembler.h       
(original)
+++ branches/experimental/regexp2000/src/regexp-macro-assembler.h       Fri Nov 
 
14 06:06:29 2008
@@ -123,7 +123,7 @@

  template <typename T>
  struct ArraySlice {
-public:
+ public:
    ArraySlice(Handle<ByteArray> array, size_t offset)
      : array_(array), offset_(offset) {}
    Handle<ByteArray> array() { return array_; }
@@ -139,15 +139,15 @@
    T& operator[](int idx) {
      return reinterpret_cast<T*>(array_->GetDataStartAddress() +  
offset)[idx];
    }
-private:
-  const Handle<ByteArray> array_;
-  const size_t offset_;
+ private:
+  Handle<ByteArray> array_;
+  size_t offset_;
  };


  class ByteArrayProvider {
   public:
-  ByteArrayProvider(int initial_size);
+  explicit ByteArrayProvider(int initial_size);
    // Provides a place to put "size" elements of type T. The information
    // can be stored in the provided ByteArray at the "offset". The offset is
    // aligned to an "align"-boundary

Modified: branches/experimental/regexp2000/test/cctest/test-regexp.cc
==============================================================================
--- branches/experimental/regexp2000/test/cctest/test-regexp.cc (original)
+++ branches/experimental/regexp2000/test/cctest/test-regexp.cc Fri Nov 14  
06:06:29 2008
@@ -106,7 +106,8 @@
    CHECK_PARSE_EQ("foo(?!bar)baz", "(: 'foo' (-> - 'bar') 'baz')");
    CHECK_PARSE_EQ("()", "(^ %)");
    CHECK_PARSE_EQ("(?=)", "(-> + %)");
-  CHECK_PARSE_EQ("[]", "%");
+  CHECK_PARSE_EQ("[]", "^[\x00-\uffff]");
+  CHECK_PARSE_EQ("[^]", "[\x00-\uffff]");
    CHECK_PARSE_EQ("[x]", "[x]");
    CHECK_PARSE_EQ("[xyz]", "[x y z]");
    CHECK_PARSE_EQ("[a-zA-Z0-9]", "[a-z A-Z 0-9]");
@@ -155,9 +156,11 @@
                "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')"
                " (^ 'x') (^ 'x') (^ 'x') (^ 'x') '\x09')");
    CHECK_PARSE_EQ("(a)\\1", "(: (^ 'a') (<- 1))");
-  CHECK_PARSE_EQ("(a\\1)", "(^ (: 'a' (<- 1)))");
-  CHECK_PARSE_EQ("(\\1a)", "(^ (: (<- 1) 'a'))");
+  CHECK_PARSE_EQ("(a\\1)", "(^ 'a')");
+  CHECK_PARSE_EQ("(\\1a)", "(^ 'a')");
    CHECK_PARSE_EQ("\\1(a)", "(: '\x01' (^ 'a'))");
+  CHECK_PARSE_EQ("(?!(a))\\1", "(-> - (^ 'a'))");
+  CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(-> - (: '\x01' (^ 'a') (<-  
1)))");
    CHECK_PARSE_EQ("[\\0]", "[\0]");
    CHECK_PARSE_EQ("[\\11]", "[\t]");
    CHECK_PARSE_EQ("[\\11a]", "[\t a]");

--~--~---------~--~----~------------~-------~--~----~
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
-~----------~----~----~----~------~----~------~--~---

Reply via email to