Revision: 3243
Author: [email protected]
Date: Mon Nov  9 02:01:23 2009
Log: * Fix regexp benchmark regression where we were doing work to
make standard regexps like \s and . case independent.
* Make use of the fact that the subject string is ASCII only
when making character classes case independent.
* Avoid spending time making large ideogram or punctuation
ranges case independent when there is no case mapping anyway.
Review URL: http://codereview.chromium.org/378024
http://code.google.com/p/v8/source/detail?r=3243

Modified:
  /branches/bleeding_edge/src/jsregexp.cc
  /branches/bleeding_edge/src/jsregexp.h
  /branches/bleeding_edge/test/cctest/test-regexp.cc
  /branches/bleeding_edge/test/mjsunit/cyrillic.js

=======================================
--- /branches/bleeding_edge/src/jsregexp.cc     Fri Nov  6 03:15:20 2009
+++ /branches/bleeding_edge/src/jsregexp.cc     Mon Nov  9 02:01:23 2009
@@ -2432,16 +2432,19 @@
  }


-void TextNode::MakeCaseIndependent() {
+void TextNode::MakeCaseIndependent(bool is_ascii) {
    int element_count = elms_->length();
    for (int i = 0; i < element_count; i++) {
      TextElement elm = elms_->at(i);
      if (elm.type == TextElement::CHAR_CLASS) {
        RegExpCharacterClass* cc = elm.data.u_char_class;
+      // None of the standard character classses is different in the case
+      // independent case and it slows us down if we don't know that.
+      if (cc->is_standard()) continue;
        ZoneList<CharacterRange>* ranges = cc->ranges();
        int range_count = ranges->length();
        for (int j = 0; j < range_count; j++) {
-        ranges->at(j).AddCaseEquivalents(ranges);
+        ranges->at(j).AddCaseEquivalents(ranges, is_ascii);
        }
      }
    }
@@ -3912,19 +3915,31 @@
  }


-void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
+static void AddUncanonicals(ZoneList<CharacterRange>* ranges,
+                            int bottom,
+                            int top);
+
+
+void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
+                                        bool is_ascii) {
+  uc16 bottom = from();
+  uc16 top = to();
+  if (is_ascii) {
+    if (bottom > String::kMaxAsciiCharCode) return;
+    if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode;
+  }
    unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
-  if (IsSingleton()) {
+  if (top == bottom) {
      // If this is a singleton we just expand the one character.
-    int length = uncanonicalize.get(from(), '\0', chars);
+    int length = uncanonicalize.get(bottom, '\0', chars);
      for (int i = 0; i < length; i++) {
        uc32 chr = chars[i];
-      if (chr != from()) {
+      if (chr != bottom) {
          ranges->Add(CharacterRange::Singleton(chars[i]));
        }
      }
-  } else if (from() <= kRangeCanonicalizeMax &&
-             to() <= kRangeCanonicalizeMax) {
+  } else if (bottom <= kRangeCanonicalizeMax &&
+             top <= kRangeCanonicalizeMax) {
      // If this is a range we expand the characters block by block,
      // expanding contiguous subranges (blocks) one at a time.
      // The approach is as follows.  For a given start character we
@@ -3943,14 +3958,14 @@
      // completely contained in a block we do this for all the blocks
      // covered by the range.
      unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
-    // First, look up the block that contains the 'from' character.
-    int length = canonrange.get(from(), '\0', range);
+    // First, look up the block that contains the 'bottom' character.
+    int length = canonrange.get(bottom, '\0', range);
      if (length == 0) {
-      range[0] = from();
+      range[0] = bottom;
      } else {
        ASSERT_EQ(1, length);
      }
-    int pos = from();
+    int pos = bottom;
      // The start of the current block.  Note that except for the first
      // iteration 'start' is always equal to 'pos'.
      int start;
@@ -3964,7 +3979,7 @@
      // Then we add the ranges one at a time, incrementing the current
      // position to be after the last block each time.  The position
      // always points to the start of a block.
-    while (pos < to()) {
+    while (pos < top) {
        length = canonrange.get(start, '\0', range);
        if (length == 0) {
          range[0] = start;
@@ -3975,57 +3990,122 @@
        // The start point of a block contains the distance to the end
        // of the range.
        int block_end = start + (range[0] & kPayloadMask) - 1;
-      int end = (block_end > to()) ? to() : block_end;
+      int end = (block_end > top) ? top : block_end;
        length = uncanonicalize.get(start, '\0', range);
        for (int i = 0; i < length; i++) {
          uc32 c = range[i];
          uc16 range_from = c + (pos - start);
          uc16 range_to = c + (end - start);
-        if (!(from() <= range_from && range_to <= to())) {
+        if (!(bottom <= range_from && range_to <= top)) {
            ranges->Add(CharacterRange(range_from, range_to));
          }
        }
        start = pos = block_end + 1;
      }
-  } else if (from() > 0 || to() < String::kMaxUC16CharCode) {
+  } else {
      // Unibrow ranges don't work for high characters due to the "2^11 bug".
-    // Therefore we do something dumber for these ranges.  We don't bother
-    // if the range is 0-max (as encountered at the start of an unanchored
-    // regexp).
-    ZoneList<unibrow::uchar> *characters = new  
ZoneList<unibrow::uchar>(100);
-    int bottom = from();
-    int top = to();
-    for (int i = bottom; i <= top; i++) {
-      int length = uncanonicalize.get(i, '\0', chars);
-      for (int j = 0; j < length; j++) {
-        uc32 chr = chars[j];
-        if (chr != i && chr < bottom || chr > top) {
-          characters->Add(chr);
+    // Therefore we do something dumber for these ranges.
+    AddUncanonicals(ranges, bottom, top);
+  }
+}
+
+
+static void AddUncanonicals(ZoneList<CharacterRange>* ranges,
+                            int bottom,
+                            int top) {
+  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
+  // Zones with no case mappings.  There is a DEBUG-mode loop to assert  
that
+  // this table is correct.
+  // 0x0600 - 0x0fff
+  // 0x1100 - 0x1cff
+  // 0x2000 - 0x20ff
+  // 0x2200 - 0x23ff
+  // 0x2500 - 0x2bff
+  // 0x2e00 - 0xa5ff
+  // 0xa800 - 0xfaff
+  // 0xfc00 - 0xfeff
+  const int boundary_count = 18;
+  // The ASCII boundary and the kRangeCanonicalizeMax boundary are also in  
this
+  // array.  This is to split up big ranges and not because they actually  
denote
+  // a case-mapping-free-zone.
+  ASSERT(CharacterRange::kRangeCanonicalizeMax < 0x600);
+  const int kFirstRealCaselessZoneIndex = 2;
+  int boundaries[] = {0x80, CharacterRange::kRangeCanonicalizeMax,
+      0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400,  
0x2500,
+      0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00};
+
+  // Special ASCII rule from spec can save us some work here.
+  if (bottom == 0x80 && top == 0xffff) return;
+
+  // We have optimized support for this range.
+  if (top <= CharacterRange::kRangeCanonicalizeMax) {
+    CharacterRange range(bottom, top);
+    range.AddCaseEquivalents(ranges, false);
+    return;
+  }
+
+  // Split up very large ranges.  This helps remove ranges where there are  
no
+  // case mappings.
+  for (int i = 0; i < boundary_count; i++) {
+    if (bottom < boundaries[i] && top >= boundaries[i]) {
+      AddUncanonicals(ranges, bottom, boundaries[i] - 1);
+      AddUncanonicals(ranges, boundaries[i], top);
+      return;
+    }
+  }
+
+  // If we are completely in a zone with no case mappings then we are done.
+  // We start at 2 so as not to except the ASCII range from mappings.
+  for (int i = kFirstRealCaselessZoneIndex; i < boundary_count; i += 2) {
+    if (bottom >= boundaries[i] && top < boundaries[i + 1]) {
+#ifdef DEBUG
+      for (int j = bottom; j <= top; j++) {
+        unsigned current_char = j;
+        int length = uncanonicalize.get(current_char, '\0', chars);
+        for (int k = 0; k < length; k++) {
+          ASSERT(chars[k] == current_char);
          }
        }
-    }
-    if (characters->length() > 0) {
-      int new_from = characters->at(0);
-      int new_to = new_from;
-      for (int i = 1; i < characters->length(); i++) {
-        int chr = characters->at(i);
-        if (chr == new_to + 1) {
-          new_to++;
+#endif
+      return;
+    }
+  }
+
+  // Step through the range finding equivalent characters.
+  ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100);
+  for (int i = bottom; i <= top; i++) {
+    int length = uncanonicalize.get(i, '\0', chars);
+    for (int j = 0; j < length; j++) {
+      uc32 chr = chars[j];
+      if (chr != i && chr < bottom || chr > top) {
+        characters->Add(chr);
+      }
+    }
+  }
+
+  // Step through the equivalent characters finding simple ranges and
+  // adding ranges to the character class.
+  if (characters->length() > 0) {
+    int new_from = characters->at(0);
+    int new_to = new_from;
+    for (int i = 1; i < characters->length(); i++) {
+      int chr = characters->at(i);
+      if (chr == new_to + 1) {
+        new_to++;
+      } else {
+        if (new_to == new_from) {
+          ranges->Add(CharacterRange::Singleton(new_from));
          } else {
-          if (new_to == new_from) {
-            ranges->Add(CharacterRange::Singleton(new_from));
-          } else {
-            ranges->Add(CharacterRange(new_from, new_to));
-          }
-          new_from = new_to = chr;
-        }
-      }
-      if (new_to == new_from) {
-        ranges->Add(CharacterRange::Singleton(new_from));
-      } else {
-        ranges->Add(CharacterRange(new_from, new_to));
+          ranges->Add(CharacterRange(new_from, new_to));
+        }
+        new_from = new_to = chr;
        }
      }
+    if (new_to == new_from) {
+      ranges->Add(CharacterRange::Singleton(new_from));
+    } else {
+      ranges->Add(CharacterRange(new_from, new_to));
+    }
    }
  }

@@ -4271,7 +4351,7 @@

  void Analysis::VisitText(TextNode* that) {
    if (ignore_case_) {
-    that->MakeCaseIndependent();
+    that->MakeCaseIndependent(is_ascii_);
    }
    EnsureAnalyzed(that->on_success());
    if (!has_failed()) {
@@ -4489,7 +4569,7 @@
      }
    }
    data->node = node;
-  Analysis analysis(ignore_case);
+  Analysis analysis(ignore_case, is_ascii);
    analysis.EnsureAnalyzed(node);
    if (analysis.has_failed()) {
      const char* error_message = analysis.error_message();
=======================================
--- /branches/bleeding_edge/src/jsregexp.h      Thu Oct 15 08:01:36 2009
+++ /branches/bleeding_edge/src/jsregexp.h      Mon Nov  9 02:01:23 2009
@@ -200,7 +200,7 @@
    bool is_valid() { return from_ <= to_; }
    bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; }
    bool IsSingleton() { return (from_ == to_); }
-  void AddCaseEquivalents(ZoneList<CharacterRange>* ranges);
+  void AddCaseEquivalents(ZoneList<CharacterRange>* ranges, bool is_ascii);
    static void Split(ZoneList<CharacterRange>* base,
                      Vector<const uc16> overlay,
                      ZoneList<CharacterRange>** included,
@@ -703,7 +703,7 @@
                                      int characters_filled_in,
                                      bool not_at_start);
    ZoneList<TextElement>* elements() { return elms_; }
-  void MakeCaseIndependent();
+  void MakeCaseIndependent(bool is_ascii);
    virtual int GreedyLoopTextLength();
    virtual TextNode* Clone() {
      TextNode* result = new TextNode(*this);
@@ -1212,8 +1212,10 @@
  //   +-------+        +------------+
  class Analysis: public NodeVisitor {
   public:
-  explicit Analysis(bool ignore_case)
-      : ignore_case_(ignore_case), error_message_(NULL) { }
+  Analysis(bool ignore_case, bool is_ascii)
+      : ignore_case_(ignore_case),
+        is_ascii_(is_ascii),
+        error_message_(NULL) { }
    void EnsureAnalyzed(RegExpNode* node);

  #define DECLARE_VISIT(Type)                                          \
@@ -1232,6 +1234,7 @@
    }
   private:
    bool ignore_case_;
+  bool is_ascii_;
    const char* error_message_;

    DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
=======================================
--- /branches/bleeding_edge/test/cctest/test-regexp.cc  Mon Aug 31 05:40:37  
2009
+++ /branches/bleeding_edge/test/cctest/test-regexp.cc  Mon Nov  9 02:01:23  
2009
@@ -1466,7 +1466,7 @@
    ZoneScope zone_scope(DELETE_ON_EXIT);
    int count = expected.length();
    ZoneList<CharacterRange>* list = new ZoneList<CharacterRange>(count);
-  input.AddCaseEquivalents(list);
+  input.AddCaseEquivalents(list, false);
    CHECK_EQ(count, list->length());
    for (int i = 0; i < list->length(); i++) {
      CHECK_EQ(expected[i].from(), list->at(i).from());
=======================================
--- /branches/bleeding_edge/test/mjsunit/cyrillic.js    Fri Nov  6 03:15:20  
2009
+++ /branches/bleeding_edge/test/mjsunit/cyrillic.js    Mon Nov  9 02:01:23  
2009
@@ -60,6 +60,7 @@
    return new RegExp("[" + from + "-" + to + "]", flags);
  }

+// Test Cyrillic and Greek separately.
  for (var lang = 0; lang < 2; lang++) {
    var chars = (lang == 0) ? cyrillic : greek;

@@ -99,13 +100,13 @@
    }
  }

+// Test range that covers both greek and cyrillic characters.
  for (key in greek) {
    assertTrue(Range(greek.FIRST, cyrillic.last).test(greek[key]), 17 + key);
    if (cyrillic[key]) {
      assertTrue(Range(greek.FIRST, cyrillic.last).test(cyrillic[key]), 18 +  
key);
    }
  }
-

  for (var i = 0; i < 2; i++) {
    var ignore_case = (i == 0);
@@ -118,6 +119,8 @@
    assertTrue(Range(greek.first, cyrillic.LAST,  
flag).test(cyrillic.MIDDLE), 23);
    assertTrue(Range(greek.first, cyrillic.LAST, flag).test(cyrillic.LAST),  
24);

+  // A range that covers the lower case greek letters and the upper case  
cyrillic
+  // letters.
    assertEquals(ignore_case, Range(greek.first, cyrillic.LAST,  
flag).test(greek.FIRST), 25);
    assertEquals(ignore_case, Range(greek.first, cyrillic.LAST,  
flag).test(greek.MIDDLE), 26);
    assertEquals(ignore_case, Range(greek.first, cyrillic.LAST,  
flag).test(greek.LAST), 27);
@@ -128,6 +131,10 @@
  }


+// Sigma is special because there are two lower case versions of the same  
upper
+// case character.  JS requires that case independece means that you should
+// convert everything to upper case, so the two sigma variants are equal  
to each
+// other in a case independt comparison.
  for (var i = 0; i < 2; i++) {
    var simple = (i != 0);
    var name = simple ? "" : "[]";
@@ -166,4 +173,36 @@
    assertTrue(new RegExp(regex, "i").test(SIGMA), 56 + name);
  }

-print("ok");
+
+// Test all non-ASCII characters individually to ensure that our  
optimizations
+// didn't break anything.
+for (var i = 0x80; i <= 0xfffe; i++) {
+  var c = String.fromCharCode(i);
+  var c2 = String.fromCharCode(i + 1);
+  var re = new RegExp("[" + c + "-" + c2 + "]", "i");
+  assertTrue(re.test(c), 57);
+}
+
+for (var add_non_ascii_character_to_subject = 0;
+     add_non_ascii_character_to_subject < 2;
+     add_non_ascii_character_to_subject++) {
+  var suffix = add_non_ascii_character_to_subject ? "\ufffe" : "";
+  // A range that covers both ASCII and non-ASCII.
+  for (var i = 0; i < 2; i++) {
+    var full = (i != 0);
+    var mixed = full ? "[a-\uffff]" : "[a-" + cyrillic.LAST + "]";
+    var f = full ? "f" : "c";
+    for (var j = 0; j < 2; j++) {
+      var ignore_case = (j == 0);
+      var flag = ignore_case ? "i" : "";
+      var re = new RegExp(mixed, flag);
+      assertEquals(ignore_case || (full &&  
add_non_ascii_character_to_subject),
+                   re.test("A" + suffix),
+                   58 + flag + f);
+      assertTrue(re.test("a" + suffix), 59 + flag + f);
+      assertTrue(re.test("~" + suffix), 60 + flag + f);
+      assertTrue(re.test(cyrillic.MIDDLE), 61 + flag + f);
+      assertEquals(ignore_case || full, re.test(cyrillic.middle), 62 +  
flag + f);
+    }
+  }
+}

--~--~---------~--~----~------------~-------~--~----~
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
-~----------~----~----~----~------~----~------~--~---

Reply via email to