Author: [email protected]
Date: Wed Feb 18 08:07:03 2009
New Revision: 1304

Modified:
    branches/bleeding_edge/src/bytecodes-irregexp.h
    branches/bleeding_edge/src/jsregexp.cc
    branches/bleeding_edge/src/jsregexp.h
    branches/bleeding_edge/src/objects.h
    branches/bleeding_edge/test/mjsunit/regexp-UC16.js

Log:
Irregexp:
* Fix UC16 character classes on ASCII subjects.
* Fix sign problem in Irregexp interpreter.
* Make passes over text nodes more readable.
Review URL: http://codereview.chromium.org/21450

Modified: branches/bleeding_edge/src/bytecodes-irregexp.h
==============================================================================
--- branches/bleeding_edge/src/bytecodes-irregexp.h     (original)
+++ branches/bleeding_edge/src/bytecodes-irregexp.h     Wed Feb 18 08:07:03 2009
@@ -33,7 +33,10 @@


  static const int BYTECODE_MASK = 0xff;
-static const unsigned int MAX_FIRST_ARG = 0xffffffu;
+// The first argument is packed in with the byte code in one word, but so  
it
+// has 24 bits, but it can be positive and negative so only use 23 bits for
+// positive values.
+static const unsigned int MAX_FIRST_ARG = 0x7fffffu;
  static const int BYTECODE_SHIFT = 8;

  #define  
BYTECODE_ITERATOR(V)                                                   \

Modified: branches/bleeding_edge/src/jsregexp.cc
==============================================================================
--- branches/bleeding_edge/src/jsregexp.cc      (original)
+++ branches/bleeding_edge/src/jsregexp.cc      Wed Feb 18 08:07:03 2009
@@ -1739,19 +1739,71 @@
  static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;


+// Returns the number of characters in the equivalence class, omitting  
those
+// that cannot occur in the source string because it is ASCII.
+static int GetCaseIndependentLetters(uc16 character,
+                                     bool ascii_subject,
+                                     unibrow::uchar* letters) {
+  int length = uncanonicalize.get(character, '\0', letters);
+  // Unibrow returns 0 or 1 for characters where case independependence is
+  // trivial.
+  if (length == 0) {
+    letters[0] = character;
+    length = 1;
+  }
+  if (!ascii_subject || character <= String::kMaxAsciiCharCode) {
+    return length;
+  }
+  // The standard requires that non-ASCII characters cannot have ASCII
+  // character codes in their equivalence class.
+  return 0;
+}
+
+
+static inline bool EmitSimpleCharacter(RegExpCompiler* compiler,
+                                       uc16 c,
+                                       Label* on_failure,
+                                       int cp_offset,
+                                       bool check,
+                                       bool preloaded) {
+  RegExpMacroAssembler* assembler = compiler->macro_assembler();
+  bool bound_checked = false;
+  if (!preloaded) {
+    assembler->LoadCurrentCharacter(
+        cp_offset,
+        on_failure,
+        check);
+    bound_checked = true;
+  }
+  assembler->CheckNotCharacter(c, on_failure);
+  return bound_checked;
+}
+
+
  // Only emits non-letters (things that don't have case).  Only used for  
case
  // independent matches.
-static inline bool EmitAtomNonLetter(
-    RegExpMacroAssembler* macro_assembler,
-    uc16 c,
-    Label* on_failure,
-    int cp_offset,
-    bool check,
-    bool preloaded) {
+static inline bool EmitAtomNonLetter(RegExpCompiler* compiler,
+                                     uc16 c,
+                                     Label* on_failure,
+                                     int cp_offset,
+                                     bool check,
+                                     bool preloaded) {
+  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
+  bool ascii = compiler->ascii();
    unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
-  int length = uncanonicalize.get(c, '\0', chars);
+  int length = GetCaseIndependentLetters(c, ascii, chars);
+  if (length < 1) {
+    // This can't match.  Must be an ASCII subject and a non-ASCII  
character.
+    // We do not need to do anything since the ASCII pass already handled  
this.
+    return false;  // Bounds not checked.
+  }
    bool checked = false;
-  if (length <= 1) {
+  // We handle the length > 1 case in a later pass.
+  if (length == 1) {
+    if (ascii && c > String::kMaxAsciiCharCodeU) {
+      // Can't match - see above.
+      return false;  // Bounds not checked.
+    }
      if (!preloaded) {
        macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
        checked = check;
@@ -1801,18 +1853,25 @@
  }


+typedef bool EmitCharacterFunction(RegExpCompiler* compiler,
+                                   uc16 c,
+                                   Label* on_failure,
+                                   int cp_offset,
+                                   bool check,
+                                   bool preloaded);
+
  // Only emits letters (things that have case).  Only used for case  
independent
  // matches.
-static inline bool EmitAtomLetter(
-    RegExpMacroAssembler* macro_assembler,
-    bool ascii,
-    uc16 c,
-    Label* on_failure,
-    int cp_offset,
-    bool check,
-    bool preloaded) {
+static inline bool EmitAtomLetter(RegExpCompiler* compiler,
+                                  uc16 c,
+                                  Label* on_failure,
+                                  int cp_offset,
+                                  bool check,
+                                  bool preloaded) {
+  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
+  bool ascii = compiler->ascii();
    unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
-  int length = uncanonicalize.get(c, '\0', chars);
+  int length = GetCaseIndependentLetters(c, ascii, chars);
    if (length <= 1) return false;
    // We may not need to check against the end of the input string
    // if this character lies before a character that matched.
@@ -1854,10 +1913,10 @@

  static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
                            RegExpCharacterClass* cc,
-                          int cp_offset,
+                          bool ascii,
                            Label* on_failure,
+                          int cp_offset,
                            bool check_offset,
-                          bool ascii,
                            bool preloaded) {
    if (cc->is_standard() &&
        macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
@@ -2238,12 +2297,14 @@
            // matching can turn an ASCII character into non-ASCII or
            // vice versa.
            details->set_cannot_match();
+          pos->determines_perfectly = false;
            return;
          }
          if (compiler->ignore_case()) {
            unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
-          int length = uncanonicalize.get(c, '\0', chars);
-          if (length < 2) {
+          int length = GetCaseIndependentLetters(c, compiler->ascii(),  
chars);
+          ASSERT(length != 0);  // Can only happen if c > char_mask (see  
above).
+          if (length == 1) {
              // This letter has no case equivalents, so it's nice and simple
              // and the mask-compare will determine definitely whether we  
have
              // a match at this character position.
@@ -2288,7 +2349,6 @@
            details->positions(characters_filled_in);
        RegExpCharacterClass* tree = elm.data.u_char_class;
        ZoneList<CharacterRange>* ranges = tree->ranges();
-      CharacterRange range = ranges->at(0);
        if (tree->is_negated()) {
          // A quick check uses multi-character mask and compare.  There is  
no
          // useful way to incorporate a negative char class into this scheme
@@ -2297,27 +2357,46 @@
          pos->mask = 0;
          pos->value = 0;
        } else {
-        uint32_t differing_bits = (range.from() ^ range.to());
+        int first_range = 0;
+        while (ranges->at(first_range).from() > char_mask) {
+          first_range++;
+          if (first_range == ranges->length()) {
+            details->set_cannot_match();
+            pos->determines_perfectly = false;
+            return;
+          }
+        }
+        CharacterRange range = ranges->at(first_range);
+        uc16 from = range.from();
+        uc16 to = range.to();
+        if (to > char_mask) {
+          to = char_mask;
+        }
+        uint32_t differing_bits = (from ^ to);
          // A mask and compare is only perfect if the differing bits form a
          // number like 00011111 with one single block of trailing 1s.
          if ((differing_bits & (differing_bits + 1)) == 0) {
            pos->determines_perfectly = true;
          }
          uint32_t common_bits = ~SmearBitsRight(differing_bits);
-        uint32_t bits = (range.from() & common_bits);
-        for (int i = 1; i < ranges->length(); i++) {
+        uint32_t bits = (from & common_bits);
+        for (int i = first_range + 1; i < ranges->length(); i++) {
+          CharacterRange range = ranges->at(i);
+          uc16 from = range.from();
+          uc16 to = range.to();
+          if (from > char_mask) continue;
+          if (to > char_mask) to = char_mask;
            // Here we are combining more ranges into the mask and compare
            // value.  With each new range the mask becomes more sparse and
            // so the chances of a false positive rise.  A character class
            // with multiple ranges is assumed never to be equivalent to a
            // mask and compare operation.
            pos->determines_perfectly = false;
-          CharacterRange range = ranges->at(i);
-          uint32_t new_common_bits = (range.from() ^ range.to());
+          uint32_t new_common_bits = (from ^ to);
            new_common_bits = ~SmearBitsRight(new_common_bits);
            common_bits &= new_common_bits;
            bits &= new_common_bits;
-          uint32_t differing_bits = (range.from() & common_bits) ^ bits;
+          uint32_t differing_bits = (from & common_bits) ^ bits;
            common_bits ^= differing_bits;
            bits &= common_bits;
          }
@@ -2619,6 +2698,20 @@
  }


+static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
+  if (quick_check == NULL) return false;
+  if (offset >= quick_check->characters()) return false;
+  return quick_check->positions(offset)->determines_perfectly;
+}
+
+
+static void UpdateBoundsCheck(int index, int* checked_up_to) {
+  if (index > *checked_up_to) {
+    *checked_up_to = index;
+  }
+}
+
+
  // We call this repeatedly to generate code for each pass over the text  
node.
  // The passes are in increasing order of difficulty because we hope one
  // of the first passes will fail in which case we are saved the work of the
@@ -2663,83 +2756,55 @@
      TextElement elm = elms_->at(i);
      int cp_offset = trace->cp_offset() + elm.cp_offset;
      if (elm.type == TextElement::ATOM) {
-      if (pass == NON_ASCII_MATCH ||
-          pass == CHARACTER_MATCH ||
-          pass == CASE_CHARACTER_MATCH) {
-        Vector<const uc16> quarks = elm.data.u_atom->data();
-        for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
-          bool bound_checked = true;  // Most ops will check their bounds.
-          if (first_element_checked && i == 0 && j == 0) continue;
-          if (pass == NON_ASCII_MATCH) {
+      Vector<const uc16> quarks = elm.data.u_atom->data();
+      for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
+        if (first_element_checked && i == 0 && j == 0) continue;
+        if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;
+        EmitCharacterFunction* emit_function = NULL;
+        switch (pass) {
+          case NON_ASCII_MATCH:
              ASSERT(ascii);
              if (quarks[j] > String::kMaxAsciiCharCode) {
                assembler->GoTo(backtrack);
                return;
              }
-          } else {
-            if (quick_check != NULL &&
-                elm.cp_offset + j < quick_check->characters() &&
-                quick_check->positions(elm.cp_offset + j)->
-                    determines_perfectly) {
-              continue;
-            }
-            if (pass == CHARACTER_MATCH) {
-              if (compiler->ignore_case()) {
-                bound_checked = EmitAtomNonLetter(
-                    assembler,
-                    quarks[j],
-                    backtrack,
-                    cp_offset + j,
-                    *checked_up_to < cp_offset + j,
-                    preloaded);
-              } else {
-                if (!preloaded) {
-                  assembler->LoadCurrentCharacter(
-                      cp_offset + j,
-                      backtrack,
-                      *checked_up_to < cp_offset + j);
-                }
-                assembler->CheckNotCharacter(quarks[j], backtrack);
-              }
-            } else {
-              ASSERT_EQ(pass, CASE_CHARACTER_MATCH);
-              ASSERT(compiler->ignore_case());
-              bound_checked = EmitAtomLetter(assembler,
-                                             compiler->ascii(),
+            break;
+          case NON_LETTER_CHARACTER_MATCH:
+            emit_function = &EmitAtomNonLetter;
+            break;
+          case SIMPLE_CHARACTER_MATCH:
+            emit_function = &EmitSimpleCharacter;
+            break;
+          case CASE_CHARACTER_MATCH:
+            emit_function = &EmitAtomLetter;
+            break;
+          default:
+            break;
+        }
+        if (emit_function != NULL) {
+          bool bound_checked = emit_function(compiler,
                                               quarks[j],
                                               backtrack,
                                               cp_offset + j,
                                               *checked_up_to < cp_offset +  
j,
                                               preloaded);
-            }
-            if (bound_checked) {
-              if (cp_offset + j > *checked_up_to) {
-                *checked_up_to = cp_offset + j;
-              }
-            }
-          }
+          if (bound_checked) UpdateBoundsCheck(cp_offset + j,  
checked_up_to);
          }
        }
      } else {
        ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
-      if (first_element_checked && i == 0) continue;
-      if (quick_check != NULL &&
-          elm.cp_offset < quick_check->characters() &&
-          quick_check->positions(elm.cp_offset)->determines_perfectly) {
-        continue;
-      }
        if (pass == CHARACTER_CLASS_MATCH) {
+        if (first_element_checked && i == 0) continue;
+        if (DeterminedAlready(quick_check, elm.cp_offset)) continue;
          RegExpCharacterClass* cc = elm.data.u_char_class;
          EmitCharClass(assembler,
                        cc,
-                      cp_offset,
+                      ascii,
                        backtrack,
+                      cp_offset,
                        *checked_up_to < cp_offset,
-                      ascii,
                        preloaded);
-        if (cp_offset > *checked_up_to) {
-          *checked_up_to = cp_offset;
-        }
+        UpdateBoundsCheck(cp_offset, checked_up_to);
        }
      }
    }
@@ -2757,6 +2822,16 @@
  }


+bool TextNode::SkipPass(int int_pass, bool ignore_case) {
+  TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
+  if (ignore_case) {
+    return pass == SIMPLE_CHARACTER_MATCH;
+  } else {
+    return pass == NON_LETTER_CHARACTER_MATCH || pass ==  
CASE_CHARACTER_MATCH;
+  }
+}
+
+
  // This generates the code to match a text node.  A text node can contain
  // straight character sequences (possibly to be matched in a  
case-independent
  // way) and character classes.  For efficiency we do not do this in a  
single
@@ -2785,49 +2860,29 @@
    // If a character is preloaded into the current character register then
    // check that now.
    if (trace->characters_preloaded() == 1) {
-    TextEmitPass(compiler,
-                 CHARACTER_MATCH,
-                 true,
-                 trace,
-                 false,
-                 &bound_checked_to);
-    if (compiler->ignore_case()) {
+    for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
+      if (!SkipPass(pass, compiler->ignore_case())) {
+        TextEmitPass(compiler,
+                     static_cast<TextEmitPassType>(pass),
+                     true,
+                     trace,
+                     false,
+                     &bound_checked_to);
+      }
+    }
+    first_elt_done = true;
+  }
+
+  for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
+    if (!SkipPass(pass, compiler->ignore_case())) {
        TextEmitPass(compiler,
-                   CASE_CHARACTER_MATCH,
-                   true,
-                   trace,
+                   static_cast<TextEmitPassType>(pass),
                     false,
+                   trace,
+                   first_elt_done,
                     &bound_checked_to);
      }
-    TextEmitPass(compiler,
-                 CHARACTER_CLASS_MATCH,
-                 true,
-                 trace,
-                 false,
-                 &bound_checked_to);
-    first_elt_done = true;
    }
-
-  TextEmitPass(compiler,
-               CHARACTER_MATCH,
-               false,
-               trace,
-               first_elt_done,
-               &bound_checked_to);
-  if (compiler->ignore_case()) {
-    TextEmitPass(compiler,
-                 CASE_CHARACTER_MATCH,
-                 false,
-                 trace,
-                 first_elt_done,
-                 &bound_checked_to);
-  }
-  TextEmitPass(compiler,
-               CHARACTER_CLASS_MATCH,
-               false,
-               trace,
-               first_elt_done,
-               &bound_checked_to);

    Trace successor_trace(*trace);
    successor_trace.set_at_start(false);

Modified: branches/bleeding_edge/src/jsregexp.h
==============================================================================
--- branches/bleeding_edge/src/jsregexp.h       (original)
+++ branches/bleeding_edge/src/jsregexp.h       Wed Feb 18 08:07:03 2009
@@ -824,11 +824,15 @@

   private:
    enum TextEmitPassType {
-    NON_ASCII_MATCH,
-    CHARACTER_MATCH,
-    CASE_CHARACTER_MATCH,
-    CHARACTER_CLASS_MATCH
+    NON_ASCII_MATCH,             // Check for characters that can't match.
+    SIMPLE_CHARACTER_MATCH,      // Case-dependent single character check.
+    NON_LETTER_CHARACTER_MATCH,  // Check characters that have no case  
equivs.
+    CASE_CHARACTER_MATCH,        // Case-independent single character  
check.
+    CHARACTER_CLASS_MATCH        // Character class.
    };
+  static bool SkipPass(int pass, bool ignore_case);
+  static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH;
+  static const int kLastPass = CHARACTER_CLASS_MATCH;
    void TextEmitPass(RegExpCompiler* compiler,
                      TextEmitPassType pass,
                      bool preloaded,

Modified: branches/bleeding_edge/src/objects.h
==============================================================================
--- branches/bleeding_edge/src/objects.h        (original)
+++ branches/bleeding_edge/src/objects.h        Wed Feb 18 08:07:03 2009
@@ -3240,6 +3240,7 @@

    // Max ascii char code.
    static const int kMaxAsciiCharCode = unibrow::Utf8::kMaxOneByteChar;
+  static const unsigned kMaxAsciiCharCodeU =  
unibrow::Utf8::kMaxOneByteChar;
    static const int kMaxUC16CharCode = 0xffff;

    // Minimum length for a cons or sliced string.

Modified: branches/bleeding_edge/test/mjsunit/regexp-UC16.js
==============================================================================
--- branches/bleeding_edge/test/mjsunit/regexp-UC16.js  (original)
+++ branches/bleeding_edge/test/mjsunit/regexp-UC16.js  Wed Feb 18 08:07:03  
2009
@@ -41,3 +41,7 @@
  assertEquals(longUC16String + "," + longUC16String.substring(1,4),
               String(/x(...)\1\1/i.exec(longUC16String)),
               "backref-UC16-twice");
+
+assertFalse(/\xc1/i.test('fooA'), "quickcheck-uc16-pattern-ascii-subject");
+assertFalse(/[\xe9]/.test('i'), "charclass-uc16-pattern-ascii-subject");
+assertFalse(/\u5e74| 
\u6708/.test('t'), "alternation-uc16-pattern-ascii-subject");

--~--~---------~--~----~------------~-------~--~----~
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
-~----------~----~----~----~------~----~------~--~---

Reply via email to