Revision: 3626
Author: [email protected]
Date: Mon Jan 18 01:49:50 2010
Log: RegExp bitmap test for word character.

Review URL: http://codereview.chromium.org/547024
http://code.google.com/p/v8/source/detail?r=3626

Modified:
 /branches/bleeding_edge/src/arm/regexp-macro-assembler-arm.cc
 /branches/bleeding_edge/src/assembler.cc
 /branches/bleeding_edge/src/assembler.h
 /branches/bleeding_edge/src/ia32/assembler-ia32.cc
 /branches/bleeding_edge/src/ia32/assembler-ia32.h
 /branches/bleeding_edge/src/ia32/disasm-ia32.cc
 /branches/bleeding_edge/src/ia32/regexp-macro-assembler-ia32.cc
 /branches/bleeding_edge/src/regexp-macro-assembler.cc
 /branches/bleeding_edge/src/regexp-macro-assembler.h
 /branches/bleeding_edge/src/serialize.cc
 /branches/bleeding_edge/src/x64/assembler-x64.cc
 /branches/bleeding_edge/src/x64/assembler-x64.h
 /branches/bleeding_edge/src/x64/regexp-macro-assembler-x64.cc

=======================================
--- /branches/bleeding_edge/src/arm/regexp-macro-assembler-arm.cc Thu Jan 7 11:01:23 2010
+++ /branches/bleeding_edge/src/arm/regexp-macro-assembler-arm.cc Mon Jan 18 01:49:50 2010
@@ -526,64 +526,54 @@
     return true;
   }
   case 'n': {
-      // Match newlines (0x0a('\n'), 0x0d('\r'), 0x2028 and 0x2029)
-      __ eor(r0, current_character(), Operand(0x01));
-      // See if current character is '\n'^1 or '\r'^1, i.e., 0x0b or 0x0c
-      __ sub(r0, r0, Operand(0x0b));
-      __ cmp(r0, Operand(0x0c - 0x0b));
-      if (mode_ == ASCII) {
-        BranchOrBacktrack(hi, on_no_match);
-      } else {
-        Label done;
-        __ b(ls, &done);
-        // Compare original value to 0x2028 and 0x2029, using the already
-        // computed (current_char ^ 0x01 - 0x0b). I.e., check for
-        // 0x201d (0x2028 - 0x0b) or 0x201e.
-        __ sub(r0, r0, Operand(0x2028 - 0x0b));
-        __ cmp(r0, Operand(1));
-        BranchOrBacktrack(hi, on_no_match);
-        __ bind(&done);
-      }
-      return true;
-    }
+    // Match newlines (0x0a('\n'), 0x0d('\r'), 0x2028 and 0x2029)
+    __ eor(r0, current_character(), Operand(0x01));
+    // See if current character is '\n'^1 or '\r'^1, i.e., 0x0b or 0x0c
+    __ sub(r0, r0, Operand(0x0b));
+    __ cmp(r0, Operand(0x0c - 0x0b));
+    if (mode_ == ASCII) {
+      BranchOrBacktrack(hi, on_no_match);
+    } else {
+      Label done;
+      __ b(ls, &done);
+      // Compare original value to 0x2028 and 0x2029, using the already
+      // computed (current_char ^ 0x01 - 0x0b). I.e., check for
+      // 0x201d (0x2028 - 0x0b) or 0x201e.
+      __ sub(r0, r0, Operand(0x2028 - 0x0b));
+      __ cmp(r0, Operand(1));
+      BranchOrBacktrack(hi, on_no_match);
+      __ bind(&done);
+    }
+    return true;
+  }
   case 'w': {
-    // Match word character (0-9, A-Z, a-z and _).
-    Label digits, done;
-    __ cmp(current_character(), Operand('9'));
-    __ b(ls, &digits);
-    __ cmp(current_character(), Operand('_'));
-    __ b(eq, &done);
-    __ orr(r0, current_character(), Operand(0x20));
-    __ sub(r0, r0, Operand('a'));
-    __ cmp(r0, Operand('z' - 'a'));
-    BranchOrBacktrack(hi, on_no_match);
-    __ jmp(&done);
-
-    __ bind(&digits);
-    __ cmp(current_character(), Operand('0'));
-    BranchOrBacktrack(lo, on_no_match);
-    __ bind(&done);
-
+    if (mode_ != ASCII) {
+      // Table is 128 entries, so all ASCII characters can be tested.
+      __ cmp(current_character(), Operand('z'));
+      BranchOrBacktrack(hi, on_no_match);
+    }
+    ExternalReference map = ExternalReference::re_word_character_map();
+    __ mov(r0, Operand(map));
+    __ ldrb(r0, MemOperand(r0, current_character()));
+    __ tst(r0, Operand(r0));
+    BranchOrBacktrack(eq, on_no_match);
     return true;
   }
   case 'W': {
-    // Match non-word character (not 0-9, A-Z, a-z and _).
-    Label digits, done;
-    __ cmp(current_character(), Operand('9'));
-    __ b(ls, &digits);
-    __ cmp(current_character(), Operand('_'));
-    BranchOrBacktrack(eq, on_no_match);
-    __ orr(r0, current_character(), Operand(0x20));
-    __ sub(r0, r0, Operand('a'));
-    __ cmp(r0, Operand('z' - 'a'));
-    BranchOrBacktrack(ls, on_no_match);
-    __ jmp(&done);
-
-    __ bind(&digits);
-    __ cmp(current_character(), Operand('0'));
-    BranchOrBacktrack(hs, on_no_match);
-    __ bind(&done);
-
+    Label done;
+    if (mode_ != ASCII) {
+      // Table is 128 entries, so all ASCII characters can be tested.
+      __ cmp(current_character(), Operand('z'));
+      __ b(hi, &done);
+    }
+    ExternalReference map = ExternalReference::re_word_character_map();
+    __ mov(r0, Operand(map));
+    __ ldrb(r0, MemOperand(r0, current_character()));
+    __ tst(r0, Operand(r0));
+    BranchOrBacktrack(ne, on_no_match);
+    if (mode_ != ASCII) {
+      __ bind(&done);
+    }
     return true;
   }
   case '*':
=======================================
--- /branches/bleeding_edge/src/assembler.cc    Mon Jan 18 00:36:06 2010
+++ /branches/bleeding_edge/src/assembler.cc    Mon Jan 18 01:49:50 2010
@@ -670,6 +670,10 @@
       FUNCTION_ADDR(NativeRegExpMacroAssembler::CaseInsensitiveCompareUC16)));
 }

+ExternalReference ExternalReference::re_word_character_map() {
+  return ExternalReference(
+      NativeRegExpMacroAssembler::word_character_map_address());
+}

 ExternalReference ExternalReference::address_of_static_offsets_vector() {
   return ExternalReference(OffsetsVector::static_offsets_vector_address());
=======================================
--- /branches/bleeding_edge/src/assembler.h     Fri Jan 15 04:25:24 2010
+++ /branches/bleeding_edge/src/assembler.h     Mon Jan 18 01:49:50 2010
@@ -462,6 +462,10 @@

   // Function NativeRegExpMacroAssembler::GrowStack()
   static ExternalReference re_grow_stack();
+
+  // byte NativeRegExpMacroAssembler::word_character_bitmap
+  static ExternalReference re_word_character_map();
+
 #endif

   // This lets you register a function that rewrites all external references.
=======================================
--- /branches/bleeding_edge/src/ia32/assembler-ia32.cc Mon Jan 11 07:19:53 2010
+++ /branches/bleeding_edge/src/ia32/assembler-ia32.cc Mon Jan 18 01:49:50 2010
@@ -1259,6 +1259,14 @@
   EMIT(0x85);
   emit_operand(reg, op);
 }
+
+
+void Assembler::test_b(Register reg, const Operand& op) {
+  EnsureSpace ensure_space(this);
+  last_pc_ = pc_;
+  EMIT(0x84);
+  emit_operand(reg, op);
+}


 void Assembler::test(const Operand& op, const Immediate& imm) {
=======================================
--- /branches/bleeding_edge/src/ia32/assembler-ia32.h Mon Jan 11 07:19:53 2010
+++ /branches/bleeding_edge/src/ia32/assembler-ia32.h Mon Jan 18 01:49:50 2010
@@ -624,6 +624,7 @@

   void test(Register reg, const Immediate& imm);
   void test(Register reg, const Operand& op);
+  void test_b(Register reg, const Operand& op);
   void test(const Operand& op, const Immediate& imm);

   void xor_(Register dst, int32_t imm32);
=======================================
--- /branches/bleeding_edge/src/ia32/disasm-ia32.cc     Mon Jan 11 07:19:53 2010
+++ /branches/bleeding_edge/src/ia32/disasm-ia32.cc     Mon Jan 18 01:49:50 2010
@@ -63,6 +63,7 @@
   {0x29, "sub", OPER_REG_OP_ORDER},
   {0x2A, "subb", REG_OPER_OP_ORDER},
   {0x2B, "sub", REG_OPER_OP_ORDER},
+  {0x84, "test_b", REG_OPER_OP_ORDER},
   {0x85, "test", REG_OPER_OP_ORDER},
   {0x31, "xor", OPER_REG_OP_ORDER},
   {0x33, "xor", REG_OPER_OP_ORDER},
=======================================
--- /branches/bleeding_edge/src/ia32/regexp-macro-assembler-ia32.cc Thu Jan 7 11:01:23 2010
+++ /branches/bleeding_edge/src/ia32/regexp-macro-assembler-ia32.cc Mon Jan 18 01:49:50 2010
@@ -539,46 +539,33 @@
     return true;
   }
   case 'w': {
-    Label done, check_digits;
-    __ cmp(Operand(current_character()), Immediate('9'));
-    __ j(less_equal, &check_digits);
-    __ cmp(Operand(current_character()), Immediate('_'));
-    __ j(equal, &done);
-    // Convert to lower case if letter.
-    __ mov(Operand(eax), current_character());
-    __ or_(eax, 0x20);
-    // check current character in range ['a'..'z'], nondestructively.
-    __ sub(Operand(eax), Immediate('a'));
-    __ cmp(Operand(eax), Immediate('z' - 'a'));
-    BranchOrBacktrack(above, on_no_match);
-    __ jmp(&done);
-    __ bind(&check_digits);
-    // Check current character in range ['0'..'9'].
-    __ cmp(Operand(current_character()), Immediate('0'));
-    BranchOrBacktrack(below, on_no_match);
-    __ bind(&done);
-
+    if (mode_ != ASCII) {
+      // Table is 128 entries, so all ASCII characters can be tested.
+      __ cmp(Operand(current_character()), Immediate('z'));
+      BranchOrBacktrack(above, on_no_match);
+    }
+    ASSERT_EQ(0, word_character_map[0]);  // Character '\0' is not a word char.
+    ExternalReference word_map = ExternalReference::re_word_character_map();
+    __ test_b(current_character(),
+              Operand::StaticArray(current_character(), times_1, word_map));
+    BranchOrBacktrack(zero, on_no_match);
     return true;
   }
   case 'W': {
-    Label done, check_digits;
-    __ cmp(Operand(current_character()), Immediate('9'));
-    __ j(less_equal, &check_digits);
-    __ cmp(Operand(current_character()), Immediate('_'));
-    BranchOrBacktrack(equal, on_no_match);
-    // Convert to lower case if letter.
-    __ mov(Operand(eax), current_character());
-    __ or_(eax, 0x20);
-    // check current character in range ['a'..'z'], nondestructively.
-    __ sub(Operand(eax), Immediate('a'));
-    __ cmp(Operand(eax), Immediate('z' - 'a'));
-    BranchOrBacktrack(below_equal, on_no_match);
-    __ jmp(&done);
-    __ bind(&check_digits);
-    // Check current character in range ['0'..'9'].
-    __ cmp(Operand(current_character()), Immediate('0'));
-    BranchOrBacktrack(above_equal, on_no_match);
-    __ bind(&done);
+    Label done;
+    if (mode_ != ASCII) {
+      // Table is 128 entries, so all ASCII characters can be tested.
+      __ cmp(Operand(current_character()), Immediate('z'));
+      __ j(above, &done);
+    }
+    ASSERT_EQ(0, word_character_map[0]);  // Character '\0' is not a word char.
+    ExternalReference word_map = ExternalReference::re_word_character_map();
+    __ test_b(current_character(),
+              Operand::StaticArray(current_character(), times_1, word_map));
+    BranchOrBacktrack(not_zero, on_no_match);
+    if (mode_ != ASCII) {
+      __ bind(&done);
+    }
     return true;
   }
   // Non-standard classes (with no syntactic shorthand) used internally.
=======================================
--- /branches/bleeding_edge/src/regexp-macro-assembler.cc Wed Jan 6 03:09:30 2010
+++ /branches/bleeding_edge/src/regexp-macro-assembler.cc Mon Jan 18 01:49:50 2010
@@ -189,6 +189,30 @@

 static unibrow::Mapping<unibrow::Ecma262Canonicalize> canonicalize;

+
+byte NativeRegExpMacroAssembler::word_character_map[] = {
+    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+
+    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+    0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // '0' - '7'
+    0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // '8' - '9'
+
+    0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'A' - 'G'
+    0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'H' - 'O'
+    0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'P' - 'W'
+    0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0xffu,  // 'X' - 'Z', '_'
+
+    0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'a' - 'g'
+    0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'h' - 'o'
+    0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'p' - 'w'
+    0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // 'x' - 'z'
+};
+
+
 int NativeRegExpMacroAssembler::CaseInsensitiveCompareUC16(
     Address byte_offset1,
     Address byte_offset2,
=======================================
--- /branches/bleeding_edge/src/regexp-macro-assembler.h Thu Jan 7 11:01:23 2010
+++ /branches/bleeding_edge/src/regexp-macro-assembler.h Mon Jan 18 01:49:50 2010
@@ -203,6 +203,15 @@
   static Address GrowStack(Address stack_pointer, Address* stack_top);

   static const byte* StringCharacterPosition(String* subject, int start_index);
+
+  // Byte map of ASCII characters with a 0xff if the character is a word
+  // character (digit, letter or underscore) and 0x00 otherwise.
+  // Used by generated RegExp code.
+  static byte word_character_map[128];
+
+  static Address word_character_map_address() {
+    return &word_character_map[0];
+  }

   static Result Execute(Code* code,
                         String* input,
=======================================
--- /branches/bleeding_edge/src/serialize.cc    Fri Jan 15 06:20:31 2010
+++ /branches/bleeding_edge/src/serialize.cc    Mon Jan 18 01:49:50 2010
@@ -479,15 +479,19 @@
       UNCLASSIFIED,
       21,
       "NativeRegExpMacroAssembler::GrowStack()");
+  Add(ExternalReference::re_word_character_map().address(),
+      UNCLASSIFIED,
+      22,
+      "NativeRegExpMacroAssembler::word_character_map");
 #endif
   // Keyed lookup cache.
   Add(ExternalReference::keyed_lookup_cache_keys().address(),
       UNCLASSIFIED,
-      22,
+      23,
       "KeyedLookupCache::keys()");
   Add(ExternalReference::keyed_lookup_cache_field_offsets().address(),
       UNCLASSIFIED,
-      23,
+      24,
       "KeyedLookupCache::field_offsets()");
 }

=======================================
--- /branches/bleeding_edge/src/x64/assembler-x64.cc Fri Nov 13 04:32:57 2009
+++ /branches/bleeding_edge/src/x64/assembler-x64.cc Mon Jan 18 01:49:50 2010
@@ -1878,6 +1878,20 @@
   emit_operand(rax, op);  // Operation code 0
   emit(mask.value_);  // Low byte emitted.
 }
+
+
+void Assembler::testb(const Operand& op, Register reg) {
+  EnsureSpace ensure_space(this);
+  last_pc_ = pc_;
+  if (reg.code() > 3) {
+    // Register is not one of al, bl, cl, dl.  Its encoding needs REX.
+    emit_rex_32(reg, op);
+  } else {
+    emit_optional_rex_32(reg, op);
+  }
+  emit(0x84);
+  emit_operand(reg, op);
+}


 void Assembler::testl(Register dst, Register src) {
=======================================
--- /branches/bleeding_edge/src/x64/assembler-x64.h     Mon Nov 30 07:09:49 2009
+++ /branches/bleeding_edge/src/x64/assembler-x64.h     Mon Jan 18 01:49:50 2010
@@ -931,6 +931,7 @@
   void testb(Register dst, Register src);
   void testb(Register reg, Immediate mask);
   void testb(const Operand& op, Immediate mask);
+  void testb(const Operand& op, Register reg);
   void testl(Register dst, Register src);
   void testl(Register reg, Immediate mask);
   void testl(const Operand& op, Immediate mask);
=======================================
--- /branches/bleeding_edge/src/x64/regexp-macro-assembler-x64.cc Thu Jan 7 11:01:23 2010
+++ /branches/bleeding_edge/src/x64/regexp-macro-assembler-x64.cc Mon Jan 18 01:49:50 2010
@@ -582,49 +582,38 @@
     return true;
   }
   case 'w': {
-    Label done, check_digits;
-    __ cmpl(current_character(), Immediate('9'));
-    __ j(less_equal, &check_digits);
-    __ cmpl(current_character(), Immediate('_'));
-    __ j(equal, &done);
-    // Convert to lower case if letter.
-    __ movl(rax, current_character());
-    __ orl(rax, Immediate(0x20));
-    // check rax in range ['a'..'z'].
-    __ subl(rax, Immediate('a'));
-    __ cmpl(rax, Immediate('z' - 'a'));
-    BranchOrBacktrack(above, on_no_match);
-    __ jmp(&done);
-    __ bind(&check_digits);
-    // Check current character in range ['0'..'9'].
-    __ cmpl(current_character(), Immediate('0'));
-    BranchOrBacktrack(below, on_no_match);
-    __ bind(&done);
-
+    if (mode_ != ASCII) {
+      // Table is 128 entries, so all ASCII characters can be tested.
+      __ cmpl(current_character(), Immediate('z'));
+      BranchOrBacktrack(above, on_no_match);
+    }
+    __ movq(rbx, ExternalReference::re_word_character_map());
+    ASSERT_EQ(0, word_character_map[0]);  // Character '\0' is not a word char.
+    ExternalReference word_map = ExternalReference::re_word_character_map();
+    __ testb(Operand(rbx, current_character(), times_1, 0),
+             current_character());
+    BranchOrBacktrack(zero, on_no_match);
     return true;
   }
   case 'W': {
-    Label done, check_digits;
-    __ cmpl(current_character(), Immediate('9'));
-    __ j(less_equal, &check_digits);
-    __ cmpl(current_character(), Immediate('_'));
-    BranchOrBacktrack(equal, on_no_match);
-    // Convert to lower case if letter.
-    __ movl(rax, current_character());
-    __ orl(rax, Immediate(0x20));
-    // check current character in range ['a'..'z'], nondestructively.
-    __ subl(rax, Immediate('a'));
-    __ cmpl(rax, Immediate('z' - 'a'));
-    BranchOrBacktrack(below_equal, on_no_match);
-    __ jmp(&done);
-    __ bind(&check_digits);
-    // Check current character in range ['0'..'9'].
-    __ cmpl(current_character(), Immediate('0'));
-    BranchOrBacktrack(above_equal, on_no_match);
-    __ bind(&done);
-
+    Label done;
+    if (mode_ != ASCII) {
+      // Table is 128 entries, so all ASCII characters can be tested.
+      __ cmpl(current_character(), Immediate('z'));
+      __ j(above, &done);
+    }
+    __ movq(rbx, ExternalReference::re_word_character_map());
+    ASSERT_EQ(0, word_character_map[0]);  // Character '\0' is not a word char.
+    ExternalReference word_map = ExternalReference::re_word_character_map();
+    __ testb(Operand(rbx, current_character(), times_1, 0),
+             current_character());
+    BranchOrBacktrack(not_zero, on_no_match);
+    if (mode_ != ASCII) {
+      __ bind(&done);
+    }
     return true;
   }
+
   case '*':
     // Match any character.
     return true;
-- 
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev

Reply via email to