Revision: 3626
Author: [email protected]
Date: Mon Jan 18 01:49:50 2010
Log: RegExp bitmap test for word character.
Review URL: http://codereview.chromium.org/547024
http://code.google.com/p/v8/source/detail?r=3626
Modified:
/branches/bleeding_edge/src/arm/regexp-macro-assembler-arm.cc
/branches/bleeding_edge/src/assembler.cc
/branches/bleeding_edge/src/assembler.h
/branches/bleeding_edge/src/ia32/assembler-ia32.cc
/branches/bleeding_edge/src/ia32/assembler-ia32.h
/branches/bleeding_edge/src/ia32/disasm-ia32.cc
/branches/bleeding_edge/src/ia32/regexp-macro-assembler-ia32.cc
/branches/bleeding_edge/src/regexp-macro-assembler.cc
/branches/bleeding_edge/src/regexp-macro-assembler.h
/branches/bleeding_edge/src/serialize.cc
/branches/bleeding_edge/src/x64/assembler-x64.cc
/branches/bleeding_edge/src/x64/assembler-x64.h
/branches/bleeding_edge/src/x64/regexp-macro-assembler-x64.cc
=======================================
--- /branches/bleeding_edge/src/arm/regexp-macro-assembler-arm.cc Thu Jan 7 11:01:23 2010
+++ /branches/bleeding_edge/src/arm/regexp-macro-assembler-arm.cc Mon Jan 18 01:49:50 2010
@@ -526,64 +526,54 @@
return true;
}
case 'n': {
- // Match newlines (0x0a('\n'), 0x0d('\r'), 0x2028 and 0x2029)
- __ eor(r0, current_character(), Operand(0x01));
- // See if current character is '\n'^1 or '\r'^1, i.e., 0x0b or 0x0c
- __ sub(r0, r0, Operand(0x0b));
- __ cmp(r0, Operand(0x0c - 0x0b));
- if (mode_ == ASCII) {
- BranchOrBacktrack(hi, on_no_match);
- } else {
- Label done;
- __ b(ls, &done);
- // Compare original value to 0x2028 and 0x2029, using the already
- // computed (current_char ^ 0x01 - 0x0b). I.e., check for
- // 0x201d (0x2028 - 0x0b) or 0x201e.
- __ sub(r0, r0, Operand(0x2028 - 0x0b));
- __ cmp(r0, Operand(1));
- BranchOrBacktrack(hi, on_no_match);
- __ bind(&done);
- }
- return true;
- }
+ // Match newlines (0x0a('\n'), 0x0d('\r'), 0x2028 and 0x2029)
+ __ eor(r0, current_character(), Operand(0x01));
+ // See if current character is '\n'^1 or '\r'^1, i.e., 0x0b or 0x0c
+ __ sub(r0, r0, Operand(0x0b));
+ __ cmp(r0, Operand(0x0c - 0x0b));
+ if (mode_ == ASCII) {
+ BranchOrBacktrack(hi, on_no_match);
+ } else {
+ Label done;
+ __ b(ls, &done);
+ // Compare original value to 0x2028 and 0x2029, using the already
+ // computed (current_char ^ 0x01 - 0x0b). I.e., check for
+ // 0x201d (0x2028 - 0x0b) or 0x201e.
+ __ sub(r0, r0, Operand(0x2028 - 0x0b));
+ __ cmp(r0, Operand(1));
+ BranchOrBacktrack(hi, on_no_match);
+ __ bind(&done);
+ }
+ return true;
+ }
case 'w': {
- // Match word character (0-9, A-Z, a-z and _).
- Label digits, done;
- __ cmp(current_character(), Operand('9'));
- __ b(ls, &digits);
- __ cmp(current_character(), Operand('_'));
- __ b(eq, &done);
- __ orr(r0, current_character(), Operand(0x20));
- __ sub(r0, r0, Operand('a'));
- __ cmp(r0, Operand('z' - 'a'));
- BranchOrBacktrack(hi, on_no_match);
- __ jmp(&done);
-
- __ bind(&digits);
- __ cmp(current_character(), Operand('0'));
- BranchOrBacktrack(lo, on_no_match);
- __ bind(&done);
-
+ if (mode_ != ASCII) {
+ // Table is 128 entries, so all ASCII characters can be tested.
+ __ cmp(current_character(), Operand('z'));
+ BranchOrBacktrack(hi, on_no_match);
+ }
+ ExternalReference map = ExternalReference::re_word_character_map();
+ __ mov(r0, Operand(map));
+ __ ldrb(r0, MemOperand(r0, current_character()));
+ __ tst(r0, Operand(r0));
+ BranchOrBacktrack(eq, on_no_match);
return true;
}
case 'W': {
- // Match non-word character (not 0-9, A-Z, a-z and _).
- Label digits, done;
- __ cmp(current_character(), Operand('9'));
- __ b(ls, &digits);
- __ cmp(current_character(), Operand('_'));
- BranchOrBacktrack(eq, on_no_match);
- __ orr(r0, current_character(), Operand(0x20));
- __ sub(r0, r0, Operand('a'));
- __ cmp(r0, Operand('z' - 'a'));
- BranchOrBacktrack(ls, on_no_match);
- __ jmp(&done);
-
- __ bind(&digits);
- __ cmp(current_character(), Operand('0'));
- BranchOrBacktrack(hs, on_no_match);
- __ bind(&done);
-
+ Label done;
+ if (mode_ != ASCII) {
+ // Table is 128 entries, so all ASCII characters can be tested.
+ __ cmp(current_character(), Operand('z'));
+ __ b(hi, &done);
+ }
+ ExternalReference map = ExternalReference::re_word_character_map();
+ __ mov(r0, Operand(map));
+ __ ldrb(r0, MemOperand(r0, current_character()));
+ __ tst(r0, Operand(r0));
+ BranchOrBacktrack(ne, on_no_match);
+ if (mode_ != ASCII) {
+ __ bind(&done);
+ }
return true;
}
case '*':
=======================================
--- /branches/bleeding_edge/src/assembler.cc Mon Jan 18 00:36:06 2010
+++ /branches/bleeding_edge/src/assembler.cc Mon Jan 18 01:49:50 2010
@@ -670,6 +670,10 @@
FUNCTION_ADDR(NativeRegExpMacroAssembler::CaseInsensitiveCompareUC16)));
}
+ExternalReference ExternalReference::re_word_character_map() {
+ return ExternalReference(
+ NativeRegExpMacroAssembler::word_character_map_address());
+}
ExternalReference ExternalReference::address_of_static_offsets_vector() {
return ExternalReference(OffsetsVector::static_offsets_vector_address());
=======================================
--- /branches/bleeding_edge/src/assembler.h Fri Jan 15 04:25:24 2010
+++ /branches/bleeding_edge/src/assembler.h Mon Jan 18 01:49:50 2010
@@ -462,6 +462,10 @@
// Function NativeRegExpMacroAssembler::GrowStack()
static ExternalReference re_grow_stack();
+
+ // byte NativeRegExpMacroAssembler::word_character_bitmap
+ static ExternalReference re_word_character_map();
+
#endif
// This lets you register a function that rewrites all external references.
=======================================
--- /branches/bleeding_edge/src/ia32/assembler-ia32.cc Mon Jan 11 07:19:53 2010
+++ /branches/bleeding_edge/src/ia32/assembler-ia32.cc Mon Jan 18 01:49:50 2010
@@ -1259,6 +1259,14 @@
EMIT(0x85);
emit_operand(reg, op);
}
+
+
+void Assembler::test_b(Register reg, const Operand& op) {
+ EnsureSpace ensure_space(this);
+ last_pc_ = pc_;
+ EMIT(0x84);
+ emit_operand(reg, op);
+}
void Assembler::test(const Operand& op, const Immediate& imm) {
=======================================
--- /branches/bleeding_edge/src/ia32/assembler-ia32.h Mon Jan 11 07:19:53 2010
+++ /branches/bleeding_edge/src/ia32/assembler-ia32.h Mon Jan 18 01:49:50 2010
@@ -624,6 +624,7 @@
void test(Register reg, const Immediate& imm);
void test(Register reg, const Operand& op);
+ void test_b(Register reg, const Operand& op);
void test(const Operand& op, const Immediate& imm);
void xor_(Register dst, int32_t imm32);
=======================================
--- /branches/bleeding_edge/src/ia32/disasm-ia32.cc Mon Jan 11 07:19:53 2010
+++ /branches/bleeding_edge/src/ia32/disasm-ia32.cc Mon Jan 18 01:49:50 2010
@@ -63,6 +63,7 @@
{0x29, "sub", OPER_REG_OP_ORDER},
{0x2A, "subb", REG_OPER_OP_ORDER},
{0x2B, "sub", REG_OPER_OP_ORDER},
+ {0x84, "test_b", REG_OPER_OP_ORDER},
{0x85, "test", REG_OPER_OP_ORDER},
{0x31, "xor", OPER_REG_OP_ORDER},
{0x33, "xor", REG_OPER_OP_ORDER},
=======================================
--- /branches/bleeding_edge/src/ia32/regexp-macro-assembler-ia32.cc Thu Jan 7 11:01:23 2010
+++ /branches/bleeding_edge/src/ia32/regexp-macro-assembler-ia32.cc Mon Jan 18 01:49:50 2010
@@ -539,46 +539,33 @@
return true;
}
case 'w': {
- Label done, check_digits;
- __ cmp(Operand(current_character()), Immediate('9'));
- __ j(less_equal, &check_digits);
- __ cmp(Operand(current_character()), Immediate('_'));
- __ j(equal, &done);
- // Convert to lower case if letter.
- __ mov(Operand(eax), current_character());
- __ or_(eax, 0x20);
- // check current character in range ['a'..'z'], nondestructively.
- __ sub(Operand(eax), Immediate('a'));
- __ cmp(Operand(eax), Immediate('z' - 'a'));
- BranchOrBacktrack(above, on_no_match);
- __ jmp(&done);
- __ bind(&check_digits);
- // Check current character in range ['0'..'9'].
- __ cmp(Operand(current_character()), Immediate('0'));
- BranchOrBacktrack(below, on_no_match);
- __ bind(&done);
-
+ if (mode_ != ASCII) {
+ // Table is 128 entries, so all ASCII characters can be tested.
+ __ cmp(Operand(current_character()), Immediate('z'));
+ BranchOrBacktrack(above, on_no_match);
+ }
+ ASSERT_EQ(0, word_character_map[0]); // Character '\0' is not a word char.
+ ExternalReference word_map = ExternalReference::re_word_character_map();
+ __ test_b(current_character(),
+ Operand::StaticArray(current_character(), times_1, word_map));
+ BranchOrBacktrack(zero, on_no_match);
return true;
}
case 'W': {
- Label done, check_digits;
- __ cmp(Operand(current_character()), Immediate('9'));
- __ j(less_equal, &check_digits);
- __ cmp(Operand(current_character()), Immediate('_'));
- BranchOrBacktrack(equal, on_no_match);
- // Convert to lower case if letter.
- __ mov(Operand(eax), current_character());
- __ or_(eax, 0x20);
- // check current character in range ['a'..'z'], nondestructively.
- __ sub(Operand(eax), Immediate('a'));
- __ cmp(Operand(eax), Immediate('z' - 'a'));
- BranchOrBacktrack(below_equal, on_no_match);
- __ jmp(&done);
- __ bind(&check_digits);
- // Check current character in range ['0'..'9'].
- __ cmp(Operand(current_character()), Immediate('0'));
- BranchOrBacktrack(above_equal, on_no_match);
- __ bind(&done);
+ Label done;
+ if (mode_ != ASCII) {
+ // Table is 128 entries, so all ASCII characters can be tested.
+ __ cmp(Operand(current_character()), Immediate('z'));
+ __ j(above, &done);
+ }
+ ASSERT_EQ(0, word_character_map[0]); // Character '\0' is not a word char.
+ ExternalReference word_map = ExternalReference::re_word_character_map();
+ __ test_b(current_character(),
+ Operand::StaticArray(current_character(), times_1, word_map));
+ BranchOrBacktrack(not_zero, on_no_match);
+ if (mode_ != ASCII) {
+ __ bind(&done);
+ }
return true;
}
// Non-standard classes (with no syntactic shorthand) used internally.
=======================================
--- /branches/bleeding_edge/src/regexp-macro-assembler.cc Wed Jan 6 03:09:30 2010
+++ /branches/bleeding_edge/src/regexp-macro-assembler.cc Mon Jan 18 01:49:50 2010
@@ -189,6 +189,30 @@
static unibrow::Mapping<unibrow::Ecma262Canonicalize> canonicalize;
+
+byte NativeRegExpMacroAssembler::word_character_map[] = {
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // '0' - '7'
+ 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // '8' - '9'
+
+ 0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'A' - 'G'
+ 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'H' - 'O'
+ 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'P' - 'W'
+ 0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0xffu, // 'X' - 'Z', '_'
+
+ 0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'a' - 'g'
+ 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'h' - 'o'
+ 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'p' - 'w'
+ 0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z'
+};
+
+
int NativeRegExpMacroAssembler::CaseInsensitiveCompareUC16(
Address byte_offset1,
Address byte_offset2,
=======================================
--- /branches/bleeding_edge/src/regexp-macro-assembler.h Thu Jan 7 11:01:23 2010
+++ /branches/bleeding_edge/src/regexp-macro-assembler.h Mon Jan 18 01:49:50 2010
@@ -203,6 +203,15 @@
static Address GrowStack(Address stack_pointer, Address* stack_top);
static const byte* StringCharacterPosition(String* subject, int start_index);
+
+ // Byte map of ASCII characters with a 0xff if the character is a word
+ // character (digit, letter or underscore) and 0x00 otherwise.
+ // Used by generated RegExp code.
+ static byte word_character_map[128];
+
+ static Address word_character_map_address() {
+ return &word_character_map[0];
+ }
static Result Execute(Code* code,
String* input,
=======================================
--- /branches/bleeding_edge/src/serialize.cc Fri Jan 15 06:20:31 2010
+++ /branches/bleeding_edge/src/serialize.cc Mon Jan 18 01:49:50 2010
@@ -479,15 +479,19 @@
UNCLASSIFIED,
21,
"NativeRegExpMacroAssembler::GrowStack()");
+ Add(ExternalReference::re_word_character_map().address(),
+ UNCLASSIFIED,
+ 22,
+ "NativeRegExpMacroAssembler::word_character_map");
#endif
// Keyed lookup cache.
Add(ExternalReference::keyed_lookup_cache_keys().address(),
UNCLASSIFIED,
- 22,
+ 23,
"KeyedLookupCache::keys()");
Add(ExternalReference::keyed_lookup_cache_field_offsets().address(),
UNCLASSIFIED,
- 23,
+ 24,
"KeyedLookupCache::field_offsets()");
}
=======================================
--- /branches/bleeding_edge/src/x64/assembler-x64.cc Fri Nov 13 04:32:57 2009
+++ /branches/bleeding_edge/src/x64/assembler-x64.cc Mon Jan 18 01:49:50 2010
@@ -1878,6 +1878,20 @@
emit_operand(rax, op); // Operation code 0
emit(mask.value_); // Low byte emitted.
}
+
+
+void Assembler::testb(const Operand& op, Register reg) {
+ EnsureSpace ensure_space(this);
+ last_pc_ = pc_;
+ if (reg.code() > 3) {
+ // Register is not one of al, bl, cl, dl. Its encoding needs REX.
+ emit_rex_32(reg, op);
+ } else {
+ emit_optional_rex_32(reg, op);
+ }
+ emit(0x84);
+ emit_operand(reg, op);
+}
void Assembler::testl(Register dst, Register src) {
=======================================
--- /branches/bleeding_edge/src/x64/assembler-x64.h Mon Nov 30 07:09:49 2009
+++ /branches/bleeding_edge/src/x64/assembler-x64.h Mon Jan 18 01:49:50 2010
@@ -931,6 +931,7 @@
void testb(Register dst, Register src);
void testb(Register reg, Immediate mask);
void testb(const Operand& op, Immediate mask);
+ void testb(const Operand& op, Register reg);
void testl(Register dst, Register src);
void testl(Register reg, Immediate mask);
void testl(const Operand& op, Immediate mask);
=======================================
--- /branches/bleeding_edge/src/x64/regexp-macro-assembler-x64.cc Thu Jan 7 11:01:23 2010
+++ /branches/bleeding_edge/src/x64/regexp-macro-assembler-x64.cc Mon Jan 18 01:49:50 2010
@@ -582,49 +582,38 @@
return true;
}
case 'w': {
- Label done, check_digits;
- __ cmpl(current_character(), Immediate('9'));
- __ j(less_equal, &check_digits);
- __ cmpl(current_character(), Immediate('_'));
- __ j(equal, &done);
- // Convert to lower case if letter.
- __ movl(rax, current_character());
- __ orl(rax, Immediate(0x20));
- // check rax in range ['a'..'z'].
- __ subl(rax, Immediate('a'));
- __ cmpl(rax, Immediate('z' - 'a'));
- BranchOrBacktrack(above, on_no_match);
- __ jmp(&done);
- __ bind(&check_digits);
- // Check current character in range ['0'..'9'].
- __ cmpl(current_character(), Immediate('0'));
- BranchOrBacktrack(below, on_no_match);
- __ bind(&done);
-
+ if (mode_ != ASCII) {
+ // Table is 128 entries, so all ASCII characters can be tested.
+ __ cmpl(current_character(), Immediate('z'));
+ BranchOrBacktrack(above, on_no_match);
+ }
+ __ movq(rbx, ExternalReference::re_word_character_map());
+ ASSERT_EQ(0, word_character_map[0]); // Character '\0' is not a word char.
+ ExternalReference word_map = ExternalReference::re_word_character_map();
+ __ testb(Operand(rbx, current_character(), times_1, 0),
+ current_character());
+ BranchOrBacktrack(zero, on_no_match);
return true;
}
case 'W': {
- Label done, check_digits;
- __ cmpl(current_character(), Immediate('9'));
- __ j(less_equal, &check_digits);
- __ cmpl(current_character(), Immediate('_'));
- BranchOrBacktrack(equal, on_no_match);
- // Convert to lower case if letter.
- __ movl(rax, current_character());
- __ orl(rax, Immediate(0x20));
- // check current character in range ['a'..'z'], nondestructively.
- __ subl(rax, Immediate('a'));
- __ cmpl(rax, Immediate('z' - 'a'));
- BranchOrBacktrack(below_equal, on_no_match);
- __ jmp(&done);
- __ bind(&check_digits);
- // Check current character in range ['0'..'9'].
- __ cmpl(current_character(), Immediate('0'));
- BranchOrBacktrack(above_equal, on_no_match);
- __ bind(&done);
-
+ Label done;
+ if (mode_ != ASCII) {
+ // Table is 128 entries, so all ASCII characters can be tested.
+ __ cmpl(current_character(), Immediate('z'));
+ __ j(above, &done);
+ }
+ __ movq(rbx, ExternalReference::re_word_character_map());
+ ASSERT_EQ(0, word_character_map[0]); // Character '\0' is not a word char.
+ ExternalReference word_map = ExternalReference::re_word_character_map();
+ __ testb(Operand(rbx, current_character(), times_1, 0),
+ current_character());
+ BranchOrBacktrack(not_zero, on_no_match);
+ if (mode_ != ASCII) {
+ __ bind(&done);
+ }
return true;
}
+
case '*':
// Match any character.
return true;
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev