Author: [EMAIL PROTECTED]
Date: Thu Oct 30 04:11:27 2008
New Revision: 649
Added:
branches/experimental/regexp2000/src/jsregexp-inl.h
Modified:
branches/experimental/regexp2000/src/jsregexp.cc
branches/experimental/regexp2000/src/jsregexp.h
branches/experimental/regexp2000/src/utils.h
branches/experimental/regexp2000/test/cctest/test-regexp.cc
Log:
Added most of an implementation of compact character classes, for use
in the regexp compiler.
Added: branches/experimental/regexp2000/src/jsregexp-inl.h
==============================================================================
--- (empty file)
+++ branches/experimental/regexp2000/src/jsregexp-inl.h Thu Oct 30 04:11:27
2008
@@ -0,0 +1,90 @@
+// Copyright 2006-2008 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef V8_JSREGEXP_INL_H_
+#define V8_JSREGEXP_INL_H_
+
+
+#include "jsregexp.h"
+
+
+namespace v8 {
+namespace internal {
+
+
+CharacterClass CharacterClass::SingletonField(uc16 value) {
+ CharacterClass result(FIELD);
+ result.segment_ = segment_of(value);
+ result.data_.u_field = long_bit(value & kSegmentMask);
+ return result;
+}
+
+
+CharacterClass CharacterClass::RangeField(Range range) {
+ CharacterClass result;
+ result.InitializeFieldFrom(Vector<Range>(&range, 1));
+ return result;
+}
+
+
+CharacterClass CharacterClass::Union(CharacterClass* left,
+ CharacterClass* right) {
+ CharacterClass result(UNION);
+ result.data_.u_union.left = left;
+ result.data_.u_union.right = right;
+ return result;
+}
+
+
+void CharacterClass::write_nibble(int index, byte value) {
+ ASSERT(0 <= index && index < 16);
+ data_.u_field |= static_cast<uint64_t>(value) << (4 * index);
+}
+
+
+byte CharacterClass::read_nibble(int index) {
+ ASSERT(0 <= index && index < 16);
+ return (data_.u_field >> (4 * index)) & 0xf;
+}
+
+
+unsigned CharacterClass::segment_of(uc16 value) {
+ return value >> CharacterClass::kFieldWidth;
+}
+
+
+uc16 CharacterClass::segment_start(unsigned segment) {
+ return segment << CharacterClass::kFieldWidth;
+}
+
+
+
+} // namespace internal
+} // namespace v8
+
+
+#endif // V8_JSREGEXP_INL_H_
Modified: branches/experimental/regexp2000/src/jsregexp.cc
==============================================================================
--- branches/experimental/regexp2000/src/jsregexp.cc (original)
+++ branches/experimental/regexp2000/src/jsregexp.cc Thu Oct 30 04:11:27
2008
@@ -32,7 +32,7 @@
#include "ast.h"
#include "execution.h"
#include "factory.h"
-#include "jsregexp.h"
+#include "jsregexp-inl.h"
#include "platform.h"
#include "runtime.h"
#include "top.h"
@@ -825,6 +825,224 @@
current = Compile(children->at(i), current);
}
return current;
+}
+
+
+bool CharacterClass::Contains(uc16 c) {
+ switch (type()) {
+ case FIELD: {
+ int offset = segment_start(segment_);
+ if (c < offset) return false;
+ int index = c - offset;
+ if (index >= CharacterClass::kFieldMax) return false;
+ return (data_.u_field & long_bit(index)) != 0;
+ }
+ case RANGES: {
+ int offset = segment_start(segment_);
+ if (c < offset) return false;
+ bool is_on = false;
+ unsigned i = 0;
+ while (i < count_) {
+ int delta = 0;
+ int nibble = read_nibble(i);
+ int bit_offset = 0;
+ while ((nibble & 0x8) == 0) {
+ delta |= nibble << bit_offset;
+ bit_offset += 3;
+ nibble = read_nibble(++i);
+ }
+ delta |= (nibble & 0x7) << bit_offset;
+ i++;
+ offset += delta;
+ if (c < offset)
+ return is_on;
+ is_on = !is_on;
+ }
+ return false;
+ }
+ case UNION: {
+ return data_.u_union.left->Contains(c)
+ || data_.u_union.right->Contains(c);
+ }
+ default:
+ UNREACHABLE();
+ return false;
+ }
+}
+
+
+template <int kCount>
+CharacterClass* StaticCharacterClassAllocator<kCount>::Allocate() {
+ ASSERT(used_ < kCount);
+ CharacterClass* result = &preallocated_[used_];
+ used_++;
+ return result;
+}
+
+
+class StaticCharacterClasses {
+ public:
+ StaticCharacterClasses();
+ CharacterClass* GetClass(uc16 tag);
+ static StaticCharacterClasses* instance_;
+ private:
+ StaticCharacterClassAllocator<5> static_allocator_;
+ CharacterClass digit_;
+ CharacterClass whitespace_;
+ CharacterClass word_;
+};
+
+
+StaticCharacterClasses* StaticCharacterClasses::instance_ = NULL;
+
+
+StaticCharacterClasses::StaticCharacterClasses() {
+#define MAKE_CLASS(Name)\
+ CharacterClass::Ranges(Vector<CharacterClass::Range>(k##Name##Ranges,\
+
k##Name##RangeCount), \
+ &static_allocator_)
+
+ const int kDigitRangeCount = 1;
+ CharacterClass::Range kDigitRanges[kDigitRangeCount] = {
+ CharacterClass::Range('0', '9')
+ };
+ digit_ = MAKE_CLASS(Digit);
+
+ const int kWhiteSpaceRangeCount = 10;
+ CharacterClass::Range kWhiteSpaceRanges[kWhiteSpaceRangeCount] = {
+ CharacterClass::Range(0x0009, 0x0009), CharacterClass::Range(0x000B,
0x000C),
+ CharacterClass::Range(0x0020, 0x0020), CharacterClass::Range(0x00A0,
0x00A0),
+ CharacterClass::Range(0x1680, 0x1680), CharacterClass::Range(0x180E,
0x180E),
+ CharacterClass::Range(0x2000, 0x200A), CharacterClass::Range(0x202F,
0x202F),
+ CharacterClass::Range(0x205F, 0x205F), CharacterClass::Range(0x3000,
0x3000)
+ };
+ whitespace_ = MAKE_CLASS(WhiteSpace);
+
+ const int kWordRangeCount = 4;
+ static CharacterClass::Range kWordRanges[kWordRangeCount] = {
+ CharacterClass::Range('0', '9'), CharacterClass::Range('A', 'Z'),
+ CharacterClass::Range('_', '_'), CharacterClass::Range('a', 'z')
+ };
+ word_ = MAKE_CLASS(Word);
+
+#undef MAKE_CLASS
+}
+
+
+CharacterClass* StaticCharacterClasses::GetClass(uc16 tag) {
+ switch (tag) {
+ case 'd':
+ return &digit_;
+ case 's':
+ return &whitespace_;
+ case 'w':
+ return &word_;
+ default:
+ UNREACHABLE();
+ return NULL;
+ }
+}
+
+
+CharacterClass* CharacterClass::GetCharacterClass(uc16 tag) {
+ if (StaticCharacterClasses::instance_ == NULL)
+ StaticCharacterClasses::instance_ = new StaticCharacterClasses();
+ return StaticCharacterClasses::instance_->GetClass(tag);
+}
+
+
+CharacterClass CharacterClass::Ranges(Vector<Range> boundaries,
+ CharacterClassAllocator* alloc) {
+ CharacterClass result;
+ result.InitializeRangesFrom(boundaries, alloc);
+ return result;
+}
+
+
+void CharacterClass::InitializeFieldFrom(Vector<Range> boundaries) {
+ ASSERT(boundaries.length() > 0);
+ ASSERT_EQ(EMPTY, type_);
+ type_ = FIELD;
+ segment_ = segment_of(boundaries.first().from());
+ ASSERT_EQ(static_cast<int>(segment_),
+ static_cast<int>(segment_of(boundaries.last().to())));
+ data_.u_field = 0;
+ for (int i = 0; i < boundaries.length(); i++) {
+ int start = boundaries[i].from();
+ int end = boundaries[i].to();
+ // Mask in the range
+ uint64_t t0 = ~static_cast<uint64_t>(0) >> (63 - (end & kSegmentMask));
+ uint64_t t1 = ~static_cast<uint64_t>(0) << (start & kSegmentMask);
+ data_.u_field |= (t0 & t1);
+ }
+}
+
+
+void CharacterClass::InitializeRangesFrom(Vector<Range> boundaries,
+ CharacterClassAllocator* alloc) {
+ ASSERT(boundaries.length() > 0);
+ ASSERT_EQ(EMPTY, type_);
+ int start_segment = segment_of(boundaries.first().from());
+ int end_segment = segment_of(boundaries.last().to());
+ if (start_segment == end_segment) {
+ InitializeFieldFrom(boundaries);
+ return;
+ }
+ type_ = RANGES;
+ segment_ = start_segment;
+ data_.u_ranges = 0;
+ int offset = 0;
+ uc16 last_boundary = segment_start(segment_);
+ int i;
+ for (i = 0; i < boundaries.length(); i++) {
+ Range current = boundaries[i];
+ ASSERT(last_boundary <= current.from());
+ int word = current.from() - last_boundary;
+ // We have to repeat the write operation to write both the delta
+ // from the last range to the start of this one, and the delta
+ // from the start to the end of this range.
+ for (int j = 0; j < 2; j++) {
+ do {
+ if (offset >= 16)
+ goto overflow;
+ byte nibble = (word & 0x7);
+ int next_word = word >> 3;
+ if (next_word == 0)
+ nibble |= 0x8;
+ write_nibble(offset, nibble);
+ word = next_word;
+ offset++;
+ } while (word > 0);
+ word = current.to() - current.from() + 1;
+ }
+ count_ = offset;
+ last_boundary = current.to() + 1;
+ continue;
+ overflow:
+ CharacterClass* left = alloc->Allocate();
+ ASSERT(i > 0);
+ if (segment_of(boundaries[i-1].to()) == segment_) {
+ // If what we've written so far fits within a single segment we
+ // can fit it into a bitfield. Scan forward to see if any more
+ // ranges can be included.
+ while (i < boundaries.length()
+ && segment_of(boundaries[i].to()) == segment_)
+ i++;
+ ASSERT(i < boundaries.length());
+ left->InitializeFieldFrom(boundaries.SubVector(0, i));
+ } else {
+ // Otherwise we copy the work we've done so far into the left
+ // heap-allocated charclass.
+ *left = *this;
+ }
+ CharacterClass* right = alloc->Allocate();
+ right->InitializeRangesFrom(boundaries.SubVector(i,
boundaries.length()),
+ alloc);
+ type_ = UNION;
+ data_.u_union.left = left;
+ data_.u_union.right = right;
+ return;
+ }
}
Modified: branches/experimental/regexp2000/src/jsregexp.h
==============================================================================
--- branches/experimental/regexp2000/src/jsregexp.h (original)
+++ branches/experimental/regexp2000/src/jsregexp.h Thu Oct 30 04:11:27 2008
@@ -126,6 +126,117 @@
template <typename Char> class RegExpNode;
+class CharacterClassAllocator;
+
+
+class CharacterClass {
+ public:
+
+ enum Type { EMPTY = 0, FIELD = 1, RANGES = 2, UNION = 3 };
+
+ // A closed range from and including 'from', to and including 'to'.
+ class Range {
+ public:
+ Range() : from_(0), to_(0) { }
+ Range(uc16 from, uc16 to) : from_(from), to_(to) { ASSERT(from <= to);
}
+ uc16 from() { return from_; }
+ uc16 to() { return to_; }
+ private:
+ uc16 from_;
+ uc16 to_;
+ };
+
+ CharacterClass() : type_(EMPTY) { }
+
+ explicit CharacterClass(Type type) : type_(type) { }
+
+ bool Contains(uc16 c);
+
+ // Returns a character class with a single bit set
+ static inline CharacterClass SingletonField(uc16 chr);
+
+ // Returns a bitfield character class with a closed range set. The
+ // range must fit within one field, that is, fit between two adjacent
+ // kFieldMax-aligned boundaries.
+ static inline CharacterClass RangeField(Range range);
+
+ static inline CharacterClass Union(CharacterClass* left,
+ CharacterClass* right);
+
+ // Initializes an empty charclass as a bitfield containing the
+ // specified ranges.
+ void InitializeFieldFrom(Vector<Range> ranges);
+
+ // Initializes this character class to be the specified ranges.
+ // This class must be empty.
+ void InitializeRangesFrom(Vector<Range> ranges,
+ CharacterClassAllocator* alloc);
+
+ // Creates a new character class containing the specified ranges
+ // and allocating any sub-classes using the specified allocator.
+ static CharacterClass Ranges(Vector<Range> boundaries,
+ CharacterClassAllocator* alloc);
+
+ // Returns one of the built-in character classes such as '\w' or
+ // '\S'.
+ static CharacterClass* GetCharacterClass(uc16 tag);
+
+ inline void write_nibble(int index, byte value);
+ inline byte read_nibble(int index);
+
+ static inline unsigned segment_of(uc16 value);
+ static inline uc16 segment_start(unsigned segment);
+
+ private:
+ static const int kCharSize = 16;
+ static const int kFieldSegmentIndexWidth = 10;
+ static const int kFieldWidth = kCharSize - kFieldSegmentIndexWidth;
+ static const int kFieldMax = (1 << kFieldWidth);
+ static const int kSegmentMask = (1 << kFieldWidth) - 1;
+ static const int kNibbleCount = kFieldMax / 4;
+ STATIC_ASSERT(kFieldMax == 8 * sizeof(uint64_t));
+
+ Type type() { return type_; }
+
+ static inline uint64_t long_bit(int index) {
+ return static_cast<uint64_t>(1) << index;
+ }
+
+ Type type_: 2;
+ unsigned segment_ : 10;
+ unsigned count_ : 4;
+ union {
+ // These have the same type to make it easier to change one without
+ // touching the other.
+ uint64_t u_field;
+ uint64_t u_ranges;
+ struct {
+ CharacterClass* left;
+ CharacterClass* right;
+ } u_union;
+ } data_;
+};
+
+
+STATIC_ASSERT(sizeof(CharacterClass) == 3 * kIntSize);
+
+
+class CharacterClassAllocator {
+ public:
+ virtual CharacterClass* Allocate() = 0;
+ virtual ~CharacterClassAllocator() { }
+};
+
+
+template <int kCount>
+class StaticCharacterClassAllocator: public CharacterClassAllocator {
+ public:
+ StaticCharacterClassAllocator() : used_(0) { }
+ virtual CharacterClass* Allocate();
+ private:
+ int used_;
+ CharacterClass preallocated_[kCount];
+};
class RegExpEngine: public AllStatic {
Modified: branches/experimental/regexp2000/src/utils.h
==============================================================================
--- branches/experimental/regexp2000/src/utils.h (original)
+++ branches/experimental/regexp2000/src/utils.h Thu Oct 30 04:11:27 2008
@@ -283,6 +283,15 @@
return Vector<T>(NewArray<T>(length), length);
}
+ // Returns a vector using the same backing storage as this one,
+ // spanning from and including 'from', to but not including 'to'.
+ Vector<T> SubVector(int from, int to) {
+ ASSERT(from < length_);
+ ASSERT(to <= length_);
+ ASSERT(from < to);
+ return Vector<T>(start() + from, to - from);
+ }
+
// Returns the length of the vector.
int length() const { return length_; }
@@ -297,6 +306,10 @@
ASSERT(0 <= index && index < length_);
return start_[index];
}
+
+ T& first() { return start_[0]; }
+
+ T& last() { return start_[length_ - 1]; }
// Returns a clone of this vector with a new backing store.
Vector<T> Clone() const {
Modified: branches/experimental/regexp2000/test/cctest/test-regexp.cc
==============================================================================
--- branches/experimental/regexp2000/test/cctest/test-regexp.cc (original)
+++ branches/experimental/regexp2000/test/cctest/test-regexp.cc Thu Oct 30
04:11:27 2008
@@ -26,13 +26,15 @@
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#include <stdlib.h>
+
#include "v8.h"
#include "cctest.h"
#include "zone-inl.h"
#include "parser.h"
#include "ast.h"
-#include "jsregexp.h"
+#include "jsregexp-inl.h"
using namespace v8::internal;
@@ -268,5 +270,157 @@
}
-// "123456789abcdb".match(/(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(\11)/)
-// 123456789abcdb,1,2,3,4,5,6,7,8,9,a,b,c,d,b
+TEST(SingletonField) {
+ // Test all bits from 0 to 256
+ for (int i = 0; i < 256; i++) {
+ CharacterClass entry = CharacterClass::SingletonField(i);
+ for (int j = 0; j < 256; j++) {
+ CHECK_EQ(i == j, entry.Contains(j));
+ }
+ }
+ // Test upwards through the data range
+ static const uint32_t kMax = 1 << 16;
+ for (uint32_t i = 0; i < kMax; i = 1 + static_cast<uint32_t>(i * 1.2)) {
+ CharacterClass entry = CharacterClass::SingletonField(i);
+ for (uint32_t j = 0; j < kMax; j = 1 + static_cast<uint32_t>(j * 1.2))
{
+ CHECK_EQ(i == j, entry.Contains(j));
+ }
+ }
+}
+
+
+TEST(RangeField) {
+ // Test bitfields containing a single range.
+ for (int i = 256; i < 320; i++) {
+ for (int j = i; j < 320; j++) {
+ CharacterClass::Range range(i, j);
+ CharacterClass entry = CharacterClass::RangeField(range);
+ for (int k = 256; k < 320; k++) {
+ CHECK_EQ(i <= k && k <= j, entry.Contains(k));
+ }
+ }
+ }
+}
+
+
+static void TestBuiltins(CharacterClass klass, bool (pred)(uc16)) {
+ for (int i = 0; i < (1 << 16); i++)
+ CHECK_EQ(pred(i), klass.Contains(i));
+}
+
+
+static bool IsDigit(uc16 c) {
+ return ('0' <= c && c <= '9');
+}
+
+
+static bool IsWhiteSpace(uc16 c) {
+ switch (c) {
+ case 0x09: case 0x0B: case 0x0C: case 0x20: case 0xA0:
+ return true;
+ default:
+ return unibrow::Space::Is(c);
+ }
+}
+
+
+static bool IsWord(uc16 c) {
+ return ('a' <= c && c <= 'z')
+ || ('A' <= c && c <= 'Z')
+ || ('0' <= c && c <= '9')
+ || (c == '_');
+}
+
+
+TEST(Builtins) {
+ TestBuiltins(*CharacterClass::GetCharacterClass('d'), IsDigit);
+ TestBuiltins(*CharacterClass::GetCharacterClass('s'), IsWhiteSpace);
+ TestBuiltins(*CharacterClass::GetCharacterClass('w'), IsWord);
+}
+
+
+TEST(SimpleRanges) {
+ // Test range classes containing a single range.
+ for (int i = 365; i < 730; i += 3) {
+ for (int j = i; j < 1095; j += 3) {
+ EmbeddedVector<CharacterClass::Range, 1> entries;
+ entries[0] = CharacterClass::Range(i, j);
+ CharacterClass klass = CharacterClass::Ranges(entries, NULL);
+ for (int k = 0; k < 1095; k += 3) {
+ CHECK_EQ(i <= k && k <= j, klass.Contains(k));
+ }
+ }
+ }
+}
+
+
+static unsigned PseudoRandom(int i, int j) {
+ return (~((i * 781) ^ (j * 329)));
+}
+
+
+// Generates pseudo-random character-class with kCount pseudo-random
+// ranges set.
+template <int kCount>
+class SimpleRangeGenerator {
+ public:
+ SimpleRangeGenerator() : i_(0) { }
+
+ // Returns the next character class and sets the ranges vector. The
+ // returned value will not have any ranges that extend beyond the 'max'
+ // value.
+ CharacterClass Next(int max) {
+ for (int j = 0; j < 2 * kCount; j++) {
+ entries_[j] = PseudoRandom(i_, j) % max;
+ }
+ i_++;
+ qsort(entries_.start(), 2 * kCount, sizeof(uc16), Compare);
+ for (int j = 0; j < kCount; j++) {
+ ranges_[j] = CharacterClass::Range(entries_[2 * j],
+ entries_[2 * j + 1]);
+ }
+ return CharacterClass::Ranges(ranges_, NULL);
+ }
+
+ // Returns the ranges of the last range that was returned. Note that
+ // the returned vector will be clobbered the next time Next() is called.
+ Vector<CharacterClass::Range> ranges() { return ranges_; }
+ private:
+
+ static int Compare(const void* a, const void* b) {
+ return *static_cast<const uc16*>(a) - *static_cast<const uc16*>(b);
+ }
+
+ int i_;
+ EmbeddedVector<uc16, 2 * kCount> entries_;
+ EmbeddedVector<CharacterClass::Range, kCount> ranges_;
+};
+
+
+TEST(LessSimpleRanges) {
+ // Tests range character classes containing 3 pseudo-random ranges.
+ SimpleRangeGenerator<3> gen;
+ for (int i = 0; i < 1024; i++) {
+ CharacterClass klass = gen.Next(256);
+ Vector<CharacterClass::Range> entries = gen.ranges();
+ for (int j = 0; j < 256; j++) {
+ bool is_on = false;
+ for (int k = 0; !is_on && k < 3; k++)
+ is_on = (entries[k].from() <= j && j <= entries[k].to());
+ CHECK_EQ(is_on, klass.Contains(j));
+ }
+ }
+}
+
+
+TEST(Unions) {
+ SimpleRangeGenerator<3> gen1;
+ SimpleRangeGenerator<3> gen2;
+ for (int i = 0; i < 1024; i++) {
+ CharacterClass klass1 = gen1.Next(256);
+ CharacterClass klass2 = gen2.Next(256);
+ CharacterClass uhnion = CharacterClass::Union(&klass1, &klass2);
+ for (int j = 0; j < 256; j++)
+ CHECK_EQ(klass1.Contains(j) || klass2.Contains(j),
uhnion.Contains(j));
+ }
+}
--~--~---------~--~----~------------~-------~--~----~
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
-~----------~----~----~----~------~----~------~--~---