Author: [EMAIL PROTECTED]
Date: Thu Oct 30 04:11:27 2008
New Revision: 649

Added:
    branches/experimental/regexp2000/src/jsregexp-inl.h
Modified:
    branches/experimental/regexp2000/src/jsregexp.cc
    branches/experimental/regexp2000/src/jsregexp.h
    branches/experimental/regexp2000/src/utils.h
    branches/experimental/regexp2000/test/cctest/test-regexp.cc

Log:
Added most of an implementation of compact character classes, for use
in the regexp compiler.


Added: branches/experimental/regexp2000/src/jsregexp-inl.h
==============================================================================
--- (empty file)
+++ branches/experimental/regexp2000/src/jsregexp-inl.h Thu Oct 30 04:11:27  
2008
@@ -0,0 +1,90 @@
+// Copyright 2006-2008 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef V8_JSREGEXP_INL_H_
+#define V8_JSREGEXP_INL_H_
+
+
+#include "jsregexp.h"
+
+
+namespace v8 {
+namespace internal {
+
+
+CharacterClass CharacterClass::SingletonField(uc16 value) {
+  CharacterClass result(FIELD);
+  result.segment_ = segment_of(value);
+  result.data_.u_field = long_bit(value & kSegmentMask);
+  return result;
+}
+
+
+CharacterClass CharacterClass::RangeField(Range range) {
+  CharacterClass result;
+  result.InitializeFieldFrom(Vector<Range>(&range, 1));
+  return result;
+}
+
+
+CharacterClass CharacterClass::Union(CharacterClass* left,
+                                     CharacterClass* right) {
+  CharacterClass result(UNION);
+  result.data_.u_union.left = left;
+  result.data_.u_union.right = right;
+  return result;
+}
+
+
+void CharacterClass::write_nibble(int index, byte value) {
+  ASSERT(0 <= index && index < 16);
+  data_.u_field |= static_cast<uint64_t>(value) << (4 * index);
+}
+
+
+byte CharacterClass::read_nibble(int index) {
+  ASSERT(0 <= index && index < 16);
+  return (data_.u_field >> (4 * index)) & 0xf;
+}
+
+
+unsigned CharacterClass::segment_of(uc16 value) {
+  return value >> CharacterClass::kFieldWidth;
+}
+
+
+uc16 CharacterClass::segment_start(unsigned segment) {
+  return segment << CharacterClass::kFieldWidth;
+}
+
+
+
+}  // namespace internal
+}  // namespace v8
+
+
+#endif  // V8_JSREGEXP_INL_H_

Modified: branches/experimental/regexp2000/src/jsregexp.cc
==============================================================================
--- branches/experimental/regexp2000/src/jsregexp.cc    (original)
+++ branches/experimental/regexp2000/src/jsregexp.cc    Thu Oct 30 04:11:27  
2008
@@ -32,7 +32,7 @@
  #include "ast.h"
  #include "execution.h"
  #include "factory.h"
-#include "jsregexp.h"
+#include "jsregexp-inl.h"
  #include "platform.h"
  #include "runtime.h"
  #include "top.h"
@@ -825,6 +825,224 @@
      current = Compile(children->at(i), current);
    }
    return current;
+}
+
+
+bool CharacterClass::Contains(uc16 c) {
+  switch (type()) {
+    case FIELD: {
+      int offset = segment_start(segment_);
+      if (c < offset) return false;
+      int index = c - offset;
+      if (index >= CharacterClass::kFieldMax) return false;
+      return (data_.u_field & long_bit(index)) != 0;
+    }
+    case RANGES: {
+      int offset = segment_start(segment_);
+      if (c < offset) return false;
+      bool is_on = false;
+      unsigned i = 0;
+      while (i < count_) {
+        int delta = 0;
+        int nibble = read_nibble(i);
+        int bit_offset = 0;
+        while ((nibble & 0x8) == 0) {
+          delta |= nibble << bit_offset;
+          bit_offset += 3;
+          nibble = read_nibble(++i);
+        }
+        delta |= (nibble & 0x7) << bit_offset;
+        i++;
+        offset += delta;
+        if (c < offset)
+          return is_on;
+        is_on = !is_on;
+      }
+      return false;
+    }
+    case UNION: {
+      return data_.u_union.left->Contains(c)
+          || data_.u_union.right->Contains(c);
+    }
+    default:
+      UNREACHABLE();
+      return false;
+  }
+}
+
+
+template <int kCount>
+CharacterClass* StaticCharacterClassAllocator<kCount>::Allocate() {
+  ASSERT(used_ < kCount);
+  CharacterClass* result = &preallocated_[used_];
+  used_++;
+  return result;
+}
+
+
+class StaticCharacterClasses {
+ public:
+  StaticCharacterClasses();
+  CharacterClass* GetClass(uc16 tag);
+  static StaticCharacterClasses* instance_;
+ private:
+  StaticCharacterClassAllocator<5> static_allocator_;
+  CharacterClass digit_;
+  CharacterClass whitespace_;
+  CharacterClass word_;
+};
+
+
+StaticCharacterClasses* StaticCharacterClasses::instance_ = NULL;
+
+
+StaticCharacterClasses::StaticCharacterClasses() {
+#define MAKE_CLASS(Name)\
+  CharacterClass::Ranges(Vector<CharacterClass::Range>(k##Name##Ranges,\
+                                                        
k##Name##RangeCount), \
+                         &static_allocator_)
+
+  const int kDigitRangeCount = 1;
+  CharacterClass::Range kDigitRanges[kDigitRangeCount] = {
+    CharacterClass::Range('0', '9')
+  };
+  digit_ = MAKE_CLASS(Digit);
+
+  const int kWhiteSpaceRangeCount = 10;
+  CharacterClass::Range kWhiteSpaceRanges[kWhiteSpaceRangeCount] = {
+  CharacterClass::Range(0x0009, 0x0009), CharacterClass::Range(0x000B,  
0x000C),
+  CharacterClass::Range(0x0020, 0x0020), CharacterClass::Range(0x00A0,  
0x00A0),
+  CharacterClass::Range(0x1680, 0x1680), CharacterClass::Range(0x180E,  
0x180E),
+  CharacterClass::Range(0x2000, 0x200A), CharacterClass::Range(0x202F,  
0x202F),
+  CharacterClass::Range(0x205F, 0x205F), CharacterClass::Range(0x3000,  
0x3000)
+  };
+  whitespace_ = MAKE_CLASS(WhiteSpace);
+
+  const int kWordRangeCount = 4;
+  static CharacterClass::Range kWordRanges[kWordRangeCount] = {
+    CharacterClass::Range('0', '9'), CharacterClass::Range('A', 'Z'),
+    CharacterClass::Range('_', '_'), CharacterClass::Range('a', 'z')
+  };
+  word_ = MAKE_CLASS(Word);
+
+#undef MAKE_CLASS
+}
+
+
+CharacterClass* StaticCharacterClasses::GetClass(uc16 tag) {
+  switch (tag) {
+    case 'd':
+      return &digit_;
+    case 's':
+      return &whitespace_;
+    case 'w':
+      return &word_;
+    default:
+      UNREACHABLE();
+      return NULL;
+  }
+}
+
+
+CharacterClass* CharacterClass::GetCharacterClass(uc16 tag) {
+  if (StaticCharacterClasses::instance_ == NULL)
+    StaticCharacterClasses::instance_ = new StaticCharacterClasses();
+  return StaticCharacterClasses::instance_->GetClass(tag);
+}
+
+
+CharacterClass CharacterClass::Ranges(Vector<Range> boundaries,
+                                      CharacterClassAllocator* alloc) {
+  CharacterClass result;
+  result.InitializeRangesFrom(boundaries, alloc);
+  return result;
+}
+
+
+void CharacterClass::InitializeFieldFrom(Vector<Range> boundaries) {
+  ASSERT(boundaries.length() > 0);
+  ASSERT_EQ(EMPTY, type_);
+  type_ = FIELD;
+  segment_ = segment_of(boundaries.first().from());
+  ASSERT_EQ(static_cast<int>(segment_),
+            static_cast<int>(segment_of(boundaries.last().to())));
+  data_.u_field = 0;
+  for (int i = 0; i < boundaries.length(); i++) {
+    int start = boundaries[i].from();
+    int end = boundaries[i].to();
+    // Mask in the range
+    uint64_t t0 = ~static_cast<uint64_t>(0) >> (63 - (end & kSegmentMask));
+    uint64_t t1 = ~static_cast<uint64_t>(0) << (start & kSegmentMask);
+    data_.u_field |= (t0 & t1);
+  }
+}
+
+
+void CharacterClass::InitializeRangesFrom(Vector<Range> boundaries,
+                                          CharacterClassAllocator* alloc) {
+  ASSERT(boundaries.length() > 0);
+  ASSERT_EQ(EMPTY, type_);
+  int start_segment = segment_of(boundaries.first().from());
+  int end_segment = segment_of(boundaries.last().to());
+  if (start_segment == end_segment) {
+    InitializeFieldFrom(boundaries);
+    return;
+  }
+  type_ = RANGES;
+  segment_ = start_segment;
+  data_.u_ranges = 0;
+  int offset = 0;
+  uc16 last_boundary = segment_start(segment_);
+  int i;
+  for (i = 0; i < boundaries.length(); i++) {
+    Range current = boundaries[i];
+    ASSERT(last_boundary <= current.from());
+    int word = current.from() - last_boundary;
+    // We have to repeat the write operation to write both the delta
+    // from the last range to the start of this one, and the delta
+    // from the start to the end of this range.
+    for (int j = 0; j < 2; j++) {
+      do {
+        if (offset >= 16)
+          goto overflow;
+        byte nibble = (word & 0x7);
+        int next_word = word >> 3;
+        if (next_word == 0)
+          nibble |= 0x8;
+        write_nibble(offset, nibble);
+        word = next_word;
+        offset++;
+      } while (word > 0);
+      word = current.to() - current.from() + 1;
+    }
+    count_ = offset;
+    last_boundary = current.to() + 1;
+    continue;
+   overflow:
+    CharacterClass* left = alloc->Allocate();
+    ASSERT(i > 0);
+    if (segment_of(boundaries[i-1].to()) == segment_) {
+      // If what we've written so far fits within a single segment we
+      // can fit it into a bitfield.  Scan forward to see if any more
+      // ranges can be included.
+      while (i < boundaries.length()
+          && segment_of(boundaries[i].to()) == segment_)
+        i++;
+      ASSERT(i < boundaries.length());
+      left->InitializeFieldFrom(boundaries.SubVector(0, i));
+    } else {
+      // Otherwise we copy the work we've done so far into the left
+      // heap-allocated charclass.
+      *left = *this;
+    }
+    CharacterClass* right = alloc->Allocate();
+    right->InitializeRangesFrom(boundaries.SubVector(i,  
boundaries.length()),
+                                alloc);
+    type_ = UNION;
+    data_.u_union.left = left;
+    data_.u_union.right = right;
+    return;
+  }
  }



Modified: branches/experimental/regexp2000/src/jsregexp.h
==============================================================================
--- branches/experimental/regexp2000/src/jsregexp.h     (original)
+++ branches/experimental/regexp2000/src/jsregexp.h     Thu Oct 30 04:11:27 2008
@@ -126,6 +126,117 @@


  template <typename Char> class RegExpNode;
+class CharacterClassAllocator;
+
+
+class CharacterClass {
+ public:
+
+  enum Type { EMPTY = 0, FIELD = 1, RANGES = 2, UNION = 3 };
+
+  // A closed range from and including 'from', to and including 'to'.
+  class Range {
+   public:
+    Range() : from_(0), to_(0) { }
+    Range(uc16 from, uc16 to) : from_(from), to_(to) { ASSERT(from <= to);  
}
+    uc16 from() { return from_; }
+    uc16 to() { return to_; }
+   private:
+    uc16 from_;
+    uc16 to_;
+  };
+
+  CharacterClass() : type_(EMPTY) { }
+
+  explicit CharacterClass(Type type) : type_(type) { }
+
+  bool Contains(uc16 c);
+
+  // Returns a character class with a single bit set
+  static inline CharacterClass SingletonField(uc16 chr);
+
+  // Returns a bitfield character class with a closed range set.  The
+  // range must fit within one field, that is, fit between two adjacent
+  // kFieldMax-aligned boundaries.
+  static inline CharacterClass RangeField(Range range);
+
+  static inline CharacterClass Union(CharacterClass* left,
+                                     CharacterClass* right);
+
+  // Initializes an empty charclass as a bitfield containing the
+  // specified ranges.
+  void InitializeFieldFrom(Vector<Range> ranges);
+
+  // Initializes this character class to be the specified ranges.
+  // This class must be empty.
+  void InitializeRangesFrom(Vector<Range> ranges,
+                            CharacterClassAllocator* alloc);
+
+  // Creates a new character class containing the specified ranges
+  // and allocating any sub-classes using the specified allocator.
+  static CharacterClass Ranges(Vector<Range> boundaries,
+                               CharacterClassAllocator* alloc);
+
+  // Returns one of the built-in character classes such as '\w' or
+  // '\S'.
+  static CharacterClass* GetCharacterClass(uc16 tag);
+
+  inline void write_nibble(int index, byte value);
+  inline byte read_nibble(int index);
+
+  static inline unsigned segment_of(uc16 value);
+  static inline uc16 segment_start(unsigned segment);
+
+ private:
+  static const int kCharSize = 16;
+  static const int kFieldSegmentIndexWidth = 10;
+  static const int kFieldWidth = kCharSize - kFieldSegmentIndexWidth;
+  static const int kFieldMax = (1 << kFieldWidth);
+  static const int kSegmentMask = (1 << kFieldWidth) - 1;
+  static const int kNibbleCount = kFieldMax / 4;
+  STATIC_ASSERT(kFieldMax == 8 * sizeof(uint64_t));
+
+  Type type() { return type_; }
+
+  static inline uint64_t long_bit(int index) {
+    return static_cast<uint64_t>(1) << index;
+  }
+
+  Type type_: 2;
+  unsigned segment_ : 10;
+  unsigned count_ : 4;
+  union {
+    // These have the same type to make it easier to change one without
+    // touching the other.
+    uint64_t u_field;
+    uint64_t u_ranges;
+    struct {
+      CharacterClass* left;
+      CharacterClass* right;
+    } u_union;
+  } data_;
+};
+
+
+STATIC_ASSERT(sizeof(CharacterClass) == 3 * kIntSize);
+
+
+class CharacterClassAllocator {
+ public:
+  virtual CharacterClass* Allocate() = 0;
+  virtual ~CharacterClassAllocator() { }
+};
+
+
+template <int kCount>
+class StaticCharacterClassAllocator: public CharacterClassAllocator {
+ public:
+  StaticCharacterClassAllocator() : used_(0) { }
+  virtual CharacterClass* Allocate();
+ private:
+  int used_;
+  CharacterClass preallocated_[kCount];
+};


  class RegExpEngine: public AllStatic {

Modified: branches/experimental/regexp2000/src/utils.h
==============================================================================
--- branches/experimental/regexp2000/src/utils.h        (original)
+++ branches/experimental/regexp2000/src/utils.h        Thu Oct 30 04:11:27 2008
@@ -283,6 +283,15 @@
      return Vector<T>(NewArray<T>(length), length);
    }

+  // Returns a vector using the same backing storage as this one,
+  // spanning from and including 'from', to but not including 'to'.
+  Vector<T> SubVector(int from, int to) {
+    ASSERT(from < length_);
+    ASSERT(to <= length_);
+    ASSERT(from < to);
+    return Vector<T>(start() + from, to - from);
+  }
+
    // Returns the length of the vector.
    int length() const { return length_; }

@@ -297,6 +306,10 @@
      ASSERT(0 <= index && index < length_);
      return start_[index];
    }
+
+  T& first() { return start_[0]; }
+
+  T& last() { return start_[length_ - 1]; }

    // Returns a clone of this vector with a new backing store.
    Vector<T> Clone() const {

Modified: branches/experimental/regexp2000/test/cctest/test-regexp.cc
==============================================================================
--- branches/experimental/regexp2000/test/cctest/test-regexp.cc (original)
+++ branches/experimental/regexp2000/test/cctest/test-regexp.cc Thu Oct 30  
04:11:27 2008
@@ -26,13 +26,15 @@
  // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


+#include <stdlib.h>
+
  #include "v8.h"

  #include "cctest.h"
  #include "zone-inl.h"
  #include "parser.h"
  #include "ast.h"
-#include "jsregexp.h"
+#include "jsregexp-inl.h"


  using namespace v8::internal;
@@ -268,5 +270,157 @@
  }


-// "123456789abcdb".match(/(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(\11)/)
-// 123456789abcdb,1,2,3,4,5,6,7,8,9,a,b,c,d,b
+TEST(SingletonField) {
+  // Test all bits from 0 to 256
+  for (int i = 0; i < 256; i++) {
+    CharacterClass entry = CharacterClass::SingletonField(i);
+    for (int j = 0; j < 256; j++) {
+      CHECK_EQ(i == j, entry.Contains(j));
+    }
+  }
+  // Test upwards through the data range
+  static const uint32_t kMax = 1 << 16;
+  for (uint32_t i = 0; i < kMax; i = 1 + static_cast<uint32_t>(i * 1.2)) {
+    CharacterClass entry = CharacterClass::SingletonField(i);
+    for (uint32_t j = 0; j < kMax; j = 1 + static_cast<uint32_t>(j * 1.2))  
{
+      CHECK_EQ(i == j, entry.Contains(j));
+    }
+  }
+}
+
+
+TEST(RangeField) {
+  // Test bitfields containing a single range.
+  for (int i = 256; i < 320; i++) {
+    for (int j = i; j < 320; j++) {
+      CharacterClass::Range range(i, j);
+      CharacterClass entry = CharacterClass::RangeField(range);
+      for (int k = 256; k < 320; k++) {
+        CHECK_EQ(i <= k && k <= j, entry.Contains(k));
+      }
+    }
+  }
+}
+
+
+static void TestBuiltins(CharacterClass klass, bool (pred)(uc16)) {
+  for (int i = 0; i < (1 << 16); i++)
+    CHECK_EQ(pred(i), klass.Contains(i));
+}
+
+
+static bool IsDigit(uc16 c) {
+  return ('0' <= c && c <= '9');
+}
+
+
+static bool IsWhiteSpace(uc16 c) {
+  switch (c) {
+    case 0x09: case 0x0B: case 0x0C: case 0x20: case 0xA0:
+      return true;
+    default:
+      return unibrow::Space::Is(c);
+  }
+}
+
+
+static bool IsWord(uc16 c) {
+  return ('a' <= c && c <= 'z')
+      || ('A' <= c && c <= 'Z')
+      || ('0' <= c && c <= '9')
+      || (c == '_');
+}
+
+
+TEST(Builtins) {
+  TestBuiltins(*CharacterClass::GetCharacterClass('d'), IsDigit);
+  TestBuiltins(*CharacterClass::GetCharacterClass('s'), IsWhiteSpace);
+  TestBuiltins(*CharacterClass::GetCharacterClass('w'), IsWord);
+}
+
+
+TEST(SimpleRanges) {
+  // Test range classes containing a single range.
+  for (int i = 365; i < 730; i += 3) {
+    for (int j = i; j < 1095; j += 3) {
+      EmbeddedVector<CharacterClass::Range, 1> entries;
+      entries[0] = CharacterClass::Range(i, j);
+      CharacterClass klass = CharacterClass::Ranges(entries, NULL);
+      for (int k = 0; k < 1095; k += 3) {
+        CHECK_EQ(i <= k && k <= j, klass.Contains(k));
+      }
+    }
+  }
+}
+
+
+static unsigned PseudoRandom(int i, int j) {
+  return (~((i * 781) ^ (j * 329)));
+}
+
+
+// Generates pseudo-random character-class with kCount pseudo-random
+// ranges set.
+template <int kCount>
+class SimpleRangeGenerator {
+ public:
+  SimpleRangeGenerator() : i_(0) { }
+
+  // Returns the next character class and sets the ranges vector.  The
+  // returned value will not have any ranges that extend beyond the 'max'
+  // value.
+  CharacterClass Next(int max) {
+    for (int j = 0; j < 2 * kCount; j++) {
+      entries_[j] = PseudoRandom(i_, j) % max;
+    }
+    i_++;
+    qsort(entries_.start(), 2 * kCount, sizeof(uc16), Compare);
+    for (int j = 0; j < kCount; j++) {
+      ranges_[j] = CharacterClass::Range(entries_[2 * j],
+                                         entries_[2 * j + 1]);
+    }
+    return CharacterClass::Ranges(ranges_, NULL);
+  }
+
+  // Returns the ranges of the last range that was returned.  Note that
+  // the returned vector will be clobbered the next time Next() is called.
+  Vector<CharacterClass::Range> ranges() { return ranges_; }
+ private:
+
+  static int Compare(const void* a, const void* b) {
+    return *static_cast<const uc16*>(a) - *static_cast<const uc16*>(b);
+  }
+
+  int i_;
+  EmbeddedVector<uc16, 2 * kCount> entries_;
+  EmbeddedVector<CharacterClass::Range, kCount> ranges_;
+};
+
+
+TEST(LessSimpleRanges) {
+  // Tests range character classes containing 3 pseudo-random ranges.
+  SimpleRangeGenerator<3> gen;
+  for (int i = 0; i < 1024; i++) {
+    CharacterClass klass = gen.Next(256);
+    Vector<CharacterClass::Range> entries = gen.ranges();
+    for (int j = 0; j < 256; j++) {
+      bool is_on = false;
+      for (int k = 0; !is_on && k < 3; k++)
+        is_on = (entries[k].from() <= j && j <= entries[k].to());
+      CHECK_EQ(is_on, klass.Contains(j));
+    }
+  }
+}
+
+
+TEST(Unions) {
+  SimpleRangeGenerator<3> gen1;
+  SimpleRangeGenerator<3> gen2;
+  for (int i = 0; i < 1024; i++) {
+    CharacterClass klass1 = gen1.Next(256);
+    CharacterClass klass2 = gen2.Next(256);
+    CharacterClass uhnion = CharacterClass::Union(&klass1, &klass2);
+    for (int j = 0; j < 256; j++)
+      CHECK_EQ(klass1.Contains(j) || klass2.Contains(j),  
uhnion.Contains(j));
+  }
+}

--~--~---------~--~----~------------~-------~--~----~
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
-~----------~----~----~----~------~----~------~--~---

Reply via email to