Revision: 5325
Author: [email protected]
Date: Tue Aug 24 03:53:44 2010
Log: Created collector class and used it to collect identifiers during scanning.

The collector class automatically expands to hold the values added to it,
like a List, but doesn't ensure that the backing store is contiguous, which
allows it to avoid copying back and forth as the buffer grows.

This is in preparation for identifyng identical symbols during preparsing.

Review URL: http://codereview.chromium.org/3181036
http://code.google.com/p/v8/source/detail?r=5325

Modified:
 /branches/bleeding_edge/src/conversions.cc
 /branches/bleeding_edge/src/conversions.h
 /branches/bleeding_edge/src/parser.cc
 /branches/bleeding_edge/src/runtime.cc
 /branches/bleeding_edge/src/runtime.h
 /branches/bleeding_edge/src/scanner.cc
 /branches/bleeding_edge/src/scanner.h
 /branches/bleeding_edge/src/utils.h
 /branches/bleeding_edge/test/cctest/test-utils.cc

=======================================
--- /branches/bleeding_edge/src/conversions.cc  Wed May  5 06:51:27 2010
+++ /branches/bleeding_edge/src/conversions.cc  Tue Aug 24 03:53:44 2010
@@ -733,9 +733,16 @@

double StringToDouble(const char* str, int flags, double empty_string_val) {
   const char* end = str + StrLength(str);
-
   return InternalStringToDouble(str, end, flags, empty_string_val);
 }
+
+
+double StringToDouble(Vector<const char> str,
+                      int flags,
+                      double empty_string_val) {
+  const char* end = str.start() + str.length();
+  return InternalStringToDouble(str.start(), end, flags, empty_string_val);
+}


 extern "C" char* dtoa(double d, int mode, int ndigits,
=======================================
--- /branches/bleeding_edge/src/conversions.h   Wed Mar 31 10:19:05 2010
+++ /branches/bleeding_edge/src/conversions.h   Tue Aug 24 03:53:44 2010
@@ -96,8 +96,12 @@


 // Converts a string into a double value according to ECMA-262 9.3.1
-double StringToDouble(const char* str, int flags, double empty_string_val = 0);
 double StringToDouble(String* str, int flags, double empty_string_val = 0);
+double StringToDouble(Vector<const char> str,
+                      int flags,
+                      double empty_string_val = 0);
+// This version expects a zero-terminated character array.
+double StringToDouble(const char* str, int flags, double empty_string_val = 0);

 // Converts a string into an integer.
 double StringToInt(String* str, int radix);
=======================================
--- /branches/bleeding_edge/src/parser.cc       Tue Aug 24 02:04:17 2010
+++ /branches/bleeding_edge/src/parser.cc       Tue Aug 24 03:53:44 2010
@@ -3378,7 +3378,7 @@
     case Token::NUMBER: {
       Consume(Token::NUMBER);
       double value =
- StringToDouble(scanner_.literal_string(), ALLOW_HEX | ALLOW_OCTALS);
+        StringToDouble(scanner_.literal(), ALLOW_HEX | ALLOW_OCTALS);
       result = NewNumberLiteral(value);
       break;
     }
@@ -3736,7 +3736,7 @@
       case Token::NUMBER: {
         Consume(Token::NUMBER);
         double value =
- StringToDouble(scanner_.literal_string(), ALLOW_HEX | ALLOW_OCTALS);
+          StringToDouble(scanner_.literal(), ALLOW_HEX | ALLOW_OCTALS);
         key = NewNumberLiteral(value);
         break;
       }
@@ -3989,7 +3989,7 @@
   Expect(Token::MOD, CHECK_OK);
   Handle<String> name = ParseIdentifier(CHECK_OK);
   Runtime::Function* function =
-      Runtime::FunctionForName(scanner_.literal_string());
+      Runtime::FunctionForName(scanner_.literal());
   ZoneList<Expression*>* args = ParseArguments(CHECK_OK);
   if (function == NULL && extension_ != NULL) {
     // The extension structures are only accessible while parsing the
@@ -4276,7 +4276,7 @@
     case Token::NUMBER: {
       Consume(Token::NUMBER);
       ASSERT(scanner_.literal_length() > 0);
-      double value = StringToDouble(scanner_.literal_string(),
+      double value = StringToDouble(scanner_.literal(),
NO_FLAGS, // Hex, octal or trailing junk.
                                     OS::nan_value());
       return NewNumberLiteral(value);
=======================================
--- /branches/bleeding_edge/src/runtime.cc      Mon Aug 23 06:58:56 2010
+++ /branches/bleeding_edge/src/runtime.cc      Tue Aug 24 03:53:44 2010
@@ -10688,9 +10688,10 @@
 }


-Runtime::Function* Runtime::FunctionForName(const char* name) {
+Runtime::Function* Runtime::FunctionForName(Vector<const char> name) {
   for (Function* f = Runtime_functions; f->name != NULL; f++) {
-    if (strcmp(f->name, name) == 0) {
+    if (strncmp(f->name, name.start(), name.length()) == 0
+        && f->name[name.length()] == 0) {
       return f;
     }
   }
=======================================
--- /branches/bleeding_edge/src/runtime.h       Fri Aug 20 02:37:22 2010
+++ /branches/bleeding_edge/src/runtime.h       Tue Aug 24 03:53:44 2010
@@ -419,7 +419,7 @@
   static Function* FunctionForId(FunctionId fid);

   // Get the runtime function with the given name.
-  static Function* FunctionForName(const char* name);
+  static Function* FunctionForName(Vector<const char> name);

static int StringMatch(Handle<String> sub, Handle<String> pat, int index);

=======================================
--- /branches/bleeding_edge/src/scanner.cc      Tue Jul 13 03:29:31 2010
+++ /branches/bleeding_edge/src/scanner.cc      Tue Aug 24 03:53:44 2010
@@ -50,35 +50,22 @@
// ----------------------------------------------------------------------------
 // UTF8Buffer

-UTF8Buffer::UTF8Buffer() : data_(NULL), limit_(NULL) { }
+UTF8Buffer::UTF8Buffer() : buffer_(kInitialCapacity) { }


-UTF8Buffer::~UTF8Buffer() {
-  if (data_ != NULL) DeleteArray(data_);
-}
+UTF8Buffer::~UTF8Buffer() {}


 void UTF8Buffer::AddCharSlow(uc32 c) {
-  static const int kCapacityGrowthLimit = 1 * MB;
-  if (cursor_ > limit_) {
-    int old_capacity = Capacity();
-    int old_position = pos();
-    int new_capacity =
-        Min(old_capacity * 3, old_capacity + kCapacityGrowthLimit);
-    char* new_data = NewArray<char>(new_capacity);
-    memcpy(new_data, data_, old_position);
-    DeleteArray(data_);
-    data_ = new_data;
-    cursor_ = new_data + old_position;
-    limit_ = ComputeLimit(new_data, new_capacity);
-    ASSERT(Capacity() == new_capacity && pos() == old_position);
-  }
-  if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
-    *cursor_++ = c;  // Common case: 7-bit ASCII.
-  } else {
-    cursor_ += unibrow::Utf8::Encode(cursor_, c);
-  }
-  ASSERT(pos() <= Capacity());
+  ASSERT(static_cast<unsigned>(c) > unibrow::Utf8::kMaxOneByteChar);
+  int length = unibrow::Utf8::Length(c);
+  Vector<char> block = buffer_.AddBlock(length, '\0');
+#ifdef DEBUG
+  int written_length = unibrow::Utf8::Encode(block.start(), c);
+  CHECK_EQ(length, written_length);
+#else
+  unibrow::Utf8::Encode(block.start(), c);
+#endif
 }


@@ -399,8 +386,8 @@
   // Set c0_ (one character ahead)
   ASSERT(kCharacterLookaheadBufferSize == 1);
   Advance();
-  // Initializer current_ to not refer to a literal buffer.
-  current_.literal_buffer = NULL;
+  // Initialise current_ to not refer to a literal.
+  current_.literal_chars = Vector<const char>();

   // Skip initial whitespace allowing HTML comment ends just like
   // after a newline and scan first token.
@@ -428,24 +415,16 @@


 void Scanner::StartLiteral() {
- // Use the first buffer unless it's currently in use by the current_ token.
-  // In most cases we won't have two literals/identifiers in a row, so
- // the second buffer won't be used very often and is unlikely to grow much.
-  UTF8Buffer* free_buffer =
-      (current_.literal_buffer != &literal_buffer_1_) ? &literal_buffer_1_
-                                                      : &literal_buffer_2_;
-  next_.literal_buffer = free_buffer;
-  free_buffer->Reset();
+  literal_buffer_.StartLiteral();
 }


 void Scanner::AddChar(uc32 c) {
-  next_.literal_buffer->AddChar(c);
-}
-
+  literal_buffer_.AddChar(c);
+}

 void Scanner::TerminateLiteral() {
-  AddChar(0);
+  next_.literal_chars = literal_buffer_.EndLiteral();
 }


@@ -575,7 +554,7 @@


 void Scanner::ScanJson() {
-  next_.literal_buffer = NULL;
+  next_.literal_chars = Vector<const char>();
   Token::Value token;
   has_line_terminator_before_next_ = false;
   do {
@@ -761,7 +740,7 @@


 void Scanner::ScanJavaScript() {
-  next_.literal_buffer = NULL;
+  next_.literal_chars = Vector<const char>();
   Token::Value token;
   has_line_terminator_before_next_ = false;
   do {
=======================================
--- /branches/bleeding_edge/src/scanner.h       Tue Jun 22 05:31:24 2010
+++ /branches/bleeding_edge/src/scanner.h       Tue Aug 24 03:53:44 2010
@@ -40,45 +40,36 @@
   UTF8Buffer();
   ~UTF8Buffer();

-  void AddChar(uc32 c) {
-    ASSERT_NOT_NULL(data_);
-    if (cursor_ <= limit_ &&
-        static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
-      *cursor_++ = static_cast<char>(c);
+  inline void AddChar(uc32 c) {
+    if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
+      buffer_.Add(static_cast<char>(c));
     } else {
       AddCharSlow(c);
     }
   }

-  void Reset() {
-    if (data_ == NULL) {
-      data_ = NewArray<char>(kInitialCapacity);
-      limit_ = ComputeLimit(data_, kInitialCapacity);
-    }
-    cursor_ = data_;
+  void StartLiteral() {
+    buffer_.StartSequence();
   }

-  int pos() const {
-    ASSERT_NOT_NULL(data_);
-    return static_cast<int>(cursor_ - data_);
+  Vector<const char> EndLiteral() {
+    buffer_.Add(kEndMarker);
+    Vector<char> sequence = buffer_.EndSequence();
+    return Vector<const char>(sequence.start(), sequence.length());
   }

-  char* data() const { return data_; }
-
+  // The end marker added after a parsed literal.
+  // Using zero allows the usage of strlen and similar functions on
+  // identifiers and numbers (but not strings, since they may contain zero
+  // bytes).
+  // TODO(lrn): Use '\xff' as end marker, since it cannot occur inside
+  // an utf-8 string. This requires changes in all places that uses
+ // str-functions on the literals, but allows a single pointer to represent
+  // the literal, even if it contains embedded zeros.
+  static const char kEndMarker = '\x00';
  private:
   static const int kInitialCapacity = 256;
-  char* data_;
-  char* cursor_;
-  char* limit_;
-
-  int Capacity() const {
-    ASSERT_NOT_NULL(data_);
- return static_cast<int>(limit_ - data_) + unibrow::Utf8::kMaxEncodedSize;
-  }
-
-  static char* ComputeLimit(char* data, int capacity) {
-    return (data + capacity) - unibrow::Utf8::kMaxEncodedSize;
-  }
+  SequenceCollector<char> buffer_;

   void AddCharSlow(uc32 c);
 };
@@ -314,27 +305,34 @@
   // These functions only give the correct result if the literal
   // was scanned between calls to StartLiteral() and TerminateLiteral().
   const char* literal_string() const {
-    return current_.literal_buffer->data();
-  }
+    return current_.literal_chars.start();
+  }
+
   int literal_length() const {
-    // Excluding terminal '\0' added by TerminateLiteral().
-    return current_.literal_buffer->pos() - 1;
+    // Excluding terminal '\x00' added by TerminateLiteral().
+    return current_.literal_chars.length() - 1;
+  }
+
+  Vector<const char> literal() const {
+    return Vector<const char>(literal_string(), literal_length());
   }

   // Returns the literal string for the next token (the token that
   // would be returned if Next() were called).
   const char* next_literal_string() const {
-    return next_.literal_buffer->data();
-  }
+    return next_.literal_chars.start();
+  }
+
+
   // Returns the length of the next token (that would be returned if
   // Next() were called).
   int next_literal_length() const {
-    return next_.literal_buffer->pos() - 1;
+    // Excluding terminal '\x00' added by TerminateLiteral().
+    return next_.literal_chars.length() - 1;
   }

   Vector<const char> next_literal() const {
-    return Vector<const char>(next_literal_string(),
-                              next_literal_length());
+ return Vector<const char>(next_literal_string(), next_literal_length());
   }

   // Scans the input as a regular expression pattern, previous
@@ -371,7 +369,7 @@
   struct TokenDesc {
     Token::Value token;
     Location location;
-    UTF8Buffer* literal_buffer;
+    Vector<const char> literal_chars;
   };

   void Init(Handle<String> source,
@@ -380,10 +378,10 @@
             ParserLanguage language);

   // Literal buffer support
-  void StartLiteral();
-  void AddChar(uc32 ch);
-  void AddCharAdvance();
-  void TerminateLiteral();
+  inline void StartLiteral();
+  inline void AddChar(uc32 ch);
+  inline void AddCharAdvance();
+  inline void TerminateLiteral();

   // Low-level scanning support.
   void Advance() { c0_ = source_->Advance(); }
@@ -487,9 +485,8 @@
   SafeStringInputBuffer safe_string_input_buffer_;

   // Buffer to hold literal values (identifiers, strings, numbers)
-  // using 0-terminated UTF-8 encoding.
-  UTF8Buffer literal_buffer_1_;
-  UTF8Buffer literal_buffer_2_;
+  // using '\x00'-terminated UTF-8 encoding. Handles allocation internally.
+  UTF8Buffer literal_buffer_;

   bool stack_overflow_;
   static StaticResource<Utf8Decoder> utf8_decoder_;
=======================================
--- /branches/bleeding_edge/src/utils.h Thu Aug 19 01:49:26 2010
+++ /branches/bleeding_edge/src/utils.h Tue Aug 24 03:53:44 2010
@@ -474,6 +474,185 @@
   return Vector< Handle<Object> >(
       reinterpret_cast<v8::internal::Handle<Object>*>(elms), length);
 }
+
+
+/*
+ * A class that collects values into a backing store.
+ * Specialized versions of the class can allow access to the backing store
+ * in different ways.
+ * There is no guarantee that the backing store is contiguous (and, as a
+ * consequence, no guarantees that consecutively added elements are adjacent
+ * in memory). The collector may move elements unless it has guaranteed not
+ * to.
+ */
+template <typename T>
+class Collector {
+ public:
+  Collector(int initial_capacity = kMinCapacity,
+            int growth_factor = 2,
+            int max_growth = 1 * MB)
+      : growth_factor_(growth_factor), max_growth_(max_growth) {
+    if (initial_capacity < kMinCapacity) {
+      initial_capacity = kMinCapacity;
+    }
+    current_chunk_ = NewArray<T>(initial_capacity);
+    current_capacity_ = initial_capacity;
+    index_ = 0;
+  }
+
+  virtual ~Collector() {
+    // Free backing store (in reverse allocation order).
+    DeleteArray(current_chunk_);
+    for (int i = chunks_.length() - 1; i >= 0; i--) {
+      chunks_.at(i).Dispose();
+    }
+  }
+
+  // Add a single element.
+  inline void Add(T value) {
+    if (index_ >= current_capacity_) {
+      Grow(1);
+    }
+    current_chunk_[index_] = value;
+    index_++;
+  }
+
+  // Add a block of contiguous elements and return a Vector backed by the
+  // memory area.
+  // A basic Collector will keep this vector valid as long as the Collector
+  // is alive.
+  inline Vector<T> AddBlock(int size, T initial_value) {
+    if (index_ + size > current_capacity_) {
+      Grow(size);
+    }
+    T* position = current_chunk_ + index_;
+    index_ += size;
+    for (int i = 0; i < size; i++) {
+      position[i] = initial_value;
+    }
+    return Vector<T>(position, size);
+  }
+
+
+  // Allocate a single contiguous vector, copy all the collected
+  // elements to the vector, and return it.
+  // The caller is responsible for freeing the memory of the returned
+  // vector (e.g., using Vector::Dispose).
+  Vector<T> ToVector() {
+    // Find the total length.
+    int total_length = index_;
+    for (int i = 0; i < chunks_.length(); i++) {
+      total_length += chunks_.at(i).length();
+    }
+    T* new_store = NewArray<T>(total_length);
+    int position = 0;
+    for (int i = 0; i < chunks_.length(); i++) {
+      Vector<T> chunk = chunks_.at(i);
+      for (int j = 0; j < chunk.length(); j++) {
+        new_store[position] = chunk[j];
+        position++;
+      }
+    }
+    for (int i = 0; i < index_; i++) {
+      new_store[position] = current_chunk_[i];
+      position++;
+    }
+    return Vector<T>(new_store, total_length);
+  }
+
+ protected:
+  static const int kMinCapacity = 16;
+  List<Vector<T> > chunks_;
+  T* current_chunk_;
+  int growth_factor_;
+  int max_growth_;
+  int current_capacity_;
+  int index_;
+
+ // Creates a new current chunk, and stores the old chunk in the chunks_ list.
+  void Grow(int min_capacity) {
+    ASSERT(growth_factor_ > 1);
+    int growth = current_capacity_ * (growth_factor_ - 1);
+    if (growth > max_growth_) {
+      growth = max_growth_;
+    }
+    int new_capacity = current_capacity_ + growth;
+    if (new_capacity < min_capacity) {
+      new_capacity = min_capacity;
+    }
+    T* new_chunk = NewArray<T>(new_capacity);
+    int new_index = PrepareGrow(Vector<T>(new_chunk, new_capacity));
+    chunks_.Add(Vector<T>(current_chunk_, index_));
+    current_chunk_ = new_chunk;
+    current_capacity_ = new_capacity;
+    index_ = new_index;
+    ASSERT(index_ + min_capacity <= current_capacity_);
+  }
+
+  // Before replacing the current chunk, give a subclass the option to move
+  // some of the current data into the new chunk. The function may update
+ // the current index_ value to represent data no longer in the current chunk.
+  // Returns the initial index of the new chunk (after copied data).
+  virtual int PrepareGrow(Vector<T> new_chunk)  {
+    return 0;
+  }
+};
+
+
+/*
+ * A collector that allows sequences of values to be guaranteed to
+ * stay consecutive.
+ * If the backing store grows while a sequence is active, the current
+ * sequence might be moved, but after the sequence is ended, it will
+ * not move again.
+ * NOTICE: Blocks allocated using Collector::AddBlock(int) can move
+ * as well, if inside an active sequence where another element is added.
+ */
+template <typename T>
+class SequenceCollector : public Collector<T> {
+ public:
+  SequenceCollector(int initial_capacity,
+                    int growth_factor = 2,
+                    int max_growth = 1 * MB)
+      : Collector<T>(initial_capacity, growth_factor, max_growth),
+        sequence_start_(kNoSequence) { }
+
+  virtual ~SequenceCollector() {}
+
+  void StartSequence() {
+    ASSERT(sequence_start_ == kNoSequence);
+    sequence_start_ = this->index_;
+  }
+
+  Vector<T> EndSequence() {
+    ASSERT(sequence_start_ != kNoSequence);
+    int sequence_start = sequence_start_;
+    sequence_start_ = kNoSequence;
+    return Vector<T>(this->current_chunk_ + sequence_start,
+                     this->index_ - sequence_start);
+  }
+
+ private:
+  static const int kNoSequence = -1;
+  int sequence_start_;
+
+  // Move the currently active sequence to the new chunk.
+  virtual int PrepareGrow(Vector<T> new_chunk) {
+    if (sequence_start_ != kNoSequence) {
+      int sequence_length = this->index_ - sequence_start_;
+      // The new chunk is always larger than the current chunk, so there
+      // is room for the copy.
+      ASSERT(sequence_length < new_chunk.length());
+      for (int i = 0; i < sequence_length; i++) {
+        new_chunk[i] = this->current_chunk_[sequence_start_ + i];
+      }
+      this->index_ = sequence_start_;
+      sequence_start_ = 0;
+      return sequence_length;
+    }
+    return 0;
+  }
+};


 // Simple support to read a file into a 0-terminated C-string.
=======================================
--- /branches/bleeding_edge/test/cctest/test-utils.cc Fri Jun 4 04:30:55 2010 +++ /branches/bleeding_edge/test/cctest/test-utils.cc Tue Aug 24 03:53:44 2010
@@ -131,3 +131,64 @@
   buffer2.Dispose();
   buffer1.Dispose();
 }
+
+
+TEST(Collector) {
+  Collector<int> collector(8);
+  const int kLoops = 5;
+  const int kSequentialSize = 1000;
+  const int kBlockSize = 7;
+  for (int loop = 0; loop < kLoops; loop++) {
+    Vector<int> block = collector.AddBlock(7, 0xbadcafe);
+    for (int i = 0; i < kSequentialSize; i++) {
+      collector.Add(i);
+    }
+    for (int i = 0; i < kBlockSize - 1; i++) {
+      block[i] = i * 7;
+    }
+  }
+  Vector<int> result = collector.ToVector();
+  CHECK_EQ(kLoops * (kBlockSize + kSequentialSize), result.length());
+  for (int i = 0; i < kLoops; i++) {
+    int offset = i * (kSequentialSize + kBlockSize);
+    for (int j = 0; j < kBlockSize - 1; j++) {
+      CHECK_EQ(j * 7, result[offset + j]);
+    }
+    CHECK_EQ(0xbadcafe, result[offset + kBlockSize - 1]);
+    for (int j = 0; j < kSequentialSize; j++) {
+      CHECK_EQ(j, result[offset + kBlockSize + j]);
+    }
+  }
+  result.Dispose();
+}
+
+
+TEST(SequenceCollector) {
+  SequenceCollector<int> collector(8);
+  const int kLoops = 5000;
+  const int kMaxSequenceSize = 13;
+  int total_length = 0;
+  for (int loop = 0; loop < kLoops; loop++) {
+    int seq_length = loop % kMaxSequenceSize;
+    collector.StartSequence();
+    for (int j = 0; j < seq_length; j++) {
+      collector.Add(j);
+    }
+    Vector<int> sequence = collector.EndSequence();
+    for (int j = 0; j < seq_length; j++) {
+      CHECK_EQ(j, sequence[j]);
+    }
+    total_length += seq_length;
+  }
+  Vector<int> result = collector.ToVector();
+  CHECK_EQ(total_length, result.length());
+  int offset = 0;
+  for (int loop = 0; loop < kLoops; loop++) {
+    int seq_length = loop % kMaxSequenceSize;
+    for (int j = 0; j < seq_length; j++) {
+      CHECK_EQ(j, result[offset]);
+      offset++;
+    }
+  }
+  result.Dispose();
+}

--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev

Reply via email to