Revision: 13842
Author: [email protected]
Date: Wed Mar 6 07:39:57 2013
Log: Added back some utf8 optimizations
[email protected]
BUG=https://code.google.com/p/v8/issues/detail?id=2551
Review URL: https://codereview.chromium.org/12390057
http://code.google.com/p/v8/source/detail?r=13842
Modified:
/branches/bleeding_edge/src/api.cc
/branches/bleeding_edge/src/objects-inl.h
/branches/bleeding_edge/src/objects.h
=======================================
--- /branches/bleeding_edge/src/api.cc Thu Feb 28 09:03:34 2013
+++ /branches/bleeding_edge/src/api.cc Wed Mar 6 07:39:57 2013
@@ -3831,68 +3831,200 @@
}
-class Utf8LengthVisitor {
+class Utf8LengthHelper : public i::AllStatic {
public:
- explicit Utf8LengthVisitor()
- : utf8_length_(0),
- last_character_(unibrow::Utf16::kNoPreviousCharacter) {}
+ enum State {
+ kEndsWithLeadingSurrogate = 1 << 0,
+ kStartsWithTrailingSurrogate = 1 << 1,
+ kLeftmostEdgeIsCalculated = 1 << 2,
+ kRightmostEdgeIsCalculated = 1 << 3,
+ kLeftmostEdgeIsSurrogate = 1 << 4,
+ kRightmostEdgeIsSurrogate = 1 << 5
+ };
- inline int GetLength() {
- return utf8_length_;
+ static const uint8_t kInitialState = 0;
+
+ static inline bool EndsWithSurrogate(uint8_t state) {
+ return state & kEndsWithLeadingSurrogate;
}
- template<typename Char>
- inline void Visit(const Char* chars, unsigned length) {
- ASSERT(length > 0);
- // TODO(dcarney) Add back ascii fast path.
- int utf8_length = 0;
- int last_character = last_character_;
- for (unsigned i = 0; i < length; i++) {
- uint16_t c = chars[i];
- utf8_length += unibrow::Utf8::Length(c, last_character);
- last_character = c;
+ static inline bool StartsWithSurrogate(uint8_t state) {
+ return state & kStartsWithTrailingSurrogate;
+ }
+
+ class Visitor {
+ public:
+ explicit Visitor()
+ : utf8_length_(0),
+ state_(kInitialState) {}
+
+ template<typename Char>
+ inline void Visit(const Char* chars, int length) {
+ int utf8_length = 0;
+ int last_character = unibrow::Utf16::kNoPreviousCharacter;
+ for (int i = 0; i < length; i++) {
+ uint16_t c = chars[i];
+ utf8_length += unibrow::Utf8::Length(c, last_character);
+ if (sizeof(Char) > 1) {
+ last_character = c;
+ }
+ }
+ utf8_length_ = utf8_length;
}
- last_character_ = last_character;
- utf8_length_ += utf8_length;
+
+ void VisitOneByteString(const uint8_t* chars, int length) {
+ Visit(chars, length);
+ state_ = kInitialState;
+ }
+
+ void VisitTwoByteString(const uint16_t* chars, int length) {
+ Visit(chars, length);
+ uint8_t state = 0;
+ if (unibrow::Utf16::IsTrailSurrogate(chars[0])) {
+ state |= kStartsWithTrailingSurrogate;
+ }
+ if (unibrow::Utf16::IsLeadSurrogate(chars[length-1])) {
+ state |= kEndsWithLeadingSurrogate;
+ }
+ state_ = state;
+ }
+
+ static i::ConsString* VisitFlat(i::String* string,
+ int* length,
+ uint8_t* state) {
+ Visitor visitor;
+ i::ConsString* cons_string = i::String::VisitFlat(&visitor, string);
+ *length = visitor.utf8_length_;
+ *state = visitor.state_;
+ return cons_string;
+ }
+
+ private:
+ int utf8_length_;
+ uint8_t state_;
+ DISALLOW_COPY_AND_ASSIGN(Visitor);
+ };
+
+ static inline void MergeLeafLeft(int* length,
+ uint8_t* state,
+ uint8_t leaf_state) {
+ bool edge_surrogate = StartsWithSurrogate(leaf_state);
+ if (!(*state & kLeftmostEdgeIsCalculated)) {
+ ASSERT(!(*state & kLeftmostEdgeIsSurrogate));
+ *state |= kLeftmostEdgeIsCalculated
+ | (edge_surrogate ? kLeftmostEdgeIsSurrogate : 0);
+ } else if (EndsWithSurrogate(*state) && edge_surrogate) {
+ *length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
+ }
+ if (EndsWithSurrogate(leaf_state)) {
+ *state |= kEndsWithLeadingSurrogate;
+ } else {
+ *state &= ~kEndsWithLeadingSurrogate;
+ }
}
- inline void VisitOneByteString(const uint8_t* chars, unsigned length) {
- Visit(chars, length);
+ static inline void MergeLeafRight(int* length,
+ uint8_t* state,
+ uint8_t leaf_state) {
+ bool edge_surrogate = EndsWithSurrogate(leaf_state);
+ if (!(*state & kRightmostEdgeIsCalculated)) {
+ ASSERT(!(*state & kRightmostEdgeIsSurrogate));
+ *state |= (kRightmostEdgeIsCalculated
+ | (edge_surrogate ? kRightmostEdgeIsSurrogate : 0));
+ } else if (edge_surrogate && StartsWithSurrogate(*state)) {
+ *length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
+ }
+ if (StartsWithSurrogate(leaf_state)) {
+ *state |= kStartsWithTrailingSurrogate;
+ } else {
+ *state &= ~kStartsWithTrailingSurrogate;
+ }
}
- inline void VisitTwoByteString(const uint16_t* chars, unsigned length) {
- Visit(chars, length);
+ static inline void MergeTerminal(int* length,
+ uint8_t state,
+ uint8_t* state_out) {
+ ASSERT((state & kLeftmostEdgeIsCalculated) &&
+ (state & kRightmostEdgeIsCalculated));
+ if (EndsWithSurrogate(state) && StartsWithSurrogate(state)) {
+ *length -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
+ }
+ *state_out = kInitialState |
+ (state & kLeftmostEdgeIsSurrogate ? kStartsWithTrailingSurrogate :
0) |
+ (state & kRightmostEdgeIsSurrogate ? kEndsWithLeadingSurrogate :
0);
+ }
+
+ static int Calculate(i::ConsString* current, uint8_t* state_out) {
+ using namespace internal;
+ int total_length = 0;
+ uint8_t state = kInitialState;
+ while (true) {
+ i::String* left = current->first();
+ i::String* right = current->second();
+ uint8_t right_leaf_state;
+ uint8_t left_leaf_state;
+ int leaf_length;
+ ConsString* left_as_cons =
+ Visitor::VisitFlat(left, &leaf_length, &left_leaf_state);
+ if (left_as_cons == NULL) {
+ total_length += leaf_length;
+ MergeLeafLeft(&total_length, &state, left_leaf_state);
+ }
+ ConsString* right_as_cons =
+ Visitor::VisitFlat(right, &leaf_length, &right_leaf_state);
+ if (right_as_cons == NULL) {
+ total_length += leaf_length;
+ MergeLeafRight(&total_length, &state, right_leaf_state);
+ // Terminal node.
+ if (left_as_cons == NULL) {
+ MergeTerminal(&total_length, state, state_out);
+ return total_length;
+ }
+ } else if (left_as_cons != NULL) {
+ // Both strings are ConsStrings.
+ // Recurse on smallest.
+ if (left->length() < right->length()) {
+ total_length += Calculate(left_as_cons, &left_leaf_state);
+ MergeLeafLeft(&total_length, &state, left_leaf_state);
+ current = right_as_cons;
+ continue;
+ } else {
+ total_length += Calculate(right_as_cons, &right_leaf_state);
+ MergeLeafRight(&total_length, &state, right_leaf_state);
+ current = left_as_cons;
+ continue;
+ }
+ }
+ // 1 leaf node. Do in place descent.
+ if (left_as_cons != NULL) {
+ current = left_as_cons;
+ } else {
+ ASSERT(right_as_cons != NULL);
+ current = right_as_cons;
+ }
+ }
+ UNREACHABLE();
+ return 0;
+ }
+
+ static inline int Calculate(i::ConsString* current) {
+ uint8_t state = kInitialState;
+ return Calculate(current, &state);
}
private:
- int utf8_length_;
- int last_character_;
- DISALLOW_COPY_AND_ASSIGN(Utf8LengthVisitor);
+ DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8LengthHelper);
};
static int Utf8Length(i::String* str, i::Isolate* isolate) {
- unsigned length = static_cast<unsigned>(str->length());
+ int length = str->length();
if (length == 0) return 0;
- int32_t type = str->map()->instance_type();
- Utf8LengthVisitor visitor;
- // Non ConsString branch.
- if ((type & i::kStringRepresentationMask) != i::kConsStringTag) {
- i::ConsStringNullOp null_op;
- i::String::Visit(str, 0, visitor, null_op, type, length);
- return visitor.GetLength();
- }
- i::ConsStringIteratorOp* op = isolate->write_iterator();
- unsigned offset = 0;
- i::String* leaf = op->Operate(str, &offset, &type, &length);
- ASSERT(leaf != NULL);
- while (leaf != NULL) {
- i::ConsStringNullOp null_op;
- ASSERT(offset == 0);
- i::String::Visit(leaf, 0, visitor, null_op, type, length);
- leaf = op->ContinueOperation(&type, &length);
- }
- return visitor.GetLength();
+ uint8_t state;
+ i::ConsString* cons_string =
+ Utf8LengthHelper::Visitor::VisitFlat(str, &length, &state);
+ if (cons_string == NULL) return length;
+ return Utf8LengthHelper::Calculate(cons_string);
}
@@ -3906,12 +4038,14 @@
class Utf8WriterVisitor {
public:
- Utf8WriterVisitor(char* buffer, int capacity)
+ Utf8WriterVisitor(
+ char* buffer, int capacity, bool skip_capacity_check)
: early_termination_(false),
last_character_(unibrow::Utf16::kNoPreviousCharacter),
buffer_(buffer),
start_(buffer),
capacity_(capacity),
+ skip_capacity_check_(capacity == -1 || skip_capacity_check),
utf16_chars_read_(0) {
}
@@ -3935,7 +4069,7 @@
// Can't encode using last_character as gcc has array bounds issues.
int written = Utf8::Encode(temp_buffer,
character,
- unibrow::Utf16::kNoPreviousCharacter);
+ Utf16::kNoPreviousCharacter);
// Won't fit.
if (written > remaining) return 0;
// Copy over the character from temp_buffer.
@@ -3948,45 +4082,55 @@
template<typename Char>
void Visit(const Char* chars, const int length) {
using namespace unibrow;
- // TODO(dcarney): Add back ascii fast path.
ASSERT(!early_termination_);
- ASSERT(length > 0);
+ if (length == 0) return;
// Copy state to stack.
char* buffer = buffer_;
- int last_character = last_character_;
+ int last_character =
+ sizeof(Char) == 1 ? Utf16::kNoPreviousCharacter : last_character_;
int i = 0;
// Do a fast loop where there is no exit capacity check.
while (true) {
int fast_length;
- if (capacity_ == -1) {
+ if (skip_capacity_check_) {
fast_length = length;
} else {
int remaining_capacity = capacity_ - static_cast<int>(buffer -
start_);
// Need enough space to write everything but one character.
STATIC_ASSERT(Utf16::kMaxExtraUtf8BytesForOneUtf16CodeUnit == 3);
- int writable_length = (remaining_capacity - 3)/3;
+ int max_size_per_char = sizeof(Char) == 1 ? 2 : 3;
+ int writable_length =
+ (remaining_capacity - max_size_per_char)/max_size_per_char;
// Need to drop into slow loop.
if (writable_length <= 0) break;
fast_length = i + writable_length;
if (fast_length > length) fast_length = length;
}
// Write the characters to the stream.
- for (; i < fast_length; i++) {
- uint16_t character = *chars++;
- buffer += Utf8::Encode(buffer, character, last_character);
- last_character = character;
- ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
+ if (sizeof(Char) == 1) {
+ for (; i < fast_length; i++) {
+ buffer +=
+ Utf8::Encode(buffer, *chars++, Utf16::kNoPreviousCharacter);
+ ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
+ }
+ } else {
+ for (; i < fast_length; i++) {
+ uint16_t character = *chars++;
+ buffer += Utf8::Encode(buffer, character, last_character);
+ last_character = character;
+ ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
+ }
}
// Array is fully written. Exit.
if (fast_length == length) {
// Write state back out to object.
last_character_ = last_character;
buffer_ = buffer;
- utf16_chars_read_ += i;
+ utf16_chars_read_ += length;
return;
}
}
- ASSERT(capacity_ != -1);
+ ASSERT(!skip_capacity_check_);
// Slow loop. Must check capacity on each iteration.
int remaining_capacity = capacity_ - static_cast<int>(buffer - start_);
ASSERT(remaining_capacity >= 0);
@@ -4014,15 +4158,15 @@
return early_termination_;
}
- inline void VisitOneByteString(const uint8_t* chars, unsigned length) {
- Visit(chars, static_cast<int>(length));
+ inline void VisitOneByteString(const uint8_t* chars, int length) {
+ Visit(chars, length);
}
- inline void VisitTwoByteString(const uint16_t* chars, unsigned length) {
- Visit(chars, static_cast<int>(length));
+ inline void VisitTwoByteString(const uint16_t* chars, int length) {
+ Visit(chars, length);
}
- inline int CompleteWrite(bool write_null, int* utf16_chars_read_out) {
+ int CompleteWrite(bool write_null, int* utf16_chars_read_out) {
// Write out number of utf16 characters written to the stream.
if (utf16_chars_read_out != NULL) {
*utf16_chars_read_out = utf16_chars_read_;
@@ -4042,9 +4186,30 @@
char* buffer_;
char* const start_;
int capacity_;
+ bool const skip_capacity_check_;
int utf16_chars_read_;
DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);
};
+
+
+static bool RecursivelySerializeToUtf8(i::String* current,
+ Utf8WriterVisitor* writer,
+ int recursion_budget) {
+ while (!writer->IsDone()) {
+ i::ConsString* cons_string = i::String::VisitFlat(writer, current);
+ if (cons_string == NULL) return true; // Leaf node.
+ if (recursion_budget <= 0) return false;
+ // Must write the left branch first.
+ i::String* first = cons_string->first();
+ bool success = RecursivelySerializeToUtf8(first,
+ writer,
+ recursion_budget - 1);
+ if (!success) return false;
+ // Inline tail recurse for right branch.
+ current = cons_string->second();
+ }
+ return true;
+}
int String::WriteUtf8(char* buffer,
@@ -4059,23 +4224,41 @@
if (options & HINT_MANY_WRITES_EXPECTED) {
FlattenString(str); // Flatten the string for efficiency.
}
- Utf8WriterVisitor writer(buffer, capacity);
- i::ConsStringIteratorOp* op = isolate->write_iterator();
- op->Reset();
- int32_t type = str->map()->instance_type();
- unsigned str_length = static_cast<unsigned>(str->length());
- if (str_length != 0) {
- i::String::Visit(*str, 0, writer, *op, type, str_length);
- while (!writer.IsDone()) {
- unsigned length_out;
- i::String* next = op->ContinueOperation(&type, &length_out);
- if (next == NULL) break;
- // TODO(dcarney): need an asserting null op.
- i::ConsStringNullOp null_op;
- i::String::Visit(next, 0, writer, null_op, type, length_out);
+ const int string_length = str->length();
+ bool write_null = !(options & NO_NULL_TERMINATION);
+ // First check if we can just write the string without checking capacity.
+ if (capacity == -1 || capacity / 3 >= string_length) {
+ Utf8WriterVisitor writer(buffer, capacity, true);
+ const int kMaxRecursion = 100;
+ bool success = RecursivelySerializeToUtf8(*str, &writer,
kMaxRecursion);
+ if (success) return writer.CompleteWrite(write_null, nchars_ref);
+ } else if (capacity >= string_length) {
+ // First check that the buffer is large enough.
+ int utf8_bytes = v8::Utf8Length(*str, str->GetIsolate());
+ if (utf8_bytes <= capacity) {
+ // ASCII fast path.
+ if (utf8_bytes == string_length) {
+ WriteOneByte(reinterpret_cast<uint8_t*>(buffer), 0, capacity,
options);
+ if (nchars_ref != NULL) *nchars_ref = string_length;
+ if (write_null && (utf8_bytes+1 <= capacity)) {
+ return string_length + 1;
+ }
+ return string_length;
+ }
+ if (write_null && (utf8_bytes+1 > capacity)) {
+ options |= NO_NULL_TERMINATION;
+ }
+ // Recurse once without a capacity limit.
+ // This will get into the first branch above.
+ // TODO(dcarney) Check max left rec. in Utf8Length and fall through.
+ return WriteUtf8(buffer, -1, nchars_ref, options);
}
}
- return writer.CompleteWrite(!(options & NO_NULL_TERMINATION),
nchars_ref);
+ // Recursive slow path can potentially be unreasonable slow. Flatten.
+ str = FlattenGetString(str);
+ Utf8WriterVisitor writer(buffer, capacity, false);
+ i::String::VisitFlat(&writer, *str);
+ return writer.CompleteWrite(write_null, nchars_ref);
}
=======================================
--- /branches/bleeding_edge/src/objects-inl.h Tue Mar 5 09:38:35 2013
+++ /branches/bleeding_edge/src/objects-inl.h Wed Mar 6 07:39:57 2013
@@ -2646,6 +2646,32 @@
}
}
}
+
+
+// TODO(dcarney): Remove this class after conversion to VisitFlat.
+class ConsStringCaptureOp {
+ public:
+ inline ConsStringCaptureOp() : cons_string_(NULL) {}
+ inline String* Operate(String* string, unsigned*, int32_t*, unsigned*) {
+ cons_string_ = ConsString::cast(string);
+ return NULL;
+ }
+ ConsString* cons_string_;
+};
+
+
+template<class Visitor>
+ConsString* String::VisitFlat(Visitor* visitor,
+ String* string,
+ int offset,
+ int length,
+ int32_t type) {
+ ASSERT(length >= 0 && length == string->length());
+ ASSERT(offset >= 0 && offset <= length);
+ ConsStringCaptureOp op;
+ Visit(string, offset, *visitor, op, type, static_cast<unsigned>(length));
+ return op.cons_string_;
+}
uint16_t SeqOneByteString::SeqOneByteStringGet(int index) {
=======================================
--- /branches/bleeding_edge/src/objects.h Tue Mar 5 09:38:35 2013
+++ /branches/bleeding_edge/src/objects.h Wed Mar 6 07:39:57 2013
@@ -7337,6 +7337,8 @@
};
+class ConsString;
+
// The String abstract class captures JavaScript string values:
//
// Ecma-262:
@@ -7615,6 +7617,7 @@
return NonOneByteStart(chars, length) >= length;
}
+ // TODO(dcarney): Replace all instances of this with VisitFlat.
template<class Visitor, class ConsOp>
static inline void Visit(String* string,
unsigned offset,
@@ -7622,6 +7625,21 @@
ConsOp& cons_op,
int32_t type,
unsigned length);
+
+ template<class Visitor>
+ static inline ConsString* VisitFlat(Visitor* visitor,
+ String* string,
+ int offset,
+ int length,
+ int32_t type);
+
+ template<class Visitor>
+ static inline ConsString* VisitFlat(Visitor* visitor,
+ String* string,
+ int offset = 0) {
+ int32_t type = string->map()->instance_type();
+ return VisitFlat(visitor, string, offset, string->length(), type);
+ }
private:
friend class Name;
--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.