Revision: 4894
Author: [email protected]
Date: Thu Jun 17 09:19:28 2010
Log: Track ascii-ness of data in externalized strings.

If a two-byte string only contains ascii characters, then we can save
memory when flattening a cons string containing it. Similarly we can
use this in Array.prototype.join implementation. To track this a new
bit is added to instance type. This bit is used as a hint in generated
code and in runtime functions.

To enable testing a new V8 extension is added controlled by
--expose-externalize-string flag.

Review URL: http://codereview.chromium.org/2762008
http://code.google.com/p/v8/source/detail?r=4894

Added:
 /branches/bleeding_edge/test/mjsunit/string-externalize.js
Modified:
 /branches/bleeding_edge/src/arm/codegen-arm.cc
 /branches/bleeding_edge/src/bootstrapper.cc
 /branches/bleeding_edge/src/execution.cc
 /branches/bleeding_edge/src/execution.h
 /branches/bleeding_edge/src/flag-definitions.h
 /branches/bleeding_edge/src/heap.cc
 /branches/bleeding_edge/src/heap.h
 /branches/bleeding_edge/src/ia32/codegen-ia32.cc
 /branches/bleeding_edge/src/objects-debug.cc
 /branches/bleeding_edge/src/objects-inl.h
 /branches/bleeding_edge/src/objects.cc
 /branches/bleeding_edge/src/objects.h
 /branches/bleeding_edge/src/runtime.cc
 /branches/bleeding_edge/src/x64/codegen-x64.cc

=======================================
--- /dev/null
+++ /branches/bleeding_edge/test/mjsunit/string-externalize.js Thu Jun 17 09:19:28 2010
@@ -0,0 +1,95 @@
+// Copyright 2010 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Flags: --expose-externalize-string
+
+var size = 1024;
+
+function test() {
+  var str = "";
+
+  // Build an ascii cons string.
+  for (var i = 0; i < size; i++) {
+      str += String.fromCharCode(i & 0x7f);
+  }
+  assertTrue(isAsciiString(str));
+
+  var twoByteExternalWithAsciiData =
+      "AA" + (function() { return "A"; })();
+ externalizeString(twoByteExternalWithAsciiData, true /* force two-byte */);
+  assertFalse(isAsciiString(twoByteExternalWithAsciiData));
+
+  var realTwoByteExternalString =
+      "\u1234\u1234" + (function() { return "\u1234"; })();
+  externalizeString(realTwoByteExternalString);
+  assertFalse(isAsciiString(realTwoByteExternalString));
+
+  assertTrue(isAsciiString(["a", twoByteExternalWithAsciiData].join("")));
+
+  // Appending a two-byte string that contains only ascii chars should
+  // still produce an ascii cons.
+  var str1 = str + twoByteExternalWithAsciiData;
+  assertTrue(isAsciiString(str1));
+
+  // Force flattening of the string.
+  var old_length = str1.length - twoByteExternalWithAsciiData.length;
+  for (var i = 0; i < old_length; i++) {
+    assertEquals(String.fromCharCode(i & 0x7f), str1[i]);
+  }
+  for (var i = old_length; i < str1.length; i++) {
+    assertEquals("A", str1[i]);
+  }
+
+  // Flattened string should still be ascii.
+  assertTrue(isAsciiString(str1));
+
+  // Lower-casing an ascii string should produce ascii.
+  assertTrue(isAsciiString(str1.toLowerCase()));
+
+  assertFalse(isAsciiString(["a", realTwoByteExternalString].join("")));
+
+  // Appending a real two-byte string should produce a two-byte cons.
+  var str2 = str + realTwoByteExternalString;
+  assertFalse(isAsciiString(str2));
+
+  // Force flattening of the string.
+  old_length = str2.length - realTwoByteExternalString.length;
+  for (var i = 0; i < old_length; i++) {
+    assertEquals(String.fromCharCode(i & 0x7f), str2[i]);
+  }
+  for (var i = old_length; i < str.length; i++) {
+    assertEquals("\u1234", str2[i]);
+  }
+
+  // Flattened string should still be two-byte.
+  assertFalse(isAsciiString(str2));
+}
+
+// Run the test many times to ensure IC-s don't break things.
+for (var i = 0; i < 10; i++) {
+  test();
+}
=======================================
--- /branches/bleeding_edge/src/arm/codegen-arm.cc      Thu Jun 17 03:45:37 2010
+++ /branches/bleeding_edge/src/arm/codegen-arm.cc      Thu Jun 17 09:19:28 2010
@@ -10559,13 +10559,14 @@
     __ ldrb(r4, FieldMemOperand(r4, Map::kInstanceTypeOffset));
     __ ldrb(r5, FieldMemOperand(r5, Map::kInstanceTypeOffset));
   }
-  Label non_ascii, allocated;
+  Label non_ascii, allocated, ascii_data;
   ASSERT_EQ(0, kTwoByteStringTag);
   __ tst(r4, Operand(kStringEncodingMask));
   __ tst(r5, Operand(kStringEncodingMask), ne);
   __ b(eq, &non_ascii);

   // Allocate an ASCII cons string.
+  __ bind(&ascii_data);
   __ AllocateAsciiConsString(r7, r6, r4, r5, &string_add_runtime);
   __ bind(&allocated);
   // Fill the fields of the cons string.
@@ -10577,6 +10578,19 @@
   __ Ret();

   __ bind(&non_ascii);
+  // At least one of the strings is two-byte. Check whether it happens
+  // to contain only ascii characters.
+  // r4: first instance type.
+  // r5: second instance type.
+  __ tst(r4, Operand(kAsciiDataHintMask));
+  __ tst(r5, Operand(kAsciiDataHintMask), ne);
+  __ b(ne, &ascii_data);
+  __ eor(r4, r4, Operand(r5));
+  ASSERT(kAsciiStringTag != 0 && kAsciiDataHintTag != 0);
+  __ and_(r4, r4, Operand(kAsciiStringTag | kAsciiDataHintTag));
+  __ cmp(r4, Operand(kAsciiStringTag | kAsciiDataHintTag));
+  __ b(eq, &ascii_data);
+
   // Allocate a two byte cons string.
   __ AllocateTwoByteConsString(r7, r6, r4, r5, &string_add_runtime);
   __ jmp(&allocated);
=======================================
--- /branches/bleeding_edge/src/bootstrapper.cc Wed May 12 05:44:00 2010
+++ /branches/bleeding_edge/src/bootstrapper.cc Thu Jun 17 09:19:28 2010
@@ -1462,6 +1462,7 @@
   }

   if (FLAG_expose_gc) InstallExtension("v8/gc");
+  if (FLAG_expose_externalize_string) InstallExtension("v8/externalize");

   if (extensions == NULL) return true;
   // Install required extensions
=======================================
--- /branches/bleeding_edge/src/execution.cc    Mon Apr 19 05:39:07 2010
+++ /branches/bleeding_edge/src/execution.cc    Thu Jun 17 09:19:28 2010
@@ -679,7 +679,7 @@

 // --- G C   E x t e n s i o n ---

-const char* GCExtension::kSource = "native function gc();";
+const char* const GCExtension::kSource = "native function gc();";


 v8::Handle<v8::FunctionTemplate> GCExtension::GetNativeFunction(
@@ -695,7 +695,115 @@
 }


-static GCExtension kGCExtension;
-v8::DeclareExtension kGCExtensionDeclaration(&kGCExtension);
+static GCExtension gc_extension;
+static v8::DeclareExtension gc_extension_declaration(&gc_extension);
+
+
+// --- E x t e r n a l i z e S t r i n g   E x t e n s i o n ---
+
+
+template <typename Char, typename Base>
+class SimpleStringResource : public Base {
+ public:
+  // Takes ownership of |data|.
+  SimpleStringResource(Char* data, size_t length)
+      : data_(data),
+        length_(length) {}
+
+  virtual ~SimpleStringResource() { delete data_; }
+
+  virtual const Char* data() const { return data_; }
+
+  virtual size_t length() const { return length_; }
+
+ private:
+  Char* const data_;
+  const size_t length_;
+};
+
+
+typedef SimpleStringResource<char, v8::String::ExternalAsciiStringResource>
+    SimpleAsciiStringResource;
+typedef SimpleStringResource<uc16, v8::String::ExternalStringResource>
+    SimpleTwoByteStringResource;
+
+
+const char* const ExternalizeStringExtension::kSource =
+    "native function externalizeString();"
+    "native function isAsciiString();";
+
+
+v8::Handle<v8::FunctionTemplate> ExternalizeStringExtension::GetNativeFunction(
+    v8::Handle<v8::String> str) {
+  if (strcmp(*v8::String::AsciiValue(str), "externalizeString") == 0) {
+ return v8::FunctionTemplate::New(ExternalizeStringExtension::Externalize);
+  } else {
+    ASSERT(strcmp(*v8::String::AsciiValue(str), "isAsciiString") == 0);
+    return v8::FunctionTemplate::New(ExternalizeStringExtension::IsAscii);
+  }
+}
+
+
+v8::Handle<v8::Value> ExternalizeStringExtension::Externalize(
+    const v8::Arguments& args) {
+  if (args.Length() < 1 || !args[0]->IsString()) {
+    return v8::ThrowException(v8::String::New(
+        "First parameter to externalizeString() must be a string."));
+  }
+  bool force_two_byte = false;
+  if (args.Length() >= 2) {
+    if (args[1]->IsBoolean()) {
+      force_two_byte = args[1]->BooleanValue();
+    } else {
+      return v8::ThrowException(v8::String::New(
+          "Second parameter to externalizeString() must be a boolean."));
+    }
+  }
+  bool result = false;
+  Handle<String> string = Utils::OpenHandle(*args[0].As<v8::String>());
+  if (string->IsExternalString()) {
+    return v8::ThrowException(v8::String::New(
+        "externalizeString() can't externalize twice."));
+  }
+  if (string->IsAsciiRepresentation() && !force_two_byte) {
+    char* data = new char[string->length()];
+    String::WriteToFlat(*string, data, 0, string->length());
+    SimpleAsciiStringResource* resource = new SimpleAsciiStringResource(
+        data, string->length());
+    result = string->MakeExternal(resource);
+    if (result && !string->IsSymbol()) {
+      i::ExternalStringTable::AddString(*string);
+    }
+  } else {
+    uc16* data = new uc16[string->length()];
+    String::WriteToFlat(*string, data, 0, string->length());
+ SimpleTwoByteStringResource* resource = new SimpleTwoByteStringResource(
+        data, string->length());
+    result = string->MakeExternal(resource);
+    if (result && !string->IsSymbol()) {
+      i::ExternalStringTable::AddString(*string);
+    }
+  }
+  if (!result) {
+ return v8::ThrowException(v8::String::New("externalizeString() failed."));
+  }
+  return v8::Undefined();
+}
+
+
+v8::Handle<v8::Value> ExternalizeStringExtension::IsAscii(
+    const v8::Arguments& args) {
+  if (args.Length() != 1 || !args[0]->IsString()) {
+    return v8::ThrowException(v8::String::New(
+        "isAsciiString() requires a single string argument."));
+  }
+ return Utils::OpenHandle(*args[0].As<v8::String>())->IsAsciiRepresentation() ?
+      v8::True() : v8::False();
+}
+
+
+static ExternalizeStringExtension externalize_extension;
+static v8::DeclareExtension externalize_extension_declaration(
+    &externalize_extension);

 } }  // namespace v8::internal
=======================================
--- /branches/bleeding_edge/src/execution.h     Wed Apr 14 00:36:49 2010
+++ /branches/bleeding_edge/src/execution.h     Thu Jun 17 09:19:28 2010
@@ -316,10 +316,21 @@
       v8::Handle<v8::String> name);
   static v8::Handle<v8::Value> GC(const v8::Arguments& args);
  private:
-  static const char* kSource;
+  static const char* const kSource;
 };


+class ExternalizeStringExtension : public v8::Extension {
+ public:
+ ExternalizeStringExtension() : v8::Extension("v8/externalize", kSource) {}
+  virtual v8::Handle<v8::FunctionTemplate> GetNativeFunction(
+      v8::Handle<v8::String> name);
+  static v8::Handle<v8::Value> Externalize(const v8::Arguments& args);
+  static v8::Handle<v8::Value> IsAscii(const v8::Arguments& args);
+ private:
+  static const char* const kSource;
+};
+
 } }  // namespace v8::internal

 #endif  // V8_EXECUTION_H_
=======================================
--- /branches/bleeding_edge/src/flag-definitions.h      Fri Jun 11 00:06:51 2010
+++ /branches/bleeding_edge/src/flag-definitions.h      Thu Jun 17 09:19:28 2010
@@ -123,6 +123,8 @@
 DEFINE_string(expose_natives_as, NULL, "expose natives in global object")
 DEFINE_string(expose_debug_as, NULL, "expose debug in global object")
 DEFINE_bool(expose_gc, false, "expose gc extension")
+DEFINE_bool(expose_externalize_string, false,
+            "expose externalize string extension")
 DEFINE_int(stack_trace_limit, 10, "number of stack frames to capture")
 DEFINE_bool(disable_native_files, false, "disable builtin natives files")

=======================================
--- /branches/bleeding_edge/src/heap.cc Fri Jun 11 00:06:51 2010
+++ /branches/bleeding_edge/src/heap.cc Thu Jun 17 09:19:28 2010
@@ -1928,6 +1928,18 @@
     Top::context()->mark_out_of_memory();
     return Failure::OutOfMemoryException();
   }
+
+  bool is_ascii_data_in_two_byte_string = false;
+  if (!is_ascii) {
+    // At least one of the strings uses two-byte representation so we
+    // can't use the fast case code for short ascii strings below, but
+    // we can try to save memory if all chars actually fit in ascii.
+    is_ascii_data_in_two_byte_string =
+        first->HasOnlyAsciiChars() && second->HasOnlyAsciiChars();
+    if (is_ascii_data_in_two_byte_string) {
+      Counters::string_add_runtime_ext_to_ascii.Increment();
+    }
+  }

   // If the resulting string is small make a flat string.
   if (length < String::kMinNonFlatLength) {
@@ -1955,22 +1967,13 @@
       for (int i = 0; i < second_length; i++) *dest++ = src[i];
       return result;
     } else {
-      // For short external two-byte strings we check whether they can
-      // be represented using ascii.
-      if (!first_is_ascii) {
-        first_is_ascii = first->IsExternalTwoByteStringWithAsciiChars();
-      }
-      if (first_is_ascii && !second_is_ascii) {
-        second_is_ascii = second->IsExternalTwoByteStringWithAsciiChars();
-      }
-      if (first_is_ascii && second_is_ascii) {
+      if (is_ascii_data_in_two_byte_string) {
         Object* result = AllocateRawAsciiString(length);
         if (result->IsFailure()) return result;
         // Copy the characters into the new object.
         char* dest = SeqAsciiString::cast(result)->GetChars();
         String::WriteToFlat(first, dest, 0, first_length);
         String::WriteToFlat(second, dest + first_length, 0, second_length);
-        Counters::string_add_runtime_ext_to_ascii.Increment();
         return result;
       }

@@ -1984,7 +1987,8 @@
     }
   }

-  Map* map = is_ascii ? cons_ascii_string_map() : cons_string_map();
+  Map* map = (is_ascii || is_ascii_data_in_two_byte_string) ?
+      cons_ascii_string_map() : cons_string_map();

   Object* result = Allocate(map, NEW_SPACE);
   if (result->IsFailure()) return result;
@@ -2070,7 +2074,23 @@
     return Failure::OutOfMemoryException();
   }

-  Map* map = Heap::external_string_map();
+  // For small strings we check whether the resource contains only
+  // ascii characters.  If yes, we use a different string map.
+  bool is_ascii = true;
+  if (length >= static_cast<size_t>(String::kMinNonFlatLength)) {
+    is_ascii = false;
+  } else {
+    const uc16* data = resource->data();
+    for (size_t i = 0; i < length; i++) {
+      if (data[i] > String::kMaxAsciiCharCode) {
+        is_ascii = false;
+        break;
+      }
+    }
+  }
+
+  Map* map = is_ascii ?
+ Heap::external_string_with_ascii_data_map() : Heap::external_string_map();
   Object* result = Allocate(map, NEW_SPACE);
   if (result->IsFailure()) return result;

@@ -2853,6 +2873,9 @@
   if (map == cons_ascii_string_map()) return cons_ascii_symbol_map();
   if (map == external_string_map()) return external_symbol_map();
if (map == external_ascii_string_map()) return external_ascii_symbol_map();
+  if (map == external_string_with_ascii_data_map()) {
+    return external_symbol_with_ascii_data_map();
+  }

   // No match found.
   return NULL;
=======================================
--- /branches/bleeding_edge/src/heap.h  Wed Jun 16 01:29:25 2010
+++ /branches/bleeding_edge/src/heap.h  Thu Jun 17 09:19:28 2010
@@ -69,10 +69,12 @@
V(Map, cons_symbol_map, ConsSymbolMap) \ V(Map, cons_ascii_symbol_map, ConsAsciiSymbolMap) \ V(Map, external_symbol_map, ExternalSymbolMap) \ + V(Map, external_symbol_with_ascii_data_map, ExternalSymbolWithAsciiDataMap) \ V(Map, external_ascii_symbol_map, ExternalAsciiSymbolMap) \ V(Map, cons_string_map, ConsStringMap) \ V(Map, cons_ascii_string_map, ConsAsciiStringMap) \ V(Map, external_string_map, ExternalStringMap) \ + V(Map, external_string_with_ascii_data_map, ExternalStringWithAsciiDataMap) \ V(Map, external_ascii_string_map, ExternalAsciiStringMap) \ V(Map, undetectable_string_map, UndetectableStringMap) \ V(Map, undetectable_ascii_string_map, UndetectableAsciiStringMap) \
=======================================
--- /branches/bleeding_edge/src/ia32/codegen-ia32.cc Wed Jun 16 05:32:34 2010 +++ /branches/bleeding_edge/src/ia32/codegen-ia32.cc Thu Jun 17 09:19:28 2010
@@ -12852,7 +12852,7 @@

// If result is not supposed to be flat allocate a cons string object. If both
   // strings are ascii the result is an ascii cons string.
-  Label non_ascii, allocated;
+  Label non_ascii, allocated, ascii_data;
   __ mov(edi, FieldOperand(eax, HeapObject::kMapOffset));
   __ movzx_b(ecx, FieldOperand(edi, Map::kInstanceTypeOffset));
   __ mov(edi, FieldOperand(edx, HeapObject::kMapOffset));
@@ -12861,6 +12861,7 @@
   ASSERT(kStringEncodingMask == kAsciiStringTag);
   __ test(ecx, Immediate(kAsciiStringTag));
   __ j(zero, &non_ascii);
+  __ bind(&ascii_data);
   // Allocate an acsii cons string.
   __ AllocateAsciiConsString(ecx, edi, no_reg, &string_add_runtime);
   __ bind(&allocated);
@@ -12875,6 +12876,19 @@
   __ IncrementCounter(&Counters::string_add_native, 1);
   __ ret(2 * kPointerSize);
   __ bind(&non_ascii);
+  // At least one of the strings is two-byte. Check whether it happens
+  // to contain only ascii characters.
+  // ecx: first instance type AND second instance type.
+  // edi: second instance type.
+  __ test(ecx, Immediate(kAsciiDataHintMask));
+  __ j(not_zero, &ascii_data);
+  __ mov(ecx, FieldOperand(eax, HeapObject::kMapOffset));
+  __ movzx_b(ecx, FieldOperand(ecx, Map::kInstanceTypeOffset));
+  __ xor_(edi, Operand(ecx));
+  ASSERT(kAsciiStringTag != 0 && kAsciiDataHintTag != 0);
+  __ and_(edi, kAsciiStringTag | kAsciiDataHintTag);
+  __ cmp(edi, kAsciiStringTag | kAsciiDataHintTag);
+  __ j(equal, &ascii_data);
   // Allocate a two byte cons string.
   __ AllocateConsString(ecx, edi, no_reg, &string_add_runtime);
   __ jmp(&allocated);
=======================================
--- /branches/bleeding_edge/src/objects-debug.cc        Thu May 27 05:30:45 2010
+++ /branches/bleeding_edge/src/objects-debug.cc        Thu Jun 17 09:19:28 2010
@@ -552,12 +552,14 @@
     case CONS_SYMBOL_TYPE: return "CONS_SYMBOL";
     case CONS_ASCII_SYMBOL_TYPE: return "CONS_ASCII_SYMBOL";
     case EXTERNAL_ASCII_SYMBOL_TYPE:
+    case EXTERNAL_SYMBOL_WITH_ASCII_DATA_TYPE:
     case EXTERNAL_SYMBOL_TYPE: return "EXTERNAL_SYMBOL";
     case ASCII_STRING_TYPE: return "ASCII_STRING";
     case STRING_TYPE: return "TWO_BYTE_STRING";
     case CONS_STRING_TYPE:
     case CONS_ASCII_STRING_TYPE: return "CONS_STRING";
     case EXTERNAL_ASCII_STRING_TYPE:
+    case EXTERNAL_STRING_WITH_ASCII_DATA_TYPE:
     case EXTERNAL_STRING_TYPE: return "EXTERNAL_STRING";
     case FIXED_ARRAY_TYPE: return "FIXED_ARRAY";
     case BYTE_ARRAY_TYPE: return "BYTE_ARRAY";
=======================================
--- /branches/bleeding_edge/src/objects-inl.h   Mon Jun  7 08:39:10 2010
+++ /branches/bleeding_edge/src/objects-inl.h   Thu Jun 17 09:19:28 2010
@@ -237,31 +237,20 @@

 bool String::IsAsciiRepresentation() {
   uint32_t type = map()->instance_type();
-  if ((type & kStringRepresentationMask) == kConsStringTag &&
-      ConsString::cast(this)->second()->length() == 0) {
-    return ConsString::cast(this)->first()->IsAsciiRepresentation();
-  }
   return (type & kStringEncodingMask) == kAsciiStringTag;
 }


 bool String::IsTwoByteRepresentation() {
   uint32_t type = map()->instance_type();
-  if ((type & kStringRepresentationMask) == kConsStringTag &&
-             ConsString::cast(this)->second()->length() == 0) {
-    return ConsString::cast(this)->first()->IsTwoByteRepresentation();
-  }
   return (type & kStringEncodingMask) == kTwoByteStringTag;
 }


-bool String::IsExternalTwoByteStringWithAsciiChars() {
-  if (!IsExternalTwoByteString()) return false;
-  const uc16* data = ExternalTwoByteString::cast(this)->resource()->data();
-  for (int i = 0, len = length(); i < len; i++) {
-    if (data[i] > kMaxAsciiCharCode) return false;
-  }
-  return true;
+bool String::HasOnlyAsciiChars() {
+  uint32_t type = map()->instance_type();
+  return (type & kStringEncodingMask) == kAsciiStringTag ||
+         (type & kAsciiDataHintMask) == kAsciiDataHintTag;
 }


=======================================
--- /branches/bleeding_edge/src/objects.cc      Tue Jun 15 10:01:02 2010
+++ /branches/bleeding_edge/src/objects.cc      Thu Jun 17 09:19:28 2010
@@ -678,6 +678,9 @@


 bool String::MakeExternal(v8::String::ExternalStringResource* resource) {
+  // Externalizing twice leaks the external resouce, so it's
+  // prohibited by the API.
+  ASSERT(!this->IsExternalString());
 #ifdef DEBUG
   if (FLAG_enable_slow_asserts) {
     // Assert that the resource and the string are equivalent.
@@ -697,13 +700,16 @@
     return false;
   }
   ASSERT(size >= ExternalString::kSize);
+  bool is_ascii = this->IsAsciiRepresentation();
   bool is_symbol = this->IsSymbol();
   int length = this->length();
   int hash_field = this->hash_field();

   // Morph the object to an external string by adjusting the map and
   // reinitializing the fields.
-  this->set_map(Heap::external_string_map());
+  this->set_map(is_ascii ?
+                Heap::external_string_with_ascii_data_map() :
+                Heap::external_string_map());
   ExternalTwoByteString* self = ExternalTwoByteString::cast(this);
   self->set_length(length);
   self->set_hash_field(hash_field);
@@ -713,7 +719,9 @@
   if (is_symbol) {
     self->Hash();  // Force regeneration of the hash value.
     // Now morph this external string into a external symbol.
-    this->set_map(Heap::external_symbol_map());
+    this->set_map(is_ascii ?
+                  Heap::external_symbol_with_ascii_data_map() :
+                  Heap::external_symbol_map());
   }

   // Fill the remainder of the string with dead wood.
=======================================
--- /branches/bleeding_edge/src/objects.h       Mon Jun 14 06:55:38 2010
+++ /branches/bleeding_edge/src/objects.h       Thu Jun 17 09:19:28 2010
@@ -320,6 +320,10 @@
ExternalTwoByteString::kSize, \ external_symbol, \ ExternalSymbol) \ + V(EXTERNAL_SYMBOL_WITH_ASCII_DATA_TYPE, \ + ExternalTwoByteString::kSize, \ + external_symbol_with_ascii_data, \ + ExternalSymbolWithAsciiData) \ V(EXTERNAL_ASCII_SYMBOL_TYPE, \ ExternalAsciiString::kSize, \ external_ascii_symbol, \
@@ -344,6 +348,10 @@
ExternalTwoByteString::kSize, \ external_string, \ ExternalString) \ + V(EXTERNAL_STRING_WITH_ASCII_DATA_TYPE, \ + ExternalTwoByteString::kSize, \ + external_string_with_ascii_data, \ + ExternalStringWithAsciiData) \ V(EXTERNAL_ASCII_STRING_TYPE, \ ExternalAsciiString::kSize, \ external_ascii_string, \
@@ -412,6 +420,11 @@
 };
 const uint32_t kIsConsStringMask = 0x1;

+// If bit 7 is clear, then bit 3 indicates whether this two-byte
+// string actually contains ascii data.
+const uint32_t kAsciiDataHintMask = 0x08;
+const uint32_t kAsciiDataHintTag = 0x08;
+

 // A ConsString with an empty string as the right side is a candidate
 // for being shortcut by the garbage collector unless it is a
@@ -427,18 +440,22 @@

 enum InstanceType {
   // String types.
-  SYMBOL_TYPE = kSymbolTag | kSeqStringTag,
+  SYMBOL_TYPE = kTwoByteStringTag | kSymbolTag | kSeqStringTag,
   ASCII_SYMBOL_TYPE = kAsciiStringTag | kSymbolTag | kSeqStringTag,
-  CONS_SYMBOL_TYPE = kSymbolTag | kConsStringTag,
+  CONS_SYMBOL_TYPE = kTwoByteStringTag | kSymbolTag | kConsStringTag,
   CONS_ASCII_SYMBOL_TYPE = kAsciiStringTag | kSymbolTag | kConsStringTag,
-  EXTERNAL_SYMBOL_TYPE = kSymbolTag | kExternalStringTag,
+ EXTERNAL_SYMBOL_TYPE = kTwoByteStringTag | kSymbolTag | kExternalStringTag,
+  EXTERNAL_SYMBOL_WITH_ASCII_DATA_TYPE =
+ kTwoByteStringTag | kSymbolTag | kExternalStringTag | kAsciiDataHintTag,
   EXTERNAL_ASCII_SYMBOL_TYPE =
       kAsciiStringTag | kSymbolTag | kExternalStringTag,
-  STRING_TYPE = kSeqStringTag,
+  STRING_TYPE = kTwoByteStringTag | kSeqStringTag,
   ASCII_STRING_TYPE = kAsciiStringTag | kSeqStringTag,
-  CONS_STRING_TYPE = kConsStringTag,
+  CONS_STRING_TYPE = kTwoByteStringTag | kConsStringTag,
   CONS_ASCII_STRING_TYPE = kAsciiStringTag | kConsStringTag,
-  EXTERNAL_STRING_TYPE = kExternalStringTag,
+  EXTERNAL_STRING_TYPE = kTwoByteStringTag | kExternalStringTag,
+  EXTERNAL_STRING_WITH_ASCII_DATA_TYPE =
+      kTwoByteStringTag | kExternalStringTag | kAsciiDataHintTag,
   EXTERNAL_ASCII_STRING_TYPE = kAsciiStringTag | kExternalStringTag,
   PRIVATE_EXTERNAL_ASCII_STRING_TYPE = EXTERNAL_ASCII_STRING_TYPE,

@@ -4069,12 +4086,14 @@
   inline bool IsAsciiRepresentation();
   inline bool IsTwoByteRepresentation();

-  // Check whether this string is an external two-byte string that in
-  // fact contains only ascii characters.
+  // Returns whether this string has ascii chars, i.e. all of them can
+  // be ascii encoded.  This might be the case even if the string is
+  // two-byte.  Such strings may appear when the embedder prefers
+  // two-byte external representations even for ascii data.
   //
-  // Such strings may appear when the embedder prefers two-byte
-  // representations even for ascii data.
-  inline bool IsExternalTwoByteStringWithAsciiChars();
+  // NOTE: this should be considered only a hint.  False negatives are
+  // possible.
+  inline bool HasOnlyAsciiChars();

   // Get and set individual two byte chars in the string.
   inline void Set(int index, uint16_t value);
=======================================
--- /branches/bleeding_edge/src/runtime.cc      Thu Jun 17 05:47:08 2010
+++ /branches/bleeding_edge/src/runtime.cc      Thu Jun 17 09:19:28 2010
@@ -4944,16 +4944,6 @@
     return s;
   }
 }
-
-
-static inline SeqAsciiString* TryGetSeqAsciiString(String* s) {
-  if (!s->IsFlat() || !s->IsAsciiRepresentation()) return NULL;
-  if (s->IsConsString()) {
-    ASSERT(ConsString::cast(s)->second()->length() == 0);
-    return SeqAsciiString::cast(ConsString::cast(s)->first());
-  }
-  return SeqAsciiString::cast(s);
-}


 namespace {
@@ -5002,7 +4992,7 @@
unibrow::Mapping<typename ConvertTraits::UnibrowConverter, 128>* mapping) {
   NoHandleAllocation ha;
   CONVERT_CHECKED(String, s, args[0]);
-  s->TryFlatten();
+  s = s->TryFlattenGetString();

   const int length = s->length();
   // Assume that the string is not empty; we need this assumption later
@@ -5014,13 +5004,12 @@
   // character is also ascii.  This is currently the case, but it
   // might break in the future if we implement more context and locale
   // dependent upper/lower conversions.
-  SeqAsciiString* seq_ascii = TryGetSeqAsciiString(s);
-  if (seq_ascii != NULL) {
+  if (s->IsSeqAsciiString()) {
     Object* o = Heap::AllocateRawAsciiString(length);
     if (o->IsFailure()) return o;
     SeqAsciiString* result = SeqAsciiString::cast(o);
     bool has_changed_character = ConvertTraits::ConvertAscii(
-        result->GetChars(), seq_ascii->GetChars(), length);
+        result->GetChars(), SeqAsciiString::cast(s)->GetChars(), length);
     return has_changed_character ? result : s;
   }

@@ -5564,7 +5553,7 @@
     if (first->IsString()) return first;
   }

-  bool ascii = special->IsAsciiRepresentation();
+  bool ascii = special->HasOnlyAsciiChars();
   int position = 0;
   for (int i = 0; i < array_length; i++) {
     int increment = 0;
@@ -5605,7 +5594,7 @@
       String* element = String::cast(elt);
       int element_length = element->length();
       increment = element_length;
-      if (ascii && !element->IsAsciiRepresentation()) {
+      if (ascii && !element->HasOnlyAsciiChars()) {
         ascii = false;
       }
     } else {
=======================================
--- /branches/bleeding_edge/src/x64/codegen-x64.cc      Thu Jun 17 08:48:43 2010
+++ /branches/bleeding_edge/src/x64/codegen-x64.cc      Thu Jun 17 09:19:28 2010
@@ -11205,16 +11205,17 @@
// If result is not supposed to be flat, allocate a cons string object. If
   // both strings are ascii the result is an ascii cons string.
   // rax: first string
-  // ebx: length of resulting flat string
+  // rbx: length of resulting flat string
   // rdx: second string
   // r8: instance type of first string
   // r9: instance type of second string
-  Label non_ascii, allocated;
+  Label non_ascii, allocated, ascii_data;
   __ movl(rcx, r8);
   __ and_(rcx, r9);
   ASSERT(kStringEncodingMask == kAsciiStringTag);
   __ testl(rcx, Immediate(kAsciiStringTag));
   __ j(zero, &non_ascii);
+  __ bind(&ascii_data);
   // Allocate an acsii cons string.
   __ AllocateAsciiConsString(rcx, rdi, no_reg, &string_add_runtime);
   __ bind(&allocated);
@@ -11228,6 +11229,18 @@
   __ IncrementCounter(&Counters::string_add_native, 1);
   __ ret(2 * kPointerSize);
   __ bind(&non_ascii);
+  // At least one of the strings is two-byte. Check whether it happens
+  // to contain only ascii characters.
+  // rcx: first instance type AND second instance type.
+  // r8: first instance type.
+  // r9: second instance type.
+  __ testb(rcx, Immediate(kAsciiDataHintMask));
+  __ j(not_zero, &ascii_data);
+  __ xor_(r8, r9);
+  ASSERT(kAsciiStringTag != 0 && kAsciiDataHintTag != 0);
+  __ andb(r8, Immediate(kAsciiStringTag | kAsciiDataHintTag));
+  __ cmpb(r8, Immediate(kAsciiStringTag | kAsciiDataHintTag));
+  __ j(equal, &ascii_data);
   // Allocate a two byte cons string.
   __ AllocateConsString(rcx, rdi, no_reg, &string_add_runtime);
   __ jmp(&allocated);
@@ -11235,7 +11248,7 @@
   // Handle creating a flat result. First check that both strings are not
   // external strings.
   // rax: first string
-  // ebx: length of resulting flat string as smi
+  // rbx: length of resulting flat string as smi
   // rdx: second string
   // r8: instance type of first string
   // r9: instance type of first string
@@ -11251,7 +11264,7 @@
   __ j(equal, &string_add_runtime);
   // Now check if both strings are ascii strings.
   // rax: first string
-  // ebx: length of resulting flat string
+  // rbx: length of resulting flat string
   // rdx: second string
   // r8: instance type of first string
   // r9: instance type of second string

--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev

Reply via email to