Revision: 19783
Author:   [email protected]
Date:     Tue Mar 11 09:03:35 2014 UTC
Log:      Experimental parser: proper utf16 conversion

[email protected]

BUG=

Review URL: https://codereview.chromium.org/194613002
http://code.google.com/p/v8/source/detail?r=19783

Modified:
 /branches/experimental/parser/src/lexer/lexer-shell.cc

=======================================
--- /branches/experimental/parser/src/lexer/lexer-shell.cc Mon Mar 10 16:13:09 2014 UTC
+++ /branches/experimental/parser/src/lexer/lexer-shell.cc Tue Mar 11 09:03:35 2014 UTC
@@ -52,7 +52,8 @@
   LATIN1,
   UTF8,
   UTF16,
-  UTF8TO16  // Read as UTF8, convert to UTF16 before giving it to the lexers.
+  UTF8TO16,  // Convert stream via scanner input stream
+  UTF8TO16_PRECONVERT  // Convert stream during file read
 };


@@ -77,49 +78,15 @@
 };


-static uint16_t* ReadFile(const char* name, const uint8_t** end,
-                          const LexerShellSettings& settings) {
-  FILE* file = fopen(name, "rb");
-  CHECK(file != NULL);
-
-  fseek(file, 0, SEEK_END);
-  unsigned file_size = ftell(file);
-  rewind(file);
-
-  uint16_t* two_byte_data = new uint16_t[file_size / 2 + file_size % 2];
-
-  uint8_t* char_data = reinterpret_cast<uint8_t*>(two_byte_data);
-  for (unsigned i = 0; i < file_size;) {
-    i += fread(&char_data[i], 1, file_size - i, file);
-  }
-  fclose(file);
-
-  if (settings.encoding == UTF8TO16) {
-    const uint32_t kMaxUtf16Character = 0xffff;
-    // Get utf8 length.
-    unsigned utf16_chars = 0;
-    {
-      unsigned position = 0;
-      while (position < file_size) {
-        uint32_t c = char_data[position];
-        if (c <= unibrow::Utf8::kMaxOneByteChar) {
-          position++;
-        } else {
-          c =  unibrow::Utf8::CalculateValue(char_data + position,
-                                             file_size - position,
-                                             &position);
-        }
-        if (c > kMaxUtf16Character) {
-          utf16_chars += 2;
-        } else {
-          utf16_chars += 1;
-        }
-      }
-    }
-    // Write new buffer out.
-    uint16_t* data = new uint16_t[utf16_chars];
+static uint16_t* ConvertUtf8ToUtf16(const uint16_t* const data_in,
+                                    unsigned* length) {
+  const unsigned file_size = *length;
+  const uint8_t* char_data = reinterpret_cast<const uint8_t*>(data_in);
+  const uint32_t kMaxUtf16Character = 0xffff;
+  // Get utf8 length.
+  unsigned utf16_chars = 0;
+  {
     unsigned position = 0;
-    unsigned i = 0;
     while (position < file_size) {
       uint32_t c = char_data[position];
       if (c <= unibrow::Utf8::kMaxOneByteChar) {
@@ -130,34 +97,91 @@
                                            &position);
       }
       if (c > kMaxUtf16Character) {
-        data[i++] = unibrow::Utf16::LeadSurrogate(c);
-        data[i++] = unibrow::Utf16::TrailSurrogate(c);
+        utf16_chars += 2;
       } else {
-        data[i++] = static_cast<uc16>(c);
+        utf16_chars += 1;
       }
     }
-    // Swap buffers.
-    delete two_byte_data;
-    file_size = utf16_chars * 2;
-    two_byte_data = data;
-    char_data = reinterpret_cast<uint8_t*>(two_byte_data);
   }
+  // Write new buffer out.
+  uint16_t* data = new uint16_t[utf16_chars];
+  unsigned position = 0;
+  unsigned i = 0;
+  while (position < file_size) {
+    uint32_t c = char_data[position];
+    if (c <= unibrow::Utf8::kMaxOneByteChar) {
+      position++;
+    } else {
+      c =  unibrow::Utf8::CalculateValue(char_data + position,
+                                         file_size - position,
+                                         &position);
+    }
+    if (c > kMaxUtf16Character) {
+      data[i++] = unibrow::Utf16::LeadSurrogate(c);
+      data[i++] = unibrow::Utf16::TrailSurrogate(c);
+    } else {
+      data[i++] = static_cast<uc16>(c);
+    }
+  }
+  *length = 2 * utf16_chars;
+  return data;
+}

-  // Duplicate buffer if necessary.
+
+static uint16_t* Repeat(int repeat,
+                        const uint16_t* const data_in,
+                        unsigned* length) {
+  const unsigned file_size = *length;
+  unsigned size = file_size * repeat;
+  uint16_t* data = new uint16_t[size / 2 + size % 2];
+  uint8_t* char_data = reinterpret_cast<uint8_t*>(data);
+  for (int i = 0; i < repeat; i++) {
+    memcpy(&char_data[i * file_size], data_in, file_size);
+  }
+  *length = size;
+  return data;
+}
+
+
+static uint16_t* ReadFile(const char* name, unsigned* length) {
+  FILE* file = fopen(name, "rb");
+  CHECK(file != NULL);
+  // Get file size.
+  fseek(file, 0, SEEK_END);
+  unsigned file_size = ftell(file);
+  rewind(file);
+  // Read file contents.
+  uint16_t* data = new uint16_t[file_size / 2 + file_size % 2];
+  uint8_t* char_data = reinterpret_cast<uint8_t*>(data);
+  for (unsigned i = 0; i < file_size;) {
+    i += fread(&char_data[i], 1, file_size - i, file);
+  }
+  fclose(file);
+  *length = file_size;
+  return data;
+}
+
+
+static uint16_t* ReadFile(const char* name,
+                          const LexerShellSettings& settings,
+                          unsigned* length) {
+  uint16_t* data = ReadFile(name, length);
+  CHECK_GE(*length, 0);
+  if (*length == 0) return data;
+
+  if (settings.encoding == UTF8TO16_PRECONVERT) {
+    uint16_t* new_data = ConvertUtf8ToUtf16(data, length);
+    delete data;
+    data = new_data;
+  }
+
   if (settings.repeat > 1) {
-    unsigned size = file_size * settings.repeat;
-    uint16_t* data = new uint16_t[size / 2 + size % 2];
-    char_data = reinterpret_cast<uint8_t*>(two_byte_data);
-    for (int i = 0; i < settings.repeat; i++) {
-      memcpy(&char_data[i * file_size], two_byte_data, file_size);
-    }
-    delete two_byte_data;
-    file_size = size;
-    two_byte_data = data;
+    uint16_t* new_data = Repeat(settings.repeat, data, length);
+    delete data;
+    data = new_data;
   }

-  *end = &char_data[file_size];
-  return two_byte_data;
+  return data;
 }


@@ -243,10 +267,11 @@
   const uint8_t* one_byte_source = reinterpret_cast<const uint8_t*>(source);
   int bytes = source_end - one_byte_source;
   switch (settings.encoding) {
+    case UTF8TO16:
     case UTF8:
       stream.Reset(new Utf8ToUtf16CharacterStream(one_byte_source, bytes));
       break;
-    case UTF8TO16:
+    case UTF8TO16_PRECONVERT:
     case UTF16: {
       CHECK_EQ(0, bytes % 2);
       Handle<String> result = isolate->factory()->NewStringFromTwoByte(
@@ -300,9 +325,11 @@
   std::vector<TokenWithLocation> tokens;
   TimeDelta time;
   {
-    const uint8_t* buffer_end = 0;
-    const uint16_t* buffer = ReadFile(fname, &buffer_end, settings);
-    if (truncate_by > buffer_end - reinterpret_cast<const uint8_t*>(buffer)) {
+    unsigned length_in_bytes;
+    const uint16_t* buffer = ReadFile(fname, settings, &length_in_bytes);
+    const uint8_t* char_data = reinterpret_cast<const uint8_t*>(buffer);
+    const uint8_t* buffer_end = &char_data[length_in_bytes];
+    if (truncate_by > buffer_end - char_data) {
       *can_truncate = false;
     } else {
       buffer_end -= truncate_by;
@@ -337,7 +364,11 @@
     } else if (strcmp(argv[i], "--utf16") == 0) {
       settings.encoding = UTF16;
     } else if (strcmp(argv[i], "--utf8to16") == 0) {
+#ifdef V8_USE_GENERATED_LEXER
+      settings.encoding = UTF8TO16_PRECONVERT;
+#else
       settings.encoding = UTF8TO16;
+#endif
     } else if (strcmp(argv[i], "--print-tokens") == 0) {
       settings.print_tokens = true;
     } else if (strcmp(argv[i], "--no-baseline") == 0) {

--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email to [email protected].
For more options, visit https://groups.google.com/d/optout.

Reply via email to