[v8-dev] Experimental parser: add utf8tolatin1 conversion (issue 196943021)

dcarney Mon, 17 Mar 2014 07:05:46 -0700

Reviewers: marja,

Message:
Committed patchset #2 manually as r19999 (tree was closed).


Description:
Experimental parser: add utf8tolatin1 conversion

[email protected]

BUG=

Committed: https://code.google.com/p/v8/source/detail?r=19999

Please review this at https://codereview.chromium.org/196943021/

SVN Base: https://v8.googlecode.com/svn/branches/experimental/parser

Affected files (+75, -14 lines):
  M src/lexer/lexer-shell.cc
  M tools/lexer_generator/test/run_lexing_tests.py


Index: src/lexer/lexer-shell.cc
diff --git a/src/lexer/lexer-shell.cc b/src/lexer/lexer-shell.cc

index 1eb906a3d06fd237442c1895401685aa66abdc47..b180a6681804f1f1f71ac92a80f5299eda006ad5 100644

--- a/src/lexer/lexer-shell.cc
+++ b/src/lexer/lexer-shell.cc
@@ -53,7 +53,7 @@ enum Encoding {
   UTF8,
   UTF16,
   UTF8TO16,  // Convert stream via scanner input stream
-  UTF8TO16_PRECONVERT  // Convert stream during file read
+  UTF8TOLATIN1,  // Convert stream via scanner input stream
 };


@@ -81,12 +81,14 @@ struct LexerShellSettings {


 static uint16_t* ConvertUtf8ToUtf16(const uint16_t* const data_in,
-                                    unsigned* length) {
+                                    unsigned* length,
+                                    bool* is_one_byte) {
   const unsigned file_size = *length;
   const uint8_t* char_data = reinterpret_cast<const uint8_t*>(data_in);
   const uint32_t kMaxUtf16Character = 0xffff;
   // Get utf8 length.
   unsigned utf16_chars = 0;
+  *is_one_byte = true;
   {
     unsigned position = 0;
     while (position < file_size) {

@@ -94,6 +96,7 @@ static uint16_t* ConvertUtf8ToUtf16(const uint16_t* const data_in,

       if (c <= unibrow::Utf8::kMaxOneByteChar) {
         position++;
       } else {
+        *is_one_byte = false;
         c =  unibrow::Utf8::CalculateValue(char_data + position,
                                            file_size - position,
                                            &position);

@@ -130,6 +133,17 @@ static uint16_t* ConvertUtf8ToUtf16(const uint16_t* const data_in,

 }


+static uint16_t* ConvertUtf16ToLatin1(const uint16_t* const data_in,
+                                      unsigned* length) {
+  const unsigned size = *length / 2 + *length % 2;
+  uint16_t* data = new uint16_t[size];
+  uint8_t* char_data = reinterpret_cast<uint8_t*>(data);
+  CopyChars(char_data, data_in, size);
+  *length = size;
+  return data;
+}
+
+
 static uint16_t* Repeat(int repeat,
                         const uint16_t* const data_in,
                         unsigned* length) {

@@ -166,13 +180,29 @@ static uint16_t* ReadFile(const char* name, unsigned* length) {


 static uint16_t* ReadFile(const char* name,
                           const LexerShellSettings& settings,
-                          unsigned* length) {
+                          unsigned* length,
+                          Encoding* output_encoding) {
   uint16_t* data = ReadFile(name, length);
   CHECK_GE(*length, 0);
   if (*length == 0) return data;

-  if (settings.encoding == UTF8TO16_PRECONVERT) {
-    uint16_t* new_data = ConvertUtf8ToUtf16(data, length);
+  *output_encoding = settings.encoding;
+
+  if (settings.encoding == UTF8TO16 ||
+      settings.encoding == UTF8TOLATIN1) {
+    bool is_one_byte;
+    uint16_t* new_data = ConvertUtf8ToUtf16(data, length, &is_one_byte);
+    if (settings.encoding == UTF8TOLATIN1 && is_one_byte) {
+      *output_encoding = LATIN1;
+    } else {
+      *output_encoding = UTF16;
+    }
+    delete data;
+    data = new_data;
+  }
+
+  if (settings.encoding == UTF8TOLATIN1 && *output_encoding == LATIN1) {
+    uint16_t* new_data = ConvertUtf16ToLatin1(data, length);
     delete data;
     data = new_data;
   }
@@ -265,16 +295,15 @@ class TokenWithLocation {
 static TimeDelta RunLexer(const uint16_t* source,
                           const uint8_t* source_end,
                           Isolate* isolate,
+                          Encoding output_encoding,
                           const LexerShellSettings& settings) {
   SmartPointer<Utf16CharacterStream> stream;

   const uint8_t* one_byte_source = reinterpret_cast<const uint8_t*>(source);

   int bytes = source_end - one_byte_source;
-  switch (settings.encoding) {
-    case UTF8TO16:
+  switch (output_encoding) {
     case UTF8:
       stream.Reset(new Utf8ToUtf16CharacterStream(one_byte_source, bytes));
       break;
-    case UTF8TO16_PRECONVERT:
     case UTF16: {
       CHECK_EQ(0, bytes % 2);
       Handle<String> result = isolate->factory()->NewStringFromTwoByte(
@@ -290,6 +319,9 @@ static TimeDelta RunLexer(const uint16_t* source,

           new GenericStringUtf16CharacterStream(result, 0, result->length()));

       break;
     }
+    case UTF8TO16:
+    case UTF8TOLATIN1:
+      CHECK(false);
   }
   Scanner scanner(isolate->unicode_cache());
   scanner.SetHarmonyNumericLiterals(settings.harmony_numeric_literals);
@@ -340,14 +372,16 @@ static TimeDelta ProcessFile(
   TimeDelta time;
   {
     unsigned length_in_bytes;
-    const uint16_t* buffer = ReadFile(fname, settings, &length_in_bytes);
+    Encoding output_encoding;
+    const uint16_t* buffer =
+        ReadFile(fname, settings, &length_in_bytes, &output_encoding);
     const uint8_t* char_data = reinterpret_cast<const uint8_t*>(buffer);
     const uint8_t* buffer_end = &char_data[length_in_bytes];
     if (truncate_by > buffer_end - char_data) {
       *can_truncate = false;
     } else {
       buffer_end -= truncate_by;
-      time = RunLexer(buffer, buffer_end, isolate, settings);

+      time = RunLexer(buffer, buffer_end, isolate, output_encoding, settings);

     }
     delete[] buffer;
   }
@@ -370,9 +404,15 @@ int main(int argc, char* argv[]) {
       settings.encoding = UTF16;
     } else if (strcmp(argv[i], "--utf8to16") == 0) {
 #ifdef V8_USE_GENERATED_LEXER
-      settings.encoding = UTF8TO16_PRECONVERT;
-#else
       settings.encoding = UTF8TO16;
+#else
+      settings.encoding = UTF8;
+#endif
+    } else if (strcmp(argv[i], "--utf8tolatin1") == 0) {
+#ifdef V8_USE_GENERATED_LEXER
+      settings.encoding = UTF8TOLATIN1;
+#else
+      settings.encoding = UTF8;
 #endif
     } else if (strcmp(argv[i], "--print-tokens") == 0) {
       settings.print_tokens = true;
Index: tools/lexer_generator/test/run_lexing_tests.py

diff --git a/tools/lexer_generator/test/run_lexing_tests.py b/tools/lexer_generator/test/run_lexing_tests.py
index 971b2587f6560896b9a718a64dcf2ad12c760029..6d950998433703da78b4ac88366b8ab1e7cce11c 100644

--- a/tools/lexer_generator/test/run_lexing_tests.py
+++ b/tools/lexer_generator/test/run_lexing_tests.py
@@ -105,6 +105,26 @@ class ProcessRunner:
     data['buffer'].append(data['process'].stdout.read())
     return ''.join(data['buffer'])

+  @staticmethod
+  def analyse_diff(left_data, right_data):
+    left = left_data.split("\n");
+    right = right_data.split("\n");
+    for i in range(min(len(left), len(right))):
+      if left[i] != right[i]:
+        message = "differ at token %d" % i
+        for j in range(i-4, i-1):
+          if j >= 0:
+            message += "\n\n%s\n%s" % (left[j], right[j])
+        message += "\n\n%s\n%s\n" % (left[i], right[i])
+        logging.info(message)
+        return
+    if len(right) > len(left):
+      logging.info("right longer")
+      return
+    if len(left) > len(right):
+      logging.info("left longer")
+      return
+
   def compare_results(self, left, right):
     f = left['file']
     assert f == right['file']
@@ -117,7 +137,7 @@ class ProcessRunner:
     left_data = self.buffer_contents(left)
     right_data = self.buffer_contents(right)
     if left_data != right_data:
-      # TODO(dcarney): analyse differences
+      self.analyse_diff(left_data, right_data)
       print "%s failed" % f
       return
     print "%s succeeded" % f
@@ -184,7 +204,8 @@ if __name__ == '__main__':
   parser.add_argument('-f', '--single-file', default='')

   parser.add_argument('-p', '--parallel-process-count', default=1, type=int)

   parser.add_argument('-e', '--encoding',
-    choices=['latin1', 'utf8', 'utf8to16', 'utf16'], default='utf8')
+    choices=['latin1', 'utf8', 'utf16', 'utf8to16', 'utf8tolatin1'],
+    default='utf8')
   parser.add_argument('--use-harmony', action='store_true')
   parser.add_argument('-v', '--verbose', action='store_true')
   args = parser.parse_args()


--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev

---
You received this message because you are subscribed to the Google Groups "v8-dev" group.

To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/d/optout.

[v8-dev] Experimental parser: add utf8tolatin1 conversion (issue 196943021)

Reply via email to