Reviewers: ulan,

Message:
Committed patchset #1 manually as r18155.

Description:
Experimental scanner fix: recognize one-byte strings inside UTF-16 files.

BUG=
[email protected]

Committed: https://code.google.com/p/v8/source/detail?r=18155

Please review this at https://codereview.chromium.org/96173004/

SVN Base: https://v8.googlecode.com/svn/branches/experimental/parser

Affected files (+19, -2 lines):
  M src/lexer/experimental-scanner.h
  M src/lexer/experimental-scanner.cc
  M src/lexer/lexer-shell.cc


Index: src/lexer/experimental-scanner.cc
diff --git a/src/lexer/experimental-scanner.cc b/src/lexer/experimental-scanner.cc
index 120c33660411b6b584d5d7e919bc7f6ce6344f2a..a314e63e0959f41562f09fb284cb613a8f60f010 100644
--- a/src/lexer/experimental-scanner.cc
+++ b/src/lexer/experimental-scanner.cc
@@ -118,9 +118,24 @@ bool ExperimentalScanner<uint16_t>::FillLiteral(
     --end;
   }
   if (!token.has_escapes) {
-    literal->is_ascii = false;  // FIXME: utf16 can contain only ascii chars.
+    // UTF-16 can also contain only one byte chars. Note that is_ascii here
+    // means is_onebyte.
+    literal->is_ascii = true;
+    literal->buffer.Reset();
+    for (const uint16_t* cursor = start; cursor != end; ++cursor) {
+      if (*cursor >= unibrow::Latin1::kMaxChar) {
+        literal->is_ascii = false;
+        break;
+      }
+      literal->buffer.AddChar(*cursor);
+    }
     literal->length = end - start;
-    literal->utf16_string = Vector<const uint16_t>(start, literal->length);
+    if (literal->is_ascii) {
+      literal->ascii_string = literal->buffer.ascii_literal();
+    } else {
+      literal->buffer.Reset();
+      literal->utf16_string = Vector<const uint16_t>(start, literal->length);
+    }
     return true;
   }
   literal->buffer.Reset();
Index: src/lexer/experimental-scanner.h
diff --git a/src/lexer/experimental-scanner.h b/src/lexer/experimental-scanner.h
index fea691a2246be1f6ac6ca5c9f8cd065c12da41eb..7f81565e652db36b77f22747d093b1c36ab9a7b5 100644
--- a/src/lexer/experimental-scanner.h
+++ b/src/lexer/experimental-scanner.h
@@ -163,6 +163,7 @@ class ScannerBase {
     return current_literal_->length;
   }

+  // This should be is_onebyte or is_latin1; it doesn't mean ASCII for real.
   bool is_literal_ascii() {
     if (!current_literal_->Valid(current_.beg_pos)) {
       FillLiteral(current_, current_literal_);
Index: src/lexer/lexer-shell.cc
diff --git a/src/lexer/lexer-shell.cc b/src/lexer/lexer-shell.cc
index af50c9c9e65564a3778178d154e6cd24adc422dd..83d273935b20fdd14983d4bd7657fe38ea91a8ff 100644
--- a/src/lexer/lexer-shell.cc
+++ b/src/lexer/lexer-shell.cc
@@ -196,6 +196,7 @@ struct TokenWithLocation {
       for (size_t i = 0; i < literal.size(); i++) {
         printf(is_ascii ? " %02x" : " %04x", literal[i]);
       }
+      printf(" (is ascii: %d)", is_ascii);
     }
     printf(" (last octal start: %d)\n", octal_beg);
   }


--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.

Reply via email to