Reviewers: ulan,
Message:
Committed patchset #1 manually as r18155.
Description:
Experimental scanner fix: recognize one-byte strings inside UTF-16 files.
BUG=
[email protected]
Committed: https://code.google.com/p/v8/source/detail?r=18155
Please review this at https://codereview.chromium.org/96173004/
SVN Base: https://v8.googlecode.com/svn/branches/experimental/parser
Affected files (+19, -2 lines):
M src/lexer/experimental-scanner.h
M src/lexer/experimental-scanner.cc
M src/lexer/lexer-shell.cc
Index: src/lexer/experimental-scanner.cc
diff --git a/src/lexer/experimental-scanner.cc b/src/lexer/experimental-scanner.cc
index 120c33660411b6b584d5d7e919bc7f6ce6344f2a..a314e63e0959f41562f09fb284cb613a8f60f010 100644
--- a/src/lexer/experimental-scanner.cc
+++ b/src/lexer/experimental-scanner.cc
@@ -118,9 +118,24 @@ bool ExperimentalScanner<uint16_t>::FillLiteral(
--end;
}
if (!token.has_escapes) {
- literal->is_ascii = false; // FIXME: utf16 can contain only ascii chars.
+ // UTF-16 can also contain only one byte chars. Note that is_ascii here
+ // means is_onebyte.
+ literal->is_ascii = true;
+ literal->buffer.Reset();
+ for (const uint16_t* cursor = start; cursor != end; ++cursor) {
+ if (*cursor >= unibrow::Latin1::kMaxChar) {
+ literal->is_ascii = false;
+ break;
+ }
+ literal->buffer.AddChar(*cursor);
+ }
literal->length = end - start;
- literal->utf16_string = Vector<const uint16_t>(start, literal->length);
+ if (literal->is_ascii) {
+ literal->ascii_string = literal->buffer.ascii_literal();
+ } else {
+ literal->buffer.Reset();
+ literal->utf16_string = Vector<const uint16_t>(start, literal->length);
+ }
return true;
}
literal->buffer.Reset();
Index: src/lexer/experimental-scanner.h
diff --git a/src/lexer/experimental-scanner.h b/src/lexer/experimental-scanner.h
index fea691a2246be1f6ac6ca5c9f8cd065c12da41eb..7f81565e652db36b77f22747d093b1c36ab9a7b5 100644
--- a/src/lexer/experimental-scanner.h
+++ b/src/lexer/experimental-scanner.h
@@ -163,6 +163,7 @@ class ScannerBase {
return current_literal_->length;
}
+ // This should be is_onebyte or is_latin1; it doesn't mean ASCII for real.
bool is_literal_ascii() {
if (!current_literal_->Valid(current_.beg_pos)) {
FillLiteral(current_, current_literal_);
Index: src/lexer/lexer-shell.cc
diff --git a/src/lexer/lexer-shell.cc b/src/lexer/lexer-shell.cc
index af50c9c9e65564a3778178d154e6cd24adc422dd..83d273935b20fdd14983d4bd7657fe38ea91a8ff 100644
--- a/src/lexer/lexer-shell.cc
+++ b/src/lexer/lexer-shell.cc
@@ -196,6 +196,7 @@ struct TokenWithLocation {
for (size_t i = 0; i < literal.size(); i++) {
printf(is_ascii ? " %02x" : " %04x", literal[i]);
}
+ printf(" (is ascii: %d)", is_ascii);
}
printf(" (last octal start: %d)\n", octal_beg);
}
--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.