Reviewers: Yang,
Message:
yangguo@, I'm having fun with UTF-8 again. Ptal.
Description:
Streaming API: detect UTF-8 BOM.
It used to cause lazy function positions to be off by one.
BUG=430890
LOG=N
Please review this at https://codereview.chromium.org/708823002/
Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge
Affected files (+67, -18 lines):
M src/scanner-character-streams.h
M src/scanner-character-streams.cc
M test/cctest/test-api.cc
Index: src/scanner-character-streams.cc
diff --git a/src/scanner-character-streams.cc
b/src/scanner-character-streams.cc
index
50c3955c1bdc888200f593cad0f45ec69900ae18..0dec5daed4aa321a458c69041fe82def9fa941f1
100644
--- a/src/scanner-character-streams.cc
+++ b/src/scanner-character-streams.cc
@@ -232,14 +232,12 @@ unsigned
Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position) {
static const byte kUtf8MultiByteMask = 0xC0;
static const byte kUtf8MultiByteCharFollower = 0x80;
+static const byte kUtf8MultiByteCharStart = 0xC0;
-#ifdef DEBUG
-static const byte kUtf8MultiByteCharStart = 0xC0;
static bool IsUtf8MultiCharacterStart(byte first_byte) {
return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart;
}
-#endif
static bool IsUtf8MultiCharacterFollower(byte later_byte) {
@@ -341,6 +339,14 @@ unsigned ExternalStreamingStream::FillBuffer(unsigned
position) {
// A caveat: a data chunk might end with bytes from an incomplete
UTF-8
// character (the rest of the bytes will be in the next chunk).
if (encoding_ == ScriptCompiler::StreamedSource::UTF8) {
+ if (first_chunk_) {
+ // Get rid of the byte order mark (if any).
+ if (current_data_length_ >= 3 && current_data_[0] == 0xef &&
+ current_data_[1] == 0xbb && current_data_[2] == 0xbf) {
+ current_data_offset_ = 3;
+ }
+ }
+
HandleUtf8SplitCharacters(&data_in_buffer);
if (!data_ends && current_data_offset_ == current_data_length_) {
// The data stream didn't end, but we used all the data in the
@@ -360,6 +366,8 @@ unsigned ExternalStreamingStream::FillBuffer(unsigned
position) {
DCHECK(utf8_split_char_buffer_length_ == 0);
return data_in_buffer;
}
+
+ first_chunk_ = false;
}
// Fill the buffer from current_data_.
@@ -396,11 +404,11 @@ void
ExternalStreamingStream::HandleUtf8SplitCharacters(
unibrow::uchar c;
if (utf8_split_char_buffer_length_ > 0) {
// Move the bytes which are part of the split character (which started
in
- // the previous chunk) into utf8_split_char_buffer_. Note that the
- // continuation bytes are of the form 0b10XXXXXX, thus c >> 6 == 2.
- while (current_data_offset_ < current_data_length_ &&
- utf8_split_char_buffer_length_ < 4 &&
- (c = current_data_[current_data_offset_]) >> 6 == 2) {
+ // the previous chunk) into utf8_split_char_buffer_.
+ while (
+ current_data_offset_ < current_data_length_ &&
+ utf8_split_char_buffer_length_ < 4 &&
+ IsUtf8MultiCharacterFollower(c =
current_data_[current_data_offset_])) {
utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c;
++utf8_split_char_buffer_length_;
++current_data_offset_;
@@ -426,15 +434,16 @@ void
ExternalStreamingStream::HandleUtf8SplitCharacters(
// bytes long, but if the data is invalid, we can have character values
bigger
// than unibrow::Utf8::kMaxOneByteChar for more than 4 consecutive bytes.
while (current_data_length_ > current_data_offset_ &&
- (c = current_data_[current_data_length_ - 1]) >
- unibrow::Utf8::kMaxOneByteChar &&
- utf8_split_char_buffer_length_ < 4) {
+ utf8_split_char_buffer_length_ < 4 &&
+ (IsUtf8MultiCharacterFollower(
+ c = current_data_[current_data_length_ - 1]) ||
+ IsUtf8MultiCharacterStart(c))) {
--current_data_length_;
++utf8_split_char_buffer_length_;
- if (c >= (3 << 6)) {
- // 3 << 6 = 0b11000000; this is the first byte of the multi-byte
- // character. No need to copy the previous characters into the
conversion
- // buffer (even if they're multi-byte).
+ if (IsUtf8MultiCharacterStart(c)) {
+ // This is the first byte of the multi-byte character. No need to
copy the
+ // previous characters into the conversion buffer (even if they're
+ // multi-byte).
break;
}
}
Index: src/scanner-character-streams.h
diff --git a/src/scanner-character-streams.h
b/src/scanner-character-streams.h
index
ff22de59f72a370d93abe1f67eb154eb6b2c1874..ac399f67200bb11e6d26328dd8fefa8266281080
100644
--- a/src/scanner-character-streams.h
+++ b/src/scanner-character-streams.h
@@ -87,7 +87,8 @@ class ExternalStreamingStream : public
BufferedUtf16CharacterStream {
current_data_(NULL),
current_data_offset_(0),
current_data_length_(0),
- utf8_split_char_buffer_length_(0) {}
+ utf8_split_char_buffer_length_(0),
+ first_chunk_(true) {}
virtual ~ExternalStreamingStream() { delete[] current_data_; }
@@ -113,6 +114,7 @@ class ExternalStreamingStream : public
BufferedUtf16CharacterStream {
// For converting UTF-8 characters which are split across two data
chunks.
uint8_t utf8_split_char_buffer_[4];
unsigned utf8_split_char_buffer_length_;
+ bool first_chunk_;
};
Index: test/cctest/test-api.cc
diff --git a/test/cctest/test-api.cc b/test/cctest/test-api.cc
index
5fa2a2fb3a0af7e0102c0b5fc980281db03ab6d6..25c5049e45140155574f027fe8a29bf8d2f061d8
100644
--- a/test/cctest/test-api.cc
+++ b/test/cctest/test-api.cc
@@ -23848,15 +23848,31 @@ class TestSourceStream : public
v8::ScriptCompiler::ExternalSourceStream {
// too).
static char* FullSourceString(const char** chunks) {
size_t total_len = 0;
+ bool has_bom = false;
for (size_t i = 0; chunks[i] != NULL; ++i) {
total_len += strlen(chunks[i]);
+
+ // Remove BOM when constructing the full source string; this
simluates
+ // what the embedder does.
+ if (i == 0 && strlen(chunks[i]) >= 3 &&
+ chunks[i][0] == static_cast<char>(0xef) &&
+ chunks[i][1] == static_cast<char>(0xbb) &&
+ chunks[i][2] == static_cast<char>(0xbf)) {
+ total_len -= 3;
+ has_bom = true;
+ }
}
char* full_string = new char[total_len + 1];
size_t offset = 0;
for (size_t i = 0; chunks[i] != NULL; ++i) {
size_t len = strlen(chunks[i]);
- memcpy(full_string + offset, chunks[i], len);
- offset += len;
+ if (has_bom) {
+ memcpy(full_string + offset, chunks[i] + 3, len - 3);
+ offset += len - 3;
+ } else {
+ memcpy(full_string + offset, chunks[i], len);
+ offset += len;
+ }
}
full_string[total_len] = 0;
return full_string;
@@ -23900,6 +23916,7 @@ void RunStreamingTest(const char** chunks,
CHECK(!script.IsEmpty());
v8::Handle<Value> result(script->Run());
// All scripts are supposed to return the fixed value 13 when ran.
+ CHECK_EQ(false, try_catch.HasCaught());
CHECK_EQ(13, result->Int32Value());
} else {
CHECK(script.IsEmpty());
@@ -24191,3 +24208,24 @@
TEST(StreamingUtf8ScriptWithMultipleMultibyteCharactersSomeSplit2) {
const char* chunks[] = {chunk1, chunk2, "foo();", NULL};
RunStreamingTest(chunks, v8::ScriptCompiler::StreamedSource::UTF8);
}
+
+
+TEST(StreamingUtf8ScriptWithByteOrderMark) {
+ // Some Windows editors add the "UTF-8 BOM". If we don't handle it
properly,
+ // some scripts might be parsed successfully but but the positions of
lazy
+ // functions will be off.
+ i::FLAG_min_preparse_length = 0;
+ i::FLAG_lazy = true;
+ // Note that in this case the byte order mark doesn't actually cause a
parse
+ // error, since it's a UTF-8 character followed by a comment.
+ char chunk1[] =
+ "XXX// That's the BOM.\n"
+ "function this_is_lazy() {\n"
+ " return 13;\n"
+ "}\n";
+ chunk1[0] = static_cast<char>(0xef);
+ chunk1[1] = static_cast<char>(0xbb);
+ chunk1[2] = static_cast<char>(0xbf);
+ const char* chunks[] = {chunk1, "this_is_lazy();", NULL};
+ RunStreamingTest(chunks, v8::ScriptCompiler::StreamedSource::UTF8);
+}
--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
For more options, visit https://groups.google.com/d/optout.