[v8-dev] Streaming API: detect UTF-8 BOM. (issue 708823002 by [email protected])

marja Fri, 07 Nov 2014 01:17:18 -0800

Reviewers: Yang,

Message:
yangguo@, I'm having fun with UTF-8 again. Ptal.


Description:
Streaming API: detect UTF-8 BOM.

It used to cause lazy function positions to be off by one.

BUG=430890
LOG=N

Please review this at https://codereview.chromium.org/708823002/

Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge

Affected files (+67, -18 lines):
  M src/scanner-character-streams.h
  M src/scanner-character-streams.cc
  M test/cctest/test-api.cc


Index: src/scanner-character-streams.cc

diff --git a/src/scanner-character-streams.cc b/src/scanner-character-streams.cc
index 50c3955c1bdc888200f593cad0f45ec69900ae18..0dec5daed4aa321a458c69041fe82def9fa941f1 100644

--- a/src/scanner-character-streams.cc
+++ b/src/scanner-character-streams.cc

@@ -232,14 +232,12 @@ unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position) {


 static const byte kUtf8MultiByteMask = 0xC0;
 static const byte kUtf8MultiByteCharFollower = 0x80;
+static const byte kUtf8MultiByteCharStart = 0xC0;


-#ifdef DEBUG
-static const byte kUtf8MultiByteCharStart = 0xC0;
 static bool IsUtf8MultiCharacterStart(byte first_byte) {
   return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart;
 }
-#endif


 static bool IsUtf8MultiCharacterFollower(byte later_byte) {

@@ -341,6 +339,14 @@ unsigned ExternalStreamingStream::FillBuffer(unsigned position) {
       // A caveat: a data chunk might end with bytes from an incomplete UTF-8

       // character (the rest of the bytes will be in the next chunk).
       if (encoding_ == ScriptCompiler::StreamedSource::UTF8) {
+        if (first_chunk_) {
+          // Get rid of the byte order mark (if any).
+          if (current_data_length_ >= 3 && current_data_[0] == 0xef &&
+              current_data_[1] == 0xbb && current_data_[2] == 0xbf) {
+            current_data_offset_ = 3;
+          }
+        }
+
         HandleUtf8SplitCharacters(&data_in_buffer);
         if (!data_ends && current_data_offset_ == current_data_length_) {
           // The data stream didn't end, but we used all the data in the

@@ -360,6 +366,8 @@ unsigned ExternalStreamingStream::FillBuffer(unsigned position) {

         DCHECK(utf8_split_char_buffer_length_ == 0);
         return data_in_buffer;
       }
+
+      first_chunk_ = false;
     }

     // Fill the buffer from current_data_.

@@ -396,11 +404,11 @@ void ExternalStreamingStream::HandleUtf8SplitCharacters(

   unibrow::uchar c;
   if (utf8_split_char_buffer_length_ > 0) {

     // Move the bytes which are part of the split character (which started in

-    // the previous chunk) into utf8_split_char_buffer_. Note that the
-    // continuation bytes are of the form 0b10XXXXXX, thus c >> 6 == 2.
-    while (current_data_offset_ < current_data_length_ &&
-           utf8_split_char_buffer_length_ < 4 &&
-           (c = current_data_[current_data_offset_]) >> 6 == 2) {
+    // the previous chunk) into utf8_split_char_buffer_.
+    while (
+        current_data_offset_ < current_data_length_ &&
+        utf8_split_char_buffer_length_ < 4 &&

+        IsUtf8MultiCharacterFollower(c = current_data_[current_data_offset_])) {

       utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c;
       ++utf8_split_char_buffer_length_;
       ++current_data_offset_;

@@ -426,15 +434,16 @@ void ExternalStreamingStream::HandleUtf8SplitCharacters(
   // bytes long, but if the data is invalid, we can have character values bigger

   // than unibrow::Utf8::kMaxOneByteChar for more than 4 consecutive bytes.
   while (current_data_length_ > current_data_offset_ &&
-         (c = current_data_[current_data_length_ - 1]) >
-             unibrow::Utf8::kMaxOneByteChar &&
-         utf8_split_char_buffer_length_ < 4) {
+         utf8_split_char_buffer_length_ < 4 &&
+         (IsUtf8MultiCharacterFollower(
+              c = current_data_[current_data_length_ - 1]) ||
+          IsUtf8MultiCharacterStart(c))) {
     --current_data_length_;
     ++utf8_split_char_buffer_length_;
-    if (c >= (3 << 6)) {
-      // 3 << 6 = 0b11000000; this is the first byte of the multi-byte

-      // character. No need to copy the previous characters into the conversion

-      // buffer (even if they're multi-byte).
+    if (IsUtf8MultiCharacterStart(c)) {

+      // This is the first byte of the multi-byte character. No need to copy the

+      // previous characters into the conversion buffer (even if they're
+      // multi-byte).
       break;
     }
   }
Index: src/scanner-character-streams.h

diff --git a/src/scanner-character-streams.h b/src/scanner-character-streams.h
index ff22de59f72a370d93abe1f67eb154eb6b2c1874..ac399f67200bb11e6d26328dd8fefa8266281080 100644

--- a/src/scanner-character-streams.h
+++ b/src/scanner-character-streams.h

@@ -87,7 +87,8 @@ class ExternalStreamingStream : public BufferedUtf16CharacterStream {

         current_data_(NULL),
         current_data_offset_(0),
         current_data_length_(0),
-        utf8_split_char_buffer_length_(0) {}
+        utf8_split_char_buffer_length_(0),
+        first_chunk_(true) {}

   virtual ~ExternalStreamingStream() { delete[] current_data_; }

@@ -113,6 +114,7 @@ class ExternalStreamingStream : public BufferedUtf16CharacterStream {
   // For converting UTF-8 characters which are split across two data chunks.

   uint8_t utf8_split_char_buffer_[4];
   unsigned utf8_split_char_buffer_length_;
+  bool first_chunk_;
 };


Index: test/cctest/test-api.cc
diff --git a/test/cctest/test-api.cc b/test/cctest/test-api.cc

index 5fa2a2fb3a0af7e0102c0b5fc980281db03ab6d6..25c5049e45140155574f027fe8a29bf8d2f061d8 100644

--- a/test/cctest/test-api.cc
+++ b/test/cctest/test-api.cc

@@ -23848,15 +23848,31 @@ class TestSourceStream : public v8::ScriptCompiler::ExternalSourceStream {

   // too).
   static char* FullSourceString(const char** chunks) {
     size_t total_len = 0;
+    bool has_bom = false;
     for (size_t i = 0; chunks[i] != NULL; ++i) {
       total_len += strlen(chunks[i]);
+

+      // Remove BOM when constructing the full source string; this simulates

+      // what the embedder does.
+      if (i == 0 && strlen(chunks[i]) >= 3 &&
+          chunks[i][0] == static_cast<char>(0xef) &&
+          chunks[i][1] == static_cast<char>(0xbb) &&
+          chunks[i][2] == static_cast<char>(0xbf)) {
+        total_len -= 3;
+        has_bom = true;
+      }
     }
     char* full_string = new char[total_len + 1];
     size_t offset = 0;
     for (size_t i = 0; chunks[i] != NULL; ++i) {
       size_t len = strlen(chunks[i]);
-      memcpy(full_string + offset, chunks[i], len);
-      offset += len;
+      if (has_bom) {
+        memcpy(full_string + offset, chunks[i] + 3, len - 3);
+        offset += len - 3;
+      } else {
+        memcpy(full_string + offset, chunks[i], len);
+        offset += len;
+      }
     }
     full_string[total_len] = 0;
     return full_string;
@@ -23900,6 +23916,7 @@ void RunStreamingTest(const char** chunks,
     CHECK(!script.IsEmpty());
     v8::Handle<Value> result(script->Run());
     // All scripts are supposed to return the fixed value 13 when ran.
+    CHECK_EQ(false, try_catch.HasCaught());
     CHECK_EQ(13, result->Int32Value());
   } else {
     CHECK(script.IsEmpty());

@@ -24191,3 +24208,24 @@ TEST(StreamingUtf8ScriptWithMultipleMultibyteCharactersSomeSplit2) {

   const char* chunks[] = {chunk1, chunk2, "foo();", NULL};
   RunStreamingTest(chunks, v8::ScriptCompiler::StreamedSource::UTF8);
 }
+
+
+TEST(StreamingUtf8ScriptWithByteOrderMark) {

+  // Some Windows editors add the "UTF-8 BOM". If we don't handle it properly,
+  // some scripts might be parsed successfully but the positions of lazy

+  // functions will be off.
+  i::FLAG_min_preparse_length = 0;
+  i::FLAG_lazy = true;

+  // Note that in this case the byte order mark doesn't actually cause a parse

+  // error, since it's a UTF-8 character followed by a comment.
+  char chunk1[] =
+      "XXX// That's the BOM.\n"
+      "function this_is_lazy() {\n"
+      "  return 13;\n"
+      "}\n";
+  chunk1[0] = static_cast<char>(0xef);
+  chunk1[1] = static_cast<char>(0xbb);
+  chunk1[2] = static_cast<char>(0xbf);
+  const char* chunks[] = {chunk1, "this_is_lazy();", NULL};
+  RunStreamingTest(chunks, v8::ScriptCompiler::StreamedSource::UTF8);
+}


--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev

---
You received this message because you are subscribed to the Google Groups "v8-dev" group.

To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/d/optout.

[v8-dev] Streaming API: detect UTF-8 BOM. (issue 708823002 by [email protected])

Reply via email to