[v8-dev] Script streaming: more UTF-8 handling fixes (again). (issue 662003003 by [email protected])

marja Fri, 17 Oct 2014 07:14:53 -0700

Reviewers: Yang,

Message:
yangguo@, I unfortunately had to fix the UTF-8 handling once more, PTAL.


Description:
Script streaming: more UTF-8 handling fixes (again).

1) Since we fill the output buffer both from the chunks and the conversion
buffer, it's possible that we run out of space and call CopyCharsHelper with 0
length. The underlying functions don't handle it gracefully, so check there.

2) There was a bug where we used to try to copy too many characters from the
beginning of the data chunk into the conversion buffer; continuation bytes in
UTF-8 are of the form 0b10XXXXXX. If a byte is bigger than that, it's the first
byte of a new UTF-8 character and we should ignore it.

These two together (or maybe in combination with surrogates) are a probable
reason for crbug.com/420932.

3) The test data was off; \uc481 is \xec\x92\x81.

BUG=420932
LOG=N

Please review this at https://codereview.chromium.org/662003003/

Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge

Affected files (+54, -22 lines):
  M src/scanner-character-streams.cc
  M test/cctest/test-api.cc


Index: src/scanner-character-streams.cc

diff --git a/src/scanner-character-streams.cc b/src/scanner-character-streams.cc
index 732b2b43f6469ee8ce9dcb9e0373678cbf62cdce..50c3955c1bdc888200f593cad0f45ec69900ae18 100644

--- a/src/scanner-character-streams.cc
+++ b/src/scanner-character-streams.cc
@@ -18,6 +18,10 @@ namespace {

 unsigned CopyCharsHelper(uint16_t* dest, unsigned length, const uint8_t* src,
                          unsigned* src_pos, unsigned src_length,
                          ScriptCompiler::StreamedSource::Encoding encoding) {
+  // It's possible that this will be called with length 0, but don't assume that

+  // the functions this calls handle it gracefully.
+  if (length == 0) return 0;
+
   if (encoding == ScriptCompiler::StreamedSource::UTF8) {
     return v8::internal::Utf8ToUtf16CharacterStream::CopyChars(
         dest, length, src, src_pos, src_length);

@@ -381,15 +385,22 @@ unsigned ExternalStreamingStream::FillBuffer(unsigned position) {


 void ExternalStreamingStream::HandleUtf8SplitCharacters(
     unsigned* data_in_buffer) {

+  // Note the following property of UTF-8 which makes this function possible:
+  // Given any byte, we can always read its local environment (in both
+  // directions) to find out the (possibly multi-byte) character it belongs
+  // to. Single byte characters are of the form 0b0XXXXXXX. The first byte of a

+  // multi-byte character is of the form 0b110XXXXX, 0b1110XXXX or
+  // 0b11110XXX. The continuation bytes are of the form 0b10XXXXXX.
+
   // First check if we have leftover data from the last chunk.
   unibrow::uchar c;
   if (utf8_split_char_buffer_length_ > 0) {

     // Move the bytes which are part of the split character (which started in

-    // the previous chunk) into utf8_split_char_buffer_.
+    // the previous chunk) into utf8_split_char_buffer_. Note that the
+    // continuation bytes are of the form 0b10XXXXXX, thus c >> 6 == 2.
     while (current_data_offset_ < current_data_length_ &&
            utf8_split_char_buffer_length_ < 4 &&
-           (c = current_data_[current_data_offset_]) >
-               unibrow::Utf8::kMaxOneByteChar) {
+           (c = current_data_[current_data_offset_]) >> 6 == 2) {
       utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c;
       ++utf8_split_char_buffer_length_;
       ++current_data_offset_;
Index: test/cctest/test-api.cc
diff --git a/test/cctest/test-api.cc b/test/cctest/test-api.cc

index 283bcb8e0d66d869a7faf3751d761b5a3fb5a3c3..35a7156e9e509ec11df1ab63a5d5e3ac4a2fa7f8 100644

--- a/test/cctest/test-api.cc
+++ b/test/cctest/test-api.cc
@@ -23636,14 +23636,14 @@ TEST(StreamingScriptWithParseError) {


 TEST(StreamingUtf8Script) {

-  // We'd want to write \uc481 instead of \xeb\x91\x80, but Windows compilers
+  // We'd want to write \uc481 instead of \xec\x92\x81, but Windows compilers

   // don't like it.
   const char* chunk1 =
       "function foo() {\n"
       "  // This function will contain an UTF-8 character which is not in\n"

       "  // ASCII.\n"
-      "  var foob\xeb\x91\x80r = 13;\n"
-      "  return foob\xeb\x91\x80r;\n"
+      "  var foob\xec\x92\x81r = 13;\n"
+      "  return foob\xec\x92\x81r;\n"
       "}\n";
   const char* chunks[] = {chunk1, "foo(); ", NULL};
   RunStreamingTest(chunks, v8::ScriptCompiler::StreamedSource::UTF8);

@@ -23654,7 +23654,7 @@TEST(StreamingUtf8ScriptWithSplitCharactersSanityCheck) {

   // A sanity check to prove that the approach of splitting UTF-8

   // characters is correct. Here is an UTF-8 character which will take three

   // bytes.
-  const char* reference = "\xeb\x91\x80";
+  const char* reference = "\xec\x92\x81";
   CHECK(3u == strlen(reference));  // NOLINT - no CHECK_EQ for unsigned.

   char chunk1[] =

@@ -23664,7 +23664,7 @@TEST(StreamingUtf8ScriptWithSplitCharactersSanityCheck) {

       "  var foob";
   char chunk2[] =
       "XXXr = 13;\n"
-      "  return foob\xeb\x91\x80r;\n"
+      "  return foob\xec\x92\x81r;\n"
       "}\n";
   for (int i = 0; i < 3; ++i) {
     chunk2[i] = reference[i];

@@ -23677,7 +23677,7 @@TEST(StreamingUtf8ScriptWithSplitCharactersSanityCheck) {

 TEST(StreamingUtf8ScriptWithSplitCharacters) {

   // Stream data where a multi-byte UTF-8 character is split between two data

   // chunks.
-  const char* reference = "\xeb\x91\x80";
+  const char* reference = "\xec\x92\x81";
   char chunk1[] =
       "function foo() {\n"
       "  // This function will contain an UTF-8 character which is not in\n"

@@ -23685,7 +23685,7 @@ TEST(StreamingUtf8ScriptWithSplitCharacters) {
       "  var foobX";
   char chunk2[] =
       "XXr = 13;\n"
-      "  return foob\xeb\x91\x80r;\n"
+      "  return foob\xec\x92\x81r;\n"
       "}\n";
   chunk1[strlen(chunk1) - 1] = reference[0];
   chunk2[0] = reference[1];

@@ -23701,7 +23701,7 @@ TEST(StreamingUtf8ScriptWithSplitCharactersValidEdgeCases) {
   // Case 1: a chunk contains only bytes for a split character (and no other
   // data). This kind of a chunk would be exceptionally small, but we should

   // still decode it correctly.
-  const char* reference = "\xeb\x91\x80";
+  const char* reference = "\xec\x92\x81";
   // The small chunk is at the beginning of the split character
   {
     char chunk1[] =

@@ -23712,7 +23712,7 @@TEST(StreamingUtf8ScriptWithSplitCharactersValidEdgeCases) {

     char chunk2[] = "XX";
     char chunk3[] =
         "Xr = 13;\n"
-        "  return foob\xeb\x91\x80r;\n"
+        "  return foob\xec\x92\x81r;\n"
         "}\n";
     chunk2[0] = reference[0];
     chunk2[1] = reference[1];

@@ -23730,7 +23730,7 @@TEST(StreamingUtf8ScriptWithSplitCharactersValidEdgeCases) {

     char chunk2[] = "XX";
     char chunk3[] =
         "r = 13;\n"
-        "  return foob\xeb\x91\x80r;\n"
+        "  return foob\xec\x92\x81r;\n"
         "}\n";
     chunk1[strlen(chunk1) - 1] = reference[0];
     chunk2[0] = reference[1];

@@ -23742,8 +23742,8 @@TEST(StreamingUtf8ScriptWithSplitCharactersValidEdgeCases) {

   // decoded correctly and not just ignored.
   {
     char chunk1[] =
-        "var foob\xeb\x91\x80 = 13;\n"
-        "foob\xeb\x91\x80";
+        "var foob\xec\x92\x81 = 13;\n"
+        "foob\xec\x92\x81";
     const char* chunks[] = {chunk1, NULL};
     RunStreamingTest(chunks, v8::ScriptCompiler::StreamedSource::UTF8);
   }

@@ -23754,7 +23754,7 @@TEST(StreamingUtf8ScriptWithSplitCharactersInvalidEdgeCases) {

   // Test cases where a UTF-8 character is split over several chunks. Those

   // cases are not supported (the embedder should give the data in big enough

   // chunks), but we shouldn't crash, just produce a parse error.
-  const char* reference = "\xeb\x91\x80";
+  const char* reference = "\xec\x92\x81";
   char chunk1[] =
       "function foo() {\n"
       "  // This function will contain an UTF-8 character which is not in\n"
@@ -23763,7 +23763,7 @@ TEST(StreamingUtf8ScriptWithSplitCharactersInvalidEdgeCases) {

   char chunk2[] = "X";
   char chunk3[] =
       "Xr = 13;\n"
-      "  return foob\xeb\x91\x80r;\n"
+      "  return foob\xec\x92\x81r;\n"
       "}\n";
   chunk1[strlen(chunk1) - 1] = reference[0];
   chunk2[0] = reference[1];
@@ -23805,7 +23805,7 @@ TEST(StreamingProducesParserCache) {
 TEST(StreamingScriptWithInvalidUtf8) {

   // Regression test for a crash: test that invalid UTF-8 bytes in the end of a

   // chunk don't produce a crash.
-  const char* reference = "\xeb\x91\x80\x80\x80";
+  const char* reference = "\xec\x92\x81\x80\x80";
   char chunk1[] =
       "function foo() {\n"
       "  // This function will contain an UTF-8 character which is not in\n"
@@ -23813,7 +23813,7 @@ TEST(StreamingScriptWithInvalidUtf8) {
       "  var foobXXXXX";  // Too many bytes which look like incomplete chars!

   char chunk2[] =
       "r = 13;\n"
-      "  return foob\xeb\x91\x80\x80\x80r;\n"
+      "  return foob\xec\x92\x81\x80\x80r;\n"
       "}\n";

   for (int i = 0; i < 5; ++i) chunk1[strlen(chunk1) - 5 + i] = reference[i];


@@ -23825,15 +23825,36 @@ TEST(StreamingScriptWithInvalidUtf8) {
 TEST(StreamingUtf8ScriptWithMultipleMultibyteCharactersSomeSplit) {
   // Regression test: Stream data where there are several multi-byte UTF-8

   // characters in a sequence and one of them is split between two data chunks.

-  const char* reference = "\xeb\x91\x80";
+  const char* reference = "\xec\x92\x81";
   char chunk1[] =
       "function foo() {\n"
       "  // This function will contain an UTF-8 character which is not in\n"

       "  // ASCII.\n"
-      "  var foob\xeb\x91\x80X";
+      "  var foob\xec\x92\x81X";
   char chunk2[] =
       "XXr = 13;\n"
-      "  return foob\xeb\x91\x80\xeb\x91\x80r;\n"
+      "  return foob\xec\x92\x81\xec\x92\x81r;\n"
+      "}\n";
+  chunk1[strlen(chunk1) - 1] = reference[0];
+  chunk2[0] = reference[1];
+  chunk2[1] = reference[2];
+  const char* chunks[] = {chunk1, chunk2, "foo();", NULL};
+  RunStreamingTest(chunks, v8::ScriptCompiler::StreamedSource::UTF8);
+}
+
+
+TEST(StreamingUtf8ScriptWithMultipleMultibyteCharactersSomeSplit2) {

+  // Another regression test, similar to the previous one. The difference is

+  // that the split character is not the last one in the sequence.
+  const char* reference = "\xec\x92\x81";
+  char chunk1[] =
+      "function foo() {\n"
+      "  // This function will contain an UTF-8 character which is not in\n"

+      "  // ASCII.\n"
+      "  var foobX";
+  char chunk2[] =
+      "XX\xec\x92\x81r = 13;\n"
+      "  return foob\xec\x92\x81\xec\x92\x81r;\n"
       "}\n";
   chunk1[strlen(chunk1) - 1] = reference[0];
   chunk2[0] = reference[1];


--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev

---
You received this message because you are subscribed to the Google Groups "v8-dev" group.

To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/d/optout.

[v8-dev] Script streaming: more UTF-8 handling fixes (again). (issue 662003003 by [email protected])

Reply via email to