Title: [266528] trunk
Revision
266528
Author
[email protected]
Date
2020-09-03 09:57:49 -0700 (Thu, 03 Sep 2020)

Log Message

TextDecoder should ignore byte-order-mark like other browsers and spec
https://bugs.webkit.org/show_bug.cgi?id=216108

Reviewed by Darin Adler.

LayoutTests/imported/w3c:

* web-platform-tests/encoding/streams/decode-ignore-bom.any-expected.txt:
* web-platform-tests/encoding/textdecoder-ignorebom.any-expected.txt:

Source/WebCore:

Covered by newly passing web platform tests.

* dom/TextDecoder.cpp:
(WebCore::TextDecoder::ignoreBOMIfNecessary):
(WebCore::TextDecoder::decode):
(WebCore::TextDecoder::prependBOMIfNecessary): Deleted.
* dom/TextDecoder.h:

Modified Paths

Diff

Modified: trunk/LayoutTests/imported/w3c/ChangeLog (266527 => 266528)


--- trunk/LayoutTests/imported/w3c/ChangeLog	2020-09-03 16:33:59 UTC (rev 266527)
+++ trunk/LayoutTests/imported/w3c/ChangeLog	2020-09-03 16:57:49 UTC (rev 266528)
@@ -1,5 +1,15 @@
 2020-09-03  Alex Christensen  <[email protected]>
 
+        TextDecoder should ignore byte-order-mark like other browsers and spec
+        https://bugs.webkit.org/show_bug.cgi?id=216108
+
+        Reviewed by Darin Adler.
+
+        * web-platform-tests/encoding/streams/decode-ignore-bom.any-expected.txt:
+        * web-platform-tests/encoding/textdecoder-ignorebom.any-expected.txt:
+
+2020-09-03  Alex Christensen  <[email protected]>
+
         Align ISO-8859-{3,6,7,8,8-I} and windows-{874,1253,1255,1257} encodings with Chrome, Firefox, and the specification
         https://bugs.webkit.org/show_bug.cgi?id=216094
 

Modified: trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/streams/decode-ignore-bom.any-expected.txt (266527 => 266528)


--- trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/streams/decode-ignore-bom.any-expected.txt	2020-09-03 16:33:59 UTC (rev 266527)
+++ trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/streams/decode-ignore-bom.any-expected.txt	2020-09-03 16:57:49 UTC (rev 266528)
@@ -1,14 +1,14 @@
 
 PASS ignoreBOM should work for encoding utf-8, split at character 0 
-FAIL ignoreBOM should work for encoding utf-8, split at character 1 assert_equals: BOM should be stripped expected "abc" but got "abc"
-FAIL ignoreBOM should work for encoding utf-8, split at character 2 assert_equals: BOM should be stripped expected "abc" but got "abc"
+PASS ignoreBOM should work for encoding utf-8, split at character 1 
+PASS ignoreBOM should work for encoding utf-8, split at character 2 
 PASS ignoreBOM should work for encoding utf-8, split at character 3 
 PASS ignoreBOM should work for encoding utf-16le, split at character 0 
-FAIL ignoreBOM should work for encoding utf-16le, split at character 1 assert_equals: BOM should be stripped expected "abc" but got "abc"
+PASS ignoreBOM should work for encoding utf-16le, split at character 1 
 PASS ignoreBOM should work for encoding utf-16le, split at character 2 
-FAIL ignoreBOM should work for encoding utf-16le, split at character 3 assert_equals: BOM should be preserved expected "abc" but got "abc"
+PASS ignoreBOM should work for encoding utf-16le, split at character 3 
 PASS ignoreBOM should work for encoding utf-16be, split at character 0 
-FAIL ignoreBOM should work for encoding utf-16be, split at character 1 assert_equals: BOM should be stripped expected "abc" but got "abc"
+PASS ignoreBOM should work for encoding utf-16be, split at character 1 
 PASS ignoreBOM should work for encoding utf-16be, split at character 2 
-FAIL ignoreBOM should work for encoding utf-16be, split at character 3 assert_equals: BOM should be preserved expected "abc" but got "abc"
+PASS ignoreBOM should work for encoding utf-16be, split at character 3 
 

Modified: trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/streams/decode-ignore-bom.any.worker-expected.txt (266527 => 266528)


--- trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/streams/decode-ignore-bom.any.worker-expected.txt	2020-09-03 16:33:59 UTC (rev 266527)
+++ trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/streams/decode-ignore-bom.any.worker-expected.txt	2020-09-03 16:57:49 UTC (rev 266528)
@@ -1,14 +1,14 @@
 
 PASS ignoreBOM should work for encoding utf-8, split at character 0 
-FAIL ignoreBOM should work for encoding utf-8, split at character 1 assert_equals: BOM should be stripped expected "abc" but got "abc"
-FAIL ignoreBOM should work for encoding utf-8, split at character 2 assert_equals: BOM should be stripped expected "abc" but got "abc"
+PASS ignoreBOM should work for encoding utf-8, split at character 1 
+PASS ignoreBOM should work for encoding utf-8, split at character 2 
 PASS ignoreBOM should work for encoding utf-8, split at character 3 
 PASS ignoreBOM should work for encoding utf-16le, split at character 0 
-FAIL ignoreBOM should work for encoding utf-16le, split at character 1 assert_equals: BOM should be stripped expected "abc" but got "abc"
+PASS ignoreBOM should work for encoding utf-16le, split at character 1 
 PASS ignoreBOM should work for encoding utf-16le, split at character 2 
-FAIL ignoreBOM should work for encoding utf-16le, split at character 3 assert_equals: BOM should be preserved expected "abc" but got "abc"
+PASS ignoreBOM should work for encoding utf-16le, split at character 3 
 PASS ignoreBOM should work for encoding utf-16be, split at character 0 
-FAIL ignoreBOM should work for encoding utf-16be, split at character 1 assert_equals: BOM should be stripped expected "abc" but got "abc"
+PASS ignoreBOM should work for encoding utf-16be, split at character 1 
 PASS ignoreBOM should work for encoding utf-16be, split at character 2 
-FAIL ignoreBOM should work for encoding utf-16be, split at character 3 assert_equals: BOM should be preserved expected "abc" but got "abc"
+PASS ignoreBOM should work for encoding utf-16be, split at character 3 
 

Modified: trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-copy.any-expected.txt (266527 => 266528)


--- trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-copy.any-expected.txt	2020-09-03 16:33:59 UTC (rev 266527)
+++ trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-copy.any-expected.txt	2020-09-03 16:57:49 UTC (rev 266528)
@@ -1,4 +1,4 @@
 
-FAIL Modify buffer after passing it in (ArrayBuffer) assert_equals: expected "@" but got "@"
-FAIL Modify buffer after passing it in (SharedArrayBuffer) assert_equals: expected "@" but got "@"
+PASS Modify buffer after passing it in (ArrayBuffer) 
+PASS Modify buffer after passing it in (SharedArrayBuffer) 
 

Modified: trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-copy.any.worker-expected.txt (266527 => 266528)


--- trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-copy.any.worker-expected.txt	2020-09-03 16:33:59 UTC (rev 266527)
+++ trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-copy.any.worker-expected.txt	2020-09-03 16:57:49 UTC (rev 266528)
@@ -1,4 +1,4 @@
 
-FAIL Modify buffer after passing it in (ArrayBuffer) assert_equals: expected "@" but got "@"
-FAIL Modify buffer after passing it in (SharedArrayBuffer) assert_equals: expected "@" but got "@"
+PASS Modify buffer after passing it in (ArrayBuffer) 
+PASS Modify buffer after passing it in (SharedArrayBuffer) 
 

Modified: trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-ignorebom.any-expected.txt (266527 => 266528)


--- trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-ignorebom.any-expected.txt	2020-09-03 16:33:59 UTC (rev 266527)
+++ trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-ignorebom.any-expected.txt	2020-09-03 16:57:49 UTC (rev 266528)
@@ -1,6 +1,6 @@
 
-FAIL BOM is ignored if ignoreBOM option is specified: utf-8 assert_equals: utf-8: BOM should be present in decoded string if ignored by a reused decoder expected "abc" but got "abc"
-FAIL BOM is ignored if ignoreBOM option is specified: utf-16le assert_equals: utf-16le: BOM should be present in decoded string if ignored by a reused decoder expected "abc" but got "abc"
-FAIL BOM is ignored if ignoreBOM option is specified: utf-16be assert_equals: utf-16be: BOM should be present in decoded string if ignored by a reused decoder expected "abc" but got "abc"
+PASS BOM is ignored if ignoreBOM option is specified: utf-8 
+PASS BOM is ignored if ignoreBOM option is specified: utf-16le 
+PASS BOM is ignored if ignoreBOM option is specified: utf-16be 
 PASS The ignoreBOM attribute of TextDecoder 
 

Modified: trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-ignorebom.any.worker-expected.txt (266527 => 266528)


--- trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-ignorebom.any.worker-expected.txt	2020-09-03 16:33:59 UTC (rev 266527)
+++ trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-ignorebom.any.worker-expected.txt	2020-09-03 16:57:49 UTC (rev 266528)
@@ -1,6 +1,6 @@
 
-FAIL BOM is ignored if ignoreBOM option is specified: utf-8 assert_equals: utf-8: BOM should be present in decoded string if ignored by a reused decoder expected "abc" but got "abc"
-FAIL BOM is ignored if ignoreBOM option is specified: utf-16le assert_equals: utf-16le: BOM should be present in decoded string if ignored by a reused decoder expected "abc" but got "abc"
-FAIL BOM is ignored if ignoreBOM option is specified: utf-16be assert_equals: utf-16be: BOM should be present in decoded string if ignored by a reused decoder expected "abc" but got "abc"
+PASS BOM is ignored if ignoreBOM option is specified: utf-8 
+PASS BOM is ignored if ignoreBOM option is specified: utf-16le 
+PASS BOM is ignored if ignoreBOM option is specified: utf-16be 
 PASS The ignoreBOM attribute of TextDecoder 
 

Modified: trunk/Source/WebCore/ChangeLog (266527 => 266528)


--- trunk/Source/WebCore/ChangeLog	2020-09-03 16:33:59 UTC (rev 266527)
+++ trunk/Source/WebCore/ChangeLog	2020-09-03 16:57:49 UTC (rev 266528)
@@ -1,5 +1,20 @@
 2020-09-03  Alex Christensen  <[email protected]>
 
+        TextDecoder should ignore byte-order-mark like other browsers and spec
+        https://bugs.webkit.org/show_bug.cgi?id=216108
+
+        Reviewed by Darin Adler.
+
+        Covered by newly passing web platform tests.
+
+        * dom/TextDecoder.cpp:
+        (WebCore::TextDecoder::ignoreBOMIfNecessary):
+        (WebCore::TextDecoder::decode):
+        (WebCore::TextDecoder::prependBOMIfNecessary): Deleted.
+        * dom/TextDecoder.h:
+
+2020-09-03  Alex Christensen  <[email protected]>
+
         Align ISO-8859-{3,6,7,8,8-I} and windows-{874,1253,1255,1257} encodings with Chrome, Firefox, and the specification
         https://bugs.webkit.org/show_bug.cgi?id=216094
 

Modified: trunk/Source/WebCore/dom/TextDecoder.cpp (266527 => 266528)


--- trunk/Source/WebCore/dom/TextDecoder.cpp	2020-09-03 16:33:59 UTC (rev 266527)
+++ trunk/Source/WebCore/dom/TextDecoder.cpp	2020-09-03 16:57:49 UTC (rev 266528)
@@ -48,12 +48,51 @@
 {
 }
 
-void TextDecoder::ignoreBOMIfNecessary(const uint8_t*& data, size_t& length)
+constexpr uint8_t utf8BOMBytes[3] { 0xEF, 0xBB, 0xBF };
+constexpr uint8_t utf16BEBOMBytes[2] { 0xFE, 0xFF };
+constexpr uint8_t utf16LEBOMBytes[2] { 0xFF, 0xFE };
+
+size_t TextDecoder::bytesNeededForFullBOMIgnoreCheck() const
 {
-    const uint8_t utf8BOMBytes[3] = {0xEF, 0xBB, 0xBF};
-    const uint8_t utf16BEBOMBytes[2] = {0xFE, 0xFF};
-    const uint8_t utf16LEBOMBytes[2] = {0xFF, 0xFE};
+    if (m_textEncoding == UTF8Encoding())
+        return sizeof(utf8BOMBytes);
+    if (m_textEncoding == UTF16BigEndianEncoding())
+        return sizeof(utf16BEBOMBytes);
+    if (m_textEncoding == UTF16LittleEndianEncoding())
+        return sizeof(utf16LEBOMBytes);
+    return 0;
+}
 
+bool TextDecoder::isBeginningOfIncompleteBOM(const uint8_t* bytes, size_t length) const
+{
+    if (!length)
+        return true;
+
+    if (m_textEncoding == UTF8Encoding()) {
+        if (length == 1)
+            return bytes[0] == utf8BOMBytes[0];
+        return length == 2 && bytes[0] == utf8BOMBytes[0] && bytes[1] == utf8BOMBytes[1];
+    }
+    if (m_textEncoding == UTF16BigEndianEncoding())
+        return length == 1 && bytes[0] == utf16BEBOMBytes[0];
+    if (m_textEncoding == UTF16LittleEndianEncoding())
+        return length == 1 && bytes[0] == utf16LEBOMBytes[0];
+
+    return false;
+}
+
+auto TextDecoder::ignoreBOMIfNecessary(const uint8_t*& data, size_t& length, bool stream) -> WaitForMoreBOMBytes
+{
+    if (m_bomIgnoredIfNecessary || m_options.ignoreBOM)
+        return WaitForMoreBOMBytes::No;
+
+    if (stream && length < bytesNeededForFullBOMIgnoreCheck()) {
+        if (isBeginningOfIncompleteBOM(data, length))
+            return WaitForMoreBOMBytes::Yes;
+        m_bomIgnoredIfNecessary = true;
+        return WaitForMoreBOMBytes::No;
+    }
+
     if (m_textEncoding == UTF8Encoding()
         && length >= sizeof(utf8BOMBytes)
         && data[0] == utf8BOMBytes[0]
@@ -74,18 +113,10 @@
         data += sizeof(utf16LEBOMBytes);
         length -= sizeof(utf16LEBOMBytes);
     }
+    m_bomIgnoredIfNecessary = true;
+    return WaitForMoreBOMBytes::No;
 }
 
-String TextDecoder::prependBOMIfNecessary(const String& decoded)
-{
-    if (m_hasDecoded || !m_options.ignoreBOM)
-        return decoded;
-    const UChar utf16BEBOM[2] = {0xFEFF, '\0'};
-
-    // FIXME: Make TextCodec::decode take a flag for prepending BOM so we don't need to do this extra allocation and copy.
-    return makeString(utf16BEBOM, decoded);
-}
-
 static size_t codeUnitByteSize(const TextEncoding& encoding)
 {
     return encoding.isByteBasedEncoding() ? 1 : 2;
@@ -102,14 +133,24 @@
         length = inputBuffer->length();
     }
 
-    ignoreBOMIfNecessary(data, length);
+    if (!options.stream)
+        m_bomIgnoredIfNecessary = false;
 
+    bool alreadyBuffered = false;
     if (m_buffer.size()) {
         m_buffer.append(data, length);
         data = m_buffer.data();
         length = m_buffer.size();
+        alreadyBuffered = true;
     }
 
+    if (ignoreBOMIfNecessary(data, length, options.stream) == WaitForMoreBOMBytes::Yes) {
+        ASSERT(options.stream);
+        if (!alreadyBuffered)
+            m_buffer.append(data, length);
+        return String();
+    }
+
     const bool stopOnError = true;
     bool sawError = false;
     if (length % codeUnitByteSize(m_textEncoding))
@@ -117,7 +158,7 @@
     const char* charData = reinterpret_cast<const char*>(data);
     String result;
     if (!sawError)
-        result = prependBOMIfNecessary(m_textEncoding.decode(charData, length, stopOnError, sawError));
+        result = m_textEncoding.decode(charData, length, stopOnError, sawError);
 
     if (sawError) {
         if (options.stream) {
@@ -127,12 +168,11 @@
         } else {
             if (m_options.fatal)
                 return Exception { TypeError };
-            result = prependBOMIfNecessary(m_textEncoding.decode(charData, length));
+            result = m_textEncoding.decode(charData, length);
         }
     } else
         m_buffer.clear();
 
-    m_hasDecoded = true;
     return result;
 }
 

Modified: trunk/Source/WebCore/dom/TextDecoder.h (266527 => 266528)


--- trunk/Source/WebCore/dom/TextDecoder.h	2020-09-03 16:33:59 UTC (rev 266527)
+++ trunk/Source/WebCore/dom/TextDecoder.h	2020-09-03 16:57:49 UTC (rev 266528)
@@ -50,13 +50,17 @@
     ExceptionOr<String> decode(Optional<BufferSource::VariantType>, DecodeOptions);
 
 private:
-    String prependBOMIfNecessary(const String&);
-    void ignoreBOMIfNecessary(const uint8_t*& data, size_t& length);
     TextDecoder(const char*, Options);
+
+    enum class WaitForMoreBOMBytes : bool { No, Yes };
+    WaitForMoreBOMBytes ignoreBOMIfNecessary(const uint8_t*& data, size_t& length, bool stream);
+    size_t bytesNeededForFullBOMIgnoreCheck() const;
+    bool isBeginningOfIncompleteBOM(const uint8_t*, size_t) const;
+
     TextEncoding m_textEncoding;
     Options m_options;
-    bool m_hasDecoded { false };
     Vector<uint8_t> m_buffer;
+    bool m_bomIgnoredIfNecessary { false };
 };
 
 }
_______________________________________________
webkit-changes mailing list
[email protected]
https://lists.webkit.org/mailman/listinfo/webkit-changes

Reply via email to