Diff
Modified: trunk/LayoutTests/imported/w3c/ChangeLog (266527 => 266528)
--- trunk/LayoutTests/imported/w3c/ChangeLog 2020-09-03 16:33:59 UTC (rev 266527)
+++ trunk/LayoutTests/imported/w3c/ChangeLog 2020-09-03 16:57:49 UTC (rev 266528)
@@ -1,5 +1,15 @@
2020-09-03 Alex Christensen <[email protected]>
+ TextDecoder should ignore byte-order-mark like other browsers and spec
+ https://bugs.webkit.org/show_bug.cgi?id=216108
+
+ Reviewed by Darin Adler.
+
+ * web-platform-tests/encoding/streams/decode-ignore-bom.any-expected.txt:
+ * web-platform-tests/encoding/textdecoder-ignorebom.any-expected.txt:
+
+2020-09-03 Alex Christensen <[email protected]>
+
Align ISO-8859-{3,6,7,8,8-I} and windows-{874,1253,1255,1257} encodings with Chrome, Firefox, and the specification
https://bugs.webkit.org/show_bug.cgi?id=216094
Modified: trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/streams/decode-ignore-bom.any-expected.txt (266527 => 266528)
--- trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/streams/decode-ignore-bom.any-expected.txt 2020-09-03 16:33:59 UTC (rev 266527)
+++ trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/streams/decode-ignore-bom.any-expected.txt 2020-09-03 16:57:49 UTC (rev 266528)
@@ -1,14 +1,14 @@
PASS ignoreBOM should work for encoding utf-8, split at character 0
-FAIL ignoreBOM should work for encoding utf-8, split at character 1 assert_equals: BOM should be stripped expected "abc" but got "abc"
-FAIL ignoreBOM should work for encoding utf-8, split at character 2 assert_equals: BOM should be stripped expected "abc" but got "abc"
+PASS ignoreBOM should work for encoding utf-8, split at character 1
+PASS ignoreBOM should work for encoding utf-8, split at character 2
PASS ignoreBOM should work for encoding utf-8, split at character 3
PASS ignoreBOM should work for encoding utf-16le, split at character 0
-FAIL ignoreBOM should work for encoding utf-16le, split at character 1 assert_equals: BOM should be stripped expected "abc" but got "abc"
+PASS ignoreBOM should work for encoding utf-16le, split at character 1
PASS ignoreBOM should work for encoding utf-16le, split at character 2
-FAIL ignoreBOM should work for encoding utf-16le, split at character 3 assert_equals: BOM should be preserved expected "abc" but got "abc"
+PASS ignoreBOM should work for encoding utf-16le, split at character 3
PASS ignoreBOM should work for encoding utf-16be, split at character 0
-FAIL ignoreBOM should work for encoding utf-16be, split at character 1 assert_equals: BOM should be stripped expected "abc" but got "abc"
+PASS ignoreBOM should work for encoding utf-16be, split at character 1
PASS ignoreBOM should work for encoding utf-16be, split at character 2
-FAIL ignoreBOM should work for encoding utf-16be, split at character 3 assert_equals: BOM should be preserved expected "abc" but got "abc"
+PASS ignoreBOM should work for encoding utf-16be, split at character 3
Modified: trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/streams/decode-ignore-bom.any.worker-expected.txt (266527 => 266528)
--- trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/streams/decode-ignore-bom.any.worker-expected.txt 2020-09-03 16:33:59 UTC (rev 266527)
+++ trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/streams/decode-ignore-bom.any.worker-expected.txt 2020-09-03 16:57:49 UTC (rev 266528)
@@ -1,14 +1,14 @@
PASS ignoreBOM should work for encoding utf-8, split at character 0
-FAIL ignoreBOM should work for encoding utf-8, split at character 1 assert_equals: BOM should be stripped expected "abc" but got "abc"
-FAIL ignoreBOM should work for encoding utf-8, split at character 2 assert_equals: BOM should be stripped expected "abc" but got "abc"
+PASS ignoreBOM should work for encoding utf-8, split at character 1
+PASS ignoreBOM should work for encoding utf-8, split at character 2
PASS ignoreBOM should work for encoding utf-8, split at character 3
PASS ignoreBOM should work for encoding utf-16le, split at character 0
-FAIL ignoreBOM should work for encoding utf-16le, split at character 1 assert_equals: BOM should be stripped expected "abc" but got "abc"
+PASS ignoreBOM should work for encoding utf-16le, split at character 1
PASS ignoreBOM should work for encoding utf-16le, split at character 2
-FAIL ignoreBOM should work for encoding utf-16le, split at character 3 assert_equals: BOM should be preserved expected "abc" but got "abc"
+PASS ignoreBOM should work for encoding utf-16le, split at character 3
PASS ignoreBOM should work for encoding utf-16be, split at character 0
-FAIL ignoreBOM should work for encoding utf-16be, split at character 1 assert_equals: BOM should be stripped expected "abc" but got "abc"
+PASS ignoreBOM should work for encoding utf-16be, split at character 1
PASS ignoreBOM should work for encoding utf-16be, split at character 2
-FAIL ignoreBOM should work for encoding utf-16be, split at character 3 assert_equals: BOM should be preserved expected "abc" but got "abc"
+PASS ignoreBOM should work for encoding utf-16be, split at character 3
Modified: trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-copy.any-expected.txt (266527 => 266528)
--- trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-copy.any-expected.txt 2020-09-03 16:33:59 UTC (rev 266527)
+++ trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-copy.any-expected.txt 2020-09-03 16:57:49 UTC (rev 266528)
@@ -1,4 +1,4 @@
-FAIL Modify buffer after passing it in (ArrayBuffer) assert_equals: expected "@" but got "@"
-FAIL Modify buffer after passing it in (SharedArrayBuffer) assert_equals: expected "@" but got "@"
+PASS Modify buffer after passing it in (ArrayBuffer)
+PASS Modify buffer after passing it in (SharedArrayBuffer)
Modified: trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-copy.any.worker-expected.txt (266527 => 266528)
--- trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-copy.any.worker-expected.txt 2020-09-03 16:33:59 UTC (rev 266527)
+++ trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-copy.any.worker-expected.txt 2020-09-03 16:57:49 UTC (rev 266528)
@@ -1,4 +1,4 @@
-FAIL Modify buffer after passing it in (ArrayBuffer) assert_equals: expected "@" but got "@"
-FAIL Modify buffer after passing it in (SharedArrayBuffer) assert_equals: expected "@" but got "@"
+PASS Modify buffer after passing it in (ArrayBuffer)
+PASS Modify buffer after passing it in (SharedArrayBuffer)
Modified: trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-ignorebom.any-expected.txt (266527 => 266528)
--- trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-ignorebom.any-expected.txt 2020-09-03 16:33:59 UTC (rev 266527)
+++ trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-ignorebom.any-expected.txt 2020-09-03 16:57:49 UTC (rev 266528)
@@ -1,6 +1,6 @@
-FAIL BOM is ignored if ignoreBOM option is specified: utf-8 assert_equals: utf-8: BOM should be present in decoded string if ignored by a reused decoder expected "abc" but got "abc"
-FAIL BOM is ignored if ignoreBOM option is specified: utf-16le assert_equals: utf-16le: BOM should be present in decoded string if ignored by a reused decoder expected "abc" but got "abc"
-FAIL BOM is ignored if ignoreBOM option is specified: utf-16be assert_equals: utf-16be: BOM should be present in decoded string if ignored by a reused decoder expected "abc" but got "abc"
+PASS BOM is ignored if ignoreBOM option is specified: utf-8
+PASS BOM is ignored if ignoreBOM option is specified: utf-16le
+PASS BOM is ignored if ignoreBOM option is specified: utf-16be
PASS The ignoreBOM attribute of TextDecoder
Modified: trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-ignorebom.any.worker-expected.txt (266527 => 266528)
--- trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-ignorebom.any.worker-expected.txt 2020-09-03 16:33:59 UTC (rev 266527)
+++ trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-ignorebom.any.worker-expected.txt 2020-09-03 16:57:49 UTC (rev 266528)
@@ -1,6 +1,6 @@
-FAIL BOM is ignored if ignoreBOM option is specified: utf-8 assert_equals: utf-8: BOM should be present in decoded string if ignored by a reused decoder expected "abc" but got "abc"
-FAIL BOM is ignored if ignoreBOM option is specified: utf-16le assert_equals: utf-16le: BOM should be present in decoded string if ignored by a reused decoder expected "abc" but got "abc"
-FAIL BOM is ignored if ignoreBOM option is specified: utf-16be assert_equals: utf-16be: BOM should be present in decoded string if ignored by a reused decoder expected "abc" but got "abc"
+PASS BOM is ignored if ignoreBOM option is specified: utf-8
+PASS BOM is ignored if ignoreBOM option is specified: utf-16le
+PASS BOM is ignored if ignoreBOM option is specified: utf-16be
PASS The ignoreBOM attribute of TextDecoder
Modified: trunk/Source/WebCore/ChangeLog (266527 => 266528)
--- trunk/Source/WebCore/ChangeLog 2020-09-03 16:33:59 UTC (rev 266527)
+++ trunk/Source/WebCore/ChangeLog 2020-09-03 16:57:49 UTC (rev 266528)
@@ -1,5 +1,20 @@
2020-09-03 Alex Christensen <[email protected]>
+ TextDecoder should ignore byte-order-mark like other browsers and spec
+ https://bugs.webkit.org/show_bug.cgi?id=216108
+
+ Reviewed by Darin Adler.
+
+ Covered by newly passing web platform tests.
+
+ * dom/TextDecoder.cpp:
+ (WebCore::TextDecoder::ignoreBOMIfNecessary):
+ (WebCore::TextDecoder::decode):
+ (WebCore::TextDecoder::prependBOMIfNecessary): Deleted.
+ * dom/TextDecoder.h:
+
+2020-09-03 Alex Christensen <[email protected]>
+
Align ISO-8859-{3,6,7,8,8-I} and windows-{874,1253,1255,1257} encodings with Chrome, Firefox, and the specification
https://bugs.webkit.org/show_bug.cgi?id=216094
Modified: trunk/Source/WebCore/dom/TextDecoder.cpp (266527 => 266528)
--- trunk/Source/WebCore/dom/TextDecoder.cpp 2020-09-03 16:33:59 UTC (rev 266527)
+++ trunk/Source/WebCore/dom/TextDecoder.cpp 2020-09-03 16:57:49 UTC (rev 266528)
@@ -48,12 +48,51 @@
{
}
-void TextDecoder::ignoreBOMIfNecessary(const uint8_t*& data, size_t& length)
+constexpr uint8_t utf8BOMBytes[3] { 0xEF, 0xBB, 0xBF };
+constexpr uint8_t utf16BEBOMBytes[2] { 0xFE, 0xFF };
+constexpr uint8_t utf16LEBOMBytes[2] { 0xFF, 0xFE };
+
+size_t TextDecoder::bytesNeededForFullBOMIgnoreCheck() const
{
- const uint8_t utf8BOMBytes[3] = {0xEF, 0xBB, 0xBF};
- const uint8_t utf16BEBOMBytes[2] = {0xFE, 0xFF};
- const uint8_t utf16LEBOMBytes[2] = {0xFF, 0xFE};
+ if (m_textEncoding == UTF8Encoding())
+ return sizeof(utf8BOMBytes);
+ if (m_textEncoding == UTF16BigEndianEncoding())
+ return sizeof(utf16BEBOMBytes);
+ if (m_textEncoding == UTF16LittleEndianEncoding())
+ return sizeof(utf16LEBOMBytes);
+ return 0;
+}
+bool TextDecoder::isBeginningOfIncompleteBOM(const uint8_t* bytes, size_t length) const
+{
+ if (!length)
+ return true;
+
+ if (m_textEncoding == UTF8Encoding()) {
+ if (length == 1)
+ return bytes[0] == utf8BOMBytes[0];
+ return length == 2 && bytes[0] == utf8BOMBytes[0] && bytes[1] == utf8BOMBytes[1];
+ }
+ if (m_textEncoding == UTF16BigEndianEncoding())
+ return length == 1 && bytes[0] == utf16BEBOMBytes[0];
+ if (m_textEncoding == UTF16LittleEndianEncoding())
+ return length == 1 && bytes[0] == utf16LEBOMBytes[0];
+
+ return false;
+}
+
+auto TextDecoder::ignoreBOMIfNecessary(const uint8_t*& data, size_t& length, bool stream) -> WaitForMoreBOMBytes
+{
+ if (m_bomIgnoredIfNecessary || m_options.ignoreBOM)
+ return WaitForMoreBOMBytes::No;
+
+ if (stream && length < bytesNeededForFullBOMIgnoreCheck()) {
+ if (isBeginningOfIncompleteBOM(data, length))
+ return WaitForMoreBOMBytes::Yes;
+ m_bomIgnoredIfNecessary = true;
+ return WaitForMoreBOMBytes::No;
+ }
+
if (m_textEncoding == UTF8Encoding()
&& length >= sizeof(utf8BOMBytes)
&& data[0] == utf8BOMBytes[0]
@@ -74,18 +113,10 @@
data += sizeof(utf16LEBOMBytes);
length -= sizeof(utf16LEBOMBytes);
}
+ m_bomIgnoredIfNecessary = true;
+ return WaitForMoreBOMBytes::No;
}
-String TextDecoder::prependBOMIfNecessary(const String& decoded)
-{
- if (m_hasDecoded || !m_options.ignoreBOM)
- return decoded;
- const UChar utf16BEBOM[2] = {0xFEFF, '\0'};
-
- // FIXME: Make TextCodec::decode take a flag for prepending BOM so we don't need to do this extra allocation and copy.
- return makeString(utf16BEBOM, decoded);
-}
-
static size_t codeUnitByteSize(const TextEncoding& encoding)
{
return encoding.isByteBasedEncoding() ? 1 : 2;
@@ -102,14 +133,24 @@
length = inputBuffer->length();
}
- ignoreBOMIfNecessary(data, length);
+ if (!options.stream)
+ m_bomIgnoredIfNecessary = false;
+ bool alreadyBuffered = false;
if (m_buffer.size()) {
m_buffer.append(data, length);
data = m_buffer.data();
length = m_buffer.size();
+ alreadyBuffered = true;
}
+ if (ignoreBOMIfNecessary(data, length, options.stream) == WaitForMoreBOMBytes::Yes) {
+ ASSERT(options.stream);
+ if (!alreadyBuffered)
+ m_buffer.append(data, length);
+ return String();
+ }
+
const bool stopOnError = true;
bool sawError = false;
if (length % codeUnitByteSize(m_textEncoding))
@@ -117,7 +158,7 @@
const char* charData = reinterpret_cast<const char*>(data);
String result;
if (!sawError)
- result = prependBOMIfNecessary(m_textEncoding.decode(charData, length, stopOnError, sawError));
+ result = m_textEncoding.decode(charData, length, stopOnError, sawError);
if (sawError) {
if (options.stream) {
@@ -127,12 +168,11 @@
} else {
if (m_options.fatal)
return Exception { TypeError };
- result = prependBOMIfNecessary(m_textEncoding.decode(charData, length));
+ result = m_textEncoding.decode(charData, length);
}
} else
m_buffer.clear();
- m_hasDecoded = true;
return result;
}
Modified: trunk/Source/WebCore/dom/TextDecoder.h (266527 => 266528)
--- trunk/Source/WebCore/dom/TextDecoder.h 2020-09-03 16:33:59 UTC (rev 266527)
+++ trunk/Source/WebCore/dom/TextDecoder.h 2020-09-03 16:57:49 UTC (rev 266528)
@@ -50,13 +50,17 @@
ExceptionOr<String> decode(Optional<BufferSource::VariantType>, DecodeOptions);
private:
- String prependBOMIfNecessary(const String&);
- void ignoreBOMIfNecessary(const uint8_t*& data, size_t& length);
TextDecoder(const char*, Options);
+
+ enum class WaitForMoreBOMBytes : bool { No, Yes };
+ WaitForMoreBOMBytes ignoreBOMIfNecessary(const uint8_t*& data, size_t& length, bool stream);
+ size_t bytesNeededForFullBOMIgnoreCheck() const;
+ bool isBeginningOfIncompleteBOM(const uint8_t*, size_t) const;
+
TextEncoding m_textEncoding;
Options m_options;
- bool m_hasDecoded { false };
Vector<uint8_t> m_buffer;
+ bool m_bomIgnoredIfNecessary { false };
};
}