Skip to site navigation (Press enter)

[webkit-changes] [123011] trunk/Source/WebCore

msaboff Wed, 18 Jul 2012 13:28:59 -0700

Title: [123011] trunk/Source/WebCore

Revision: 123011
Author: msab...@apple.com
Date: 2012-07-18 13:28:39 -0700 (Wed, 18 Jul 2012)

Log Message

Make TextCodecUTF8 handle 8 bit data without converting to UChar's
https://bugs.webkit.org/show_bug.cgi?id=90320


Reviewed by Oliver Hunt.

Change UTF8 Codec to produce 8-bit strings when data fits in 8-bit range.
First we try decoding the string as all 8-bit and then fall back to 16-bit
when we find the first character that doesn't fit in 8 bits.  Then we take
the already decoded data and copy / convert it to a 16-bit buffer and then
continue processing the rest of the stream as 16-bit.

No new tests, no change in functionality.

* platform/text/TextCodecUTF8.cpp:
(WebCore::TextCodecUTF8::handleError):
(WebCore::TextCodecUTF8::decode):
* platform/text/TextCodecUTF8.h:
(TextCodecUTF8):

Modified Paths

trunk/Source/WebCore/ChangeLog
trunk/Source/WebCore/platform/text/TextCodecUTF8.cpp
trunk/Source/WebCore/platform/text/TextCodecUTF8.h

Diff

Modified: trunk/Source/WebCore/ChangeLog (123010 => 123011)


--- trunk/Source/WebCore/ChangeLog	2012-07-18 20:27:23 UTC (rev 123010)
+++ trunk/Source/WebCore/ChangeLog	2012-07-18 20:28:39 UTC (rev 123011)
@@ -1,3 +1,24 @@
+2012-07-18  Michael Saboff  <msab...@apple.com>
+
+        Make TextCodecUTF8 handle 8 bit data without converting to UChar's
+        https://bugs.webkit.org/show_bug.cgi?id=90320
+
+        Reviewed by Oliver Hunt.
+
+        Change UTF8 Codec to produce 8-bit strings when data fits in 8-bit range.
+        First we try decoding the string as all 8-bit and then fall back to 16-bit
+        when we find the first character that doesn't fit in 8 bits.  Then we take
+        the already decoded data and copy / convert it to a 16-bit buffer and then
+        continue processing the rest of the stream as 16-bit.
+
+        No new tests, no change in functionality.
+
+        * platform/text/TextCodecUTF8.cpp:
+        (WebCore::TextCodecUTF8::handleError):
+        (WebCore::TextCodecUTF8::decode):
+        * platform/text/TextCodecUTF8.h:
+        (TextCodecUTF8):
+
 2012-07-18  Sailesh Agrawal  <s...@chromium.org>
 
         Chromium Mac: Add TEXTURE_RECTANGLE_ARB support to CCVideoLayerImpl

Modified: trunk/Source/WebCore/platform/text/TextCodecUTF8.cpp (123010 => 123011)


--- trunk/Source/WebCore/platform/text/TextCodecUTF8.cpp	2012-07-18 20:27:23 UTC (rev 123010)
+++ trunk/Source/WebCore/platform/text/TextCodecUTF8.cpp	2012-07-18 20:28:39 UTC (rev 123011)
@@ -167,7 +167,8 @@
     consumePartialSequenceByte();
 }
 
-void TextCodecUTF8::handlePartialSequence(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
+template <>
+bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool, bool&)
 {
     ASSERT(m_partialSequenceSize);
     do {
@@ -177,10 +178,53 @@
             continue;
         }
         int count = nonASCIISequenceLength(m_partialSequence[0]);
+        if (!count)
+            return true;
+
+        if (count > m_partialSequenceSize) {
+            if (count - m_partialSequenceSize > end - source) {
+                if (!flush) {
+                    // The new data is not enough to complete the sequence, so
+                    // add it to the existing partial sequence.
+                    memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
+                    m_partialSequenceSize += end - source;
+                    return false;
+                }
+                // An incomplete partial sequence at the end is an error, but it will create
+                // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle
+                // the error.
+                return true;
+            }
+            memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
+            source += count - m_partialSequenceSize;
+            m_partialSequenceSize = count;
+        }
+        int character = decodeNonASCIISequence(m_partialSequence, count);
+        if ((character == nonCharacter) || (character > 0xff))
+            return true;
+
+        m_partialSequenceSize -= count;
+        *destination++ = character;
+    } while (m_partialSequenceSize);
+
+    return false;
+}
+
+template <>
+bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
+{
+    ASSERT(m_partialSequenceSize);
+    do {
+        if (isASCII(m_partialSequence[0])) {
+            *destination++ = m_partialSequence[0];
+            consumePartialSequenceByte();
+            continue;
+        }
+        int count = nonASCIISequenceLength(m_partialSequence[0]);
         if (!count) {
             handleError(destination, stopOnError, sawError);
             if (stopOnError)
-                return;
+                return false;
             continue;
         }
         if (count > m_partialSequenceSize) {
@@ -190,12 +234,12 @@
                     // add it to the existing partial sequence.
                     memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
                     m_partialSequenceSize += end - source;
-                    return;
+                    return false;
                 }
                 // An incomplete partial sequence at the end is an error.
                 handleError(destination, stopOnError, sawError);
                 if (stopOnError)
-                    return;
+                    return false;
                 continue;
             }
             memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
@@ -206,34 +250,40 @@
         if (character == nonCharacter) {
             handleError(destination, stopOnError, sawError);
             if (stopOnError)
-                return;
+                return false;
             continue;
         }
+
         m_partialSequenceSize -= count;
         destination = appendCharacter(destination, character);
     } while (m_partialSequenceSize);
+
+    return false;
 }
-
+    
 String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
 {
     // Each input byte might turn into a character.
     // That includes all bytes in the partial-sequence buffer because
     // each byte in an invalid sequence will turn into a replacement character.
-    StringBuffer<UChar> buffer(m_partialSequenceSize + length);
+    StringBuffer<LChar> buffer(m_partialSequenceSize + length);
 
     const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
     const uint8_t* end = source + length;
     const uint8_t* alignedEnd = alignToMachineWord(end);
-    UChar* destination = buffer.characters();
+    LChar* destination = buffer.characters();
 
     do {
         if (m_partialSequenceSize) {
             // Explicitly copy destination and source pointers to avoid taking pointers to the
             // local variables, which may harm code generation by disabling some optimizations
             // in some compilers.
-            UChar* destinationForHandlePartialSequence = destination;
+            LChar* destinationForHandlePartialSequence = destination;
             const uint8_t* sourceForHandlePartialSequence = source;
-            handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);
+            if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) {
+                source = sourceForHandlePartialSequence;
+                goto upConvertTo16Bit;
+            }
             destination = destinationForHandlePartialSequence;
             source = sourceForHandlePartialSequence;
             if (m_partialSequenceSize)
@@ -279,19 +329,96 @@
                 sawError = true;
                 if (stopOnError)
                     break;
-                // Each error generates a replacement character and consumes one byte.
-                *destination++ = replacementCharacter;
-                ++source;
-                continue;
+                
+                goto upConvertTo16Bit;
             }
+            if (character > 0xff)
+                goto upConvertTo16Bit;
+
             source += count;
-            destination = appendCharacter(destination, character);
+            *destination++ = character;
         }
     } while (flush && m_partialSequenceSize);
 
     buffer.shrink(destination - buffer.characters());
 
     return String::adopt(buffer);
+
+upConvertTo16Bit:
+    StringBuffer<UChar> buffer16(m_partialSequenceSize + length);
+
+    UChar* destination16 = buffer16.characters();
+
+    // Copy the already converted characters
+    for (LChar* converted8 = buffer.characters(); converted8 < destination;)
+        *destination16++ = *converted8++;
+
+    do {
+        if (m_partialSequenceSize) {
+            // Explicitly copy destination and source pointers to avoid taking pointers to the
+            // local variables, which may harm code generation by disabling some optimizations
+            // in some compilers.
+            UChar* destinationForHandlePartialSequence = destination16;
+            const uint8_t* sourceForHandlePartialSequence = source;
+            handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);
+            destination16 = destinationForHandlePartialSequence;
+            source = sourceForHandlePartialSequence;
+            if (m_partialSequenceSize)
+                break;
+        }
+        
+        while (source < end) {
+            if (isASCII(*source)) {
+                // Fast path for ASCII. Most UTF-8 text will be ASCII.
+                if (isAlignedToMachineWord(source)) {
+                    while (source < alignedEnd) {
+                        MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
+                        if (!isAllASCII<LChar>(chunk))
+                            break;
+                        copyASCIIMachineWord(destination16, source);
+                        source += sizeof(MachineWord);
+                        destination16 += sizeof(MachineWord);
+                    }
+                    if (source == end)
+                        break;
+                    if (!isASCII(*source))
+                        continue;
+                }
+                *destination16++ = *source++;
+                continue;
+            }
+            int count = nonASCIISequenceLength(*source);
+            int character;
+            if (!count)
+                character = nonCharacter;
+            else {
+                if (count > end - source) {
+                    ASSERT(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
+                    ASSERT(!m_partialSequenceSize);
+                    m_partialSequenceSize = end - source;
+                    memcpy(m_partialSequence, source, m_partialSequenceSize);
+                    source = end;
+                    break;
+                }
+                character = decodeNonASCIISequence(source, count);
+            }
+            if (character == nonCharacter) {
+                sawError = true;
+                if (stopOnError)
+                    break;
+                // Each error generates a replacement character and consumes one byte.
+                *destination16++ = replacementCharacter;
+                ++source;
+                continue;
+            }
+            source += count;
+            destination16 = appendCharacter(destination16, character);
+        }
+    } while (flush && m_partialSequenceSize);
+    
+    buffer16.shrink(destination16 - buffer16.characters());
+    
+    return String::adopt(buffer16);
 }
 
 CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling)

Modified: trunk/Source/WebCore/platform/text/TextCodecUTF8.h (123010 => 123011)


--- trunk/Source/WebCore/platform/text/TextCodecUTF8.h	2012-07-18 20:27:23 UTC (rev 123010)
+++ trunk/Source/WebCore/platform/text/TextCodecUTF8.h	2012-07-18 20:28:39 UTC (rev 123011)
@@ -42,7 +42,8 @@
     virtual String decode(const char*, size_t length, bool flush, bool stopOnError, bool& sawError);
     virtual CString encode(const UChar*, size_t length, UnencodableHandling);
 
-    void handlePartialSequence(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError);
+    template <typename CharType>
+    bool handlePartialSequence(CharType*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError);
     void handleError(UChar*& destination, bool stopOnError, bool& sawError);
     void consumePartialSequenceByte();

_______________________________________________
webkit-changes mailing list
webkit-changes@lists.webkit.org
http://lists.webkit.org/mailman/listinfo/webkit-changes