Title: [91025] trunk/Source/WebCore
Revision
91025
Author
jp...@apple.com
Date
2011-07-14 13:08:08 -0700 (Thu, 14 Jul 2011)

Log Message

Character reference parser for new XML parser
https://bugs.webkit.org/show_bug.cgi?id=64398

Reviewed by Adam Barth.

Refactored out the HTML entity parser and added a common template for parsing character references.

* WebCore.xcodeproj/project.pbxproj:
* html/parser/HTMLEntityParser.cpp:
(WebCore::consumeHTMLEntity):
* xml/parser/CharacterReferenceParserInlineMethods.h: Copied from Source/WebCore/html/parser/HTMLEntityParser.cpp.
(WebCore::isHexDigit):
(WebCore::unconsumeCharacters):
(WebCore::consumeCharacterReference):
* xml/parser/XMLCharacterReferenceParser.cpp: Added.
(WebCore::consumeXMLCharacterReference):
* xml/parser/XMLCharacterReferenceParser.h: Added.

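Modified Paths

- trunk/Source/WebCore/ChangeLog
- trunk/Source/WebCore/WebCore.xcodeproj/project.pbxproj
- trunk/Source/WebCore/html/parser/HTMLEntityParser.cpp

Added Paths

- trunk/Source/WebCore/xml/parser/CharacterReferenceParserInlineMethods.h
- trunk/Source/WebCore/xml/parser/XMLCharacterReferenceParser.cpp
- trunk/Source/WebCore/xml/parser/XMLCharacterReferenceParser.h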

Diff

Modified: trunk/Source/WebCore/ChangeLog (91024 => 91025)


--- trunk/Source/WebCore/ChangeLog	2011-07-14 19:51:40 UTC (rev 91024)
+++ trunk/Source/WebCore/ChangeLog	2011-07-14 20:08:08 UTC (rev 91025)
@@ -1,3 +1,23 @@
+2011-07-14  Jeffrey Pfau  <jp...@apple.com>
+
+        Character reference parser for new XML parser
+        https://bugs.webkit.org/show_bug.cgi?id=64398
+
+        Reviewed by Adam Barth.
+
+        Refactored out the HTML entity parser and added a common template for parsing character references.
+
+        * WebCore.xcodeproj/project.pbxproj:
+        * html/parser/HTMLEntityParser.cpp:
+        (WebCore::consumeHTMLEntity):
+        * xml/parser/CharacterReferenceParserInlineMethods.h: Copied from Source/WebCore/html/parser/HTMLEntityParser.cpp.
+        (WebCore::isHexDigit):
+        (WebCore::unconsumeCharacters):
+        (WebCore::consumeCharacterReference):
+        * xml/parser/XMLCharacterReferenceParser.cpp: Added.
+        (WebCore::consumeXMLCharacterReference):
+        * xml/parser/XMLCharacterReferenceParser.h: Added.
+
 2011-07-14  Ilya Tikhonovsky  <loi...@chromium.org>
 
         Reviewed by Pavel Feldman.

Modified: trunk/Source/WebCore/WebCore.xcodeproj/project.pbxproj (91024 => 91025)


--- trunk/Source/WebCore/WebCore.xcodeproj/project.pbxproj	2011-07-14 19:51:40 UTC (rev 91024)
+++ trunk/Source/WebCore/WebCore.xcodeproj/project.pbxproj	2011-07-14 20:08:08 UTC (rev 91025)
@@ -55,6 +55,9 @@
 		00B9318C13BA8DCC0035A948 /* XMLDocumentParserScope.h in Headers */ = {isa = PBXBuildFile; fileRef = 00B9318613BA867F0035A948 /* XMLDocumentParserScope.h */; };
 		00CA93B213C6691600F7FE95 /* NewXMLDocumentParser.h in Headers */ = {isa = PBXBuildFile; fileRef = 00CA93B113C6691600F7FE95 /* NewXMLDocumentParser.h */; };
 		00CA93B513C6697C00F7FE95 /* NewXMLDocumentParser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 00CA93B413C6697C00F7FE95 /* NewXMLDocumentParser.cpp */; };
+		00D0464A13C4D14500326FCC /* XMLCharacterReferenceParser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 00D0464813C4D14500326FCC /* XMLCharacterReferenceParser.cpp */; };
+		00D0464B13C4D14500326FCC /* XMLCharacterReferenceParser.h in Headers */ = {isa = PBXBuildFile; fileRef = 00D0464913C4D14500326FCC /* XMLCharacterReferenceParser.h */; };
+		00022E6913CE1BBA00282D5B /* CharacterReferenceParserInlineMethods.h in Headers */ = {isa = PBXBuildFile; fileRef = 00022E6813CE1BBA00282D5B /* CharacterReferenceParserInlineMethods.h */; };
 		052BFCE9128ABF1500FD338D /* GeolocationClientMock.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 052BFCE8128ABF1500FD338D /* GeolocationClientMock.cpp */; };
 		052BFCEB128ABF2100FD338D /* GeolocationClientMock.h in Headers */ = {isa = PBXBuildFile; fileRef = 052BFCEA128ABF2100FD338D /* GeolocationClientMock.h */; settings = {ATTRIBUTES = (Private, ); }; };
 		05FD69E012845D4300B2BEB3 /* DOMTimeStamp.h in Headers */ = {isa = PBXBuildFile; fileRef = 05FD69DF12845D4300B2BEB3 /* DOMTimeStamp.h */; settings = {ATTRIBUTES = (Private, ); }; };
@@ -6411,6 +6414,9 @@
 		00B9318613BA867F0035A948 /* XMLDocumentParserScope.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = XMLDocumentParserScope.h; sourceTree = "<group>"; };
 		00CA93B113C6691600F7FE95 /* NewXMLDocumentParser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = NewXMLDocumentParser.h; sourceTree = "<group>"; };
 		00CA93B413C6697C00F7FE95 /* NewXMLDocumentParser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = NewXMLDocumentParser.cpp; sourceTree = "<group>"; };
+		00D0464813C4D14500326FCC /* XMLCharacterReferenceParser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = XMLCharacterReferenceParser.cpp; sourceTree = "<group>"; };
+		00D0464913C4D14500326FCC /* XMLCharacterReferenceParser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = XMLCharacterReferenceParser.h; sourceTree = "<group>"; };
+		00022E6813CE1BBA00282D5B /* CharacterReferenceParserInlineMethods.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CharacterReferenceParserInlineMethods.h; sourceTree = "<group>"; };
 		052BFCE8128ABF1500FD338D /* GeolocationClientMock.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = GeolocationClientMock.cpp; path = mock/GeolocationClientMock.cpp; sourceTree = "<group>"; };
 		052BFCEA128ABF2100FD338D /* GeolocationClientMock.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = GeolocationClientMock.h; path = mock/GeolocationClientMock.h; sourceTree = "<group>"; };
 		05FD69DF12845D4300B2BEB3 /* DOMTimeStamp.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DOMTimeStamp.h; sourceTree = "<group>"; };
@@ -12697,8 +12703,11 @@
 		00B9318013BA867F0035A948 /* parser */ = {
 			isa = PBXGroup;
 			children = (
+				00022E6813CE1BBA00282D5B /* CharacterReferenceParserInlineMethods.h */,
 				00CA93B413C6697C00F7FE95 /* NewXMLDocumentParser.cpp */,
 				00CA93B113C6691600F7FE95 /* NewXMLDocumentParser.h */,
+				00D0464813C4D14500326FCC /* XMLCharacterReferenceParser.cpp */,
+				00D0464913C4D14500326FCC /* XMLCharacterReferenceParser.h */,
 				00B9318113BA867F0035A948 /* XMLDocumentParser.cpp */,
 				00B9318213BA867F0035A948 /* XMLDocumentParser.h */,
 				00B9318313BA867F0035A948 /* XMLDocumentParserLibxml2.cpp */,
@@ -20278,6 +20287,7 @@
 				6550B69E099DF0270090D781 /* CDATASection.h in Headers */,
 				514185EE0CD65F0400763C99 /* ChangeVersionWrapper.h in Headers */,
 				6550B6A0099DF0270090D781 /* CharacterData.h in Headers */,
+				00022E6913CE1BBA00282D5B /* CharacterReferenceParserInlineMethods.h in Headers */,
 				B2C3DA2A0D006C1D00EF6F26 /* CharsetData.h in Headers */,
 				F55B3DB21251F12D003EF269 /* CheckboxInputType.h in Headers */,
 				A00B721A11DE6428008AB9FF /* CheckedInt.h in Headers */,
@@ -23048,6 +23058,7 @@
 				93309E24099E64920056E581 /* WrapContentsInDummySpanCommand.h in Headers */,
 				9BAF3B2412C1A39800014BF1 /* WritingDirection.h in Headers */,
 				6565820209D1508D000E61D7 /* XLinkNames.h in Headers */,
+				00D0464B13C4D14500326FCC /* XMLCharacterReferenceParser.h in Headers */,
 				00B9318813BA8DBA0035A948 /* XMLDocumentParser.h in Headers */,
 				00B9318C13BA8DCC0035A948 /* XMLDocumentParserScope.h in Headers */,
 				59C28046138DC2410079B7E2 /* XMLErrors.h in Headers */,
@@ -25817,6 +25828,7 @@
 				5112247810CFB8F4008099D7 /* WorkerThreadableWebSocketChannel.cpp in Sources */,
 				93309E23099E64920056E581 /* WrapContentsInDummySpanCommand.cpp in Sources */,
 				A833C7CC0A2CF07400D57664 /* XLinkNames.cpp in Sources */,
+				00D0464A13C4D14500326FCC /* XMLCharacterReferenceParser.cpp in Sources */,
 				00B9318713BA8DB30035A948 /* XMLDocumentParser.cpp in Sources */,
 				00B9318913BA8DBC0035A948 /* XMLDocumentParserLibxml2.cpp in Sources */,
 				00B9318B13BA8DC90035A948 /* XMLDocumentParserScope.cpp in Sources */,

Modified: trunk/Source/WebCore/html/parser/HTMLEntityParser.cpp (91024 => 91025)


--- trunk/Source/WebCore/html/parser/HTMLEntityParser.cpp	2011-07-14 19:51:40 UTC (rev 91024)
+++ trunk/Source/WebCore/html/parser/HTMLEntityParser.cpp	2011-07-14 20:08:08 UTC (rev 91025)
@@ -28,6 +28,7 @@
 #include "config.h"
 #include "HTMLEntityParser.h"
 
+#include "CharacterReferenceParserInlineMethods.h"
 #include "HTMLEntitySearch.h"
 #include "HTMLEntityTable.h"
 #include <wtf/Vector.h>
@@ -45,211 +46,104 @@
     0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
 };
 
-inline UChar adjustEntity(UChar32 value)
-{
-    if ((value & ~0x1F) != 0x0080)
-        return value;
-    return windowsLatin1ExtensionArray[value - 0x80];
-}
-
-inline UChar32 legalEntityFor(UChar32 value)
-{
-    // FIXME: A number of specific entity values generate parse errors.
-    if (value == 0 || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))
-        return 0xFFFD;
-    if (U_IS_BMP(value))
-        return adjustEntity(value);
-    return value;
-}
-
-inline bool convertToUTF16(UChar32 value, Vector<UChar, 16>& decodedEntity)
-{
-    if (U_IS_BMP(value)) {
-        UChar character = static_cast<UChar>(value);
-        ASSERT(character == value);
-        decodedEntity.append(character);
-        return true;
-    }
-    decodedEntity.append(U16_LEAD(value));
-    decodedEntity.append(U16_TRAIL(value));
-    return true;
-}
-
-inline bool isHexDigit(UChar cc)
-{
-    return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F');
-}
-
 inline bool isAlphaNumeric(UChar cc)
 {
     return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z');
 }
 
-void unconsumeCharacters(SegmentedString& source, const Vector<UChar, 10>& consumedCharacters)
-{
-    if (consumedCharacters.size() == 1)
-        source.push(consumedCharacters[0]);
-    else if (consumedCharacters.size() == 2) {
-        source.push(consumedCharacters[0]);
-        source.push(consumedCharacters[1]);
-    } else
-        source.prepend(SegmentedString(String(consumedCharacters.data(), consumedCharacters.size())));
-}
+class HTMLEntityParser {
+public:
+    inline static UChar adjustEntity(UChar32 value)
+    {
+        if ((value & ~0x1F) != 0x0080)
+            return value;
+        return windowsLatin1ExtensionArray[value - 0x80];
+    }
 
-}
+    inline static UChar32 legalEntityFor(UChar32 value)
+    {
+        // FIXME: A number of specific entity values generate parse errors.
+        if (!value || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))
+            return 0xFFFD;
+        if (U_IS_BMP(value))
+            return adjustEntity(value);
+        return value;
+    }
 
-bool consumeHTMLEntity(SegmentedString& source, Vector<UChar, 16>& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
-{
-    ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>');
-    ASSERT(!notEnoughCharacters);
-    ASSERT(decodedEntity.isEmpty());
+    inline static bool convertToUTF16(UChar32 value, Vector<UChar, 16>& decodedEntity)
+    {
+        if (U_IS_BMP(value)) {
+            UChar character = static_cast<UChar>(value);
+            ASSERT(character == value);
+            decodedEntity.append(character);
+            return true;
+        }
+        decodedEntity.append(U16_LEAD(value));
+        decodedEntity.append(U16_TRAIL(value));
+        return true;
+    }
 
-    enum EntityState {
-        Initial,
-        Number,
-        MaybeHexLowerCaseX,
-        MaybeHexUpperCaseX,
-        Hex,
-        Decimal,
-        Named
-    };
-    EntityState entityState = Initial;
-    UChar32 result = 0;
-    Vector<UChar, 10> consumedCharacters;
+    inline static bool acceptMalformed() { return true; }
 
-    while (!source.isEmpty()) {
-        UChar cc = *source;
-        switch (entityState) {
-        case Initial: {
-            if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&')
-                return false;
-            if (additionalAllowedCharacter && cc == additionalAllowedCharacter)
-                return false;
-            if (cc == '#') {
-                entityState = Number;
+    inline static bool consumeNamedEntity(SegmentedString& source, Vector<UChar, 16>& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc)
+    {
+        Vector<UChar, 10> consumedCharacters;
+        HTMLEntitySearch entitySearch;
+        while (!source.isEmpty()) {
+            cc = *source;
+            entitySearch.advance(cc);
+            if (!entitySearch.isEntityPrefix())
                 break;
-            }
-            if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
-                entityState = Named;
-                continue;
-            }
-            return false;
+            consumedCharacters.append(cc);
+            source.advanceAndASSERT(cc);
         }
-        case Number: {
-            if (cc == 'x') {
-                entityState = MaybeHexLowerCaseX;
-                break;
-            }
-            if (cc == 'X') {
-                entityState = MaybeHexUpperCaseX;
-                break;
-            }
-            if (cc >= '0' && cc <= '9') {
-                entityState = Decimal;
-                continue;
-            }
-            source.push('#');
+        notEnoughCharacters = source.isEmpty();
+        if (notEnoughCharacters) {
+            // We can't decide on an entity because there might be a longer entity
+            // that we could match if we had more data.
+            unconsumeCharacters(source, consumedCharacters);
             return false;
         }
-        case MaybeHexLowerCaseX: {
-            if (isHexDigit(cc)) {
-                entityState = Hex;
-                continue;
-            }
-            source.push('#');
-            source.push('x');
+        if (!entitySearch.mostRecentMatch()) {
+            ASSERT(!entitySearch.currentValue());
+            unconsumeCharacters(source, consumedCharacters);
             return false;
         }
-        case MaybeHexUpperCaseX: {
-            if (isHexDigit(cc)) {
-                entityState = Hex;
-                continue;
-            }
-            source.push('#');
-            source.push('X');
-            return false;
-        }
-        case Hex: {
-            if (cc >= '0' && cc <= '9')
-                result = result * 16 + cc - '0';
-            else if (cc >= 'a' && cc <= 'f')
-                result = result * 16 + 10 + cc - 'a';
-            else if (cc >= 'A' && cc <= 'F')
-                result = result * 16 + 10 + cc - 'A';
-            else {
-                if (cc == ';')
-                    source.advanceAndASSERT(cc);
-                return convertToUTF16(legalEntityFor(result), decodedEntity);
-            }
-            break;
-        }
-        case Decimal: {
-            if (cc >= '0' && cc <= '9')
-                result = result * 10 + cc - '0';
-            else {
-                if (cc == ';')
-                    source.advanceAndASSERT(cc);
-                return convertToUTF16(legalEntityFor(result), decodedEntity);
-            }
-            break;
-        }
-        case Named: {
-            HTMLEntitySearch entitySearch;
-            while (!source.isEmpty()) {
+        if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
+            // We've consumed too many characters. We need to walk the
+            // source back to the point at which we had consumed an
+            // actual entity.
+            unconsumeCharacters(source, consumedCharacters);
+            consumedCharacters.clear();
+            const int length = entitySearch.mostRecentMatch()->length;
+            const UChar* reference = entitySearch.mostRecentMatch()->entity;
+            for (int i = 0; i < length; ++i) {
                 cc = *source;
-                entitySearch.advance(cc);
-                if (!entitySearch.isEntityPrefix())
-                    break;
+                ASSERT_UNUSED(reference, cc == *reference++);
                 consumedCharacters.append(cc);
                 source.advanceAndASSERT(cc);
+                ASSERT(!source.isEmpty());
             }
-            notEnoughCharacters = source.isEmpty();
-            if (notEnoughCharacters) {
-                // We can't an entity because there might be a longer entity
-                // that we could match if we had more data.
-                unconsumeCharacters(source, consumedCharacters);
-                return false;
-            }
-            if (!entitySearch.mostRecentMatch()) {
-                ASSERT(!entitySearch.currentValue());
-                unconsumeCharacters(source, consumedCharacters);
-                return false;
-            }
-            if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
-                // We've consumed too many characters.  We need to walk the
-                // source back to the point at which we had consumed an
-                // actual entity.
-                unconsumeCharacters(source, consumedCharacters);
-                consumedCharacters.clear();
-                const int length = entitySearch.mostRecentMatch()->length;
-                const UChar* reference = entitySearch.mostRecentMatch()->entity;
-                for (int i = 0; i < length; ++i) {
-                    cc = *source;
-                    ASSERT_UNUSED(reference, cc == *reference++);
-                    consumedCharacters.append(cc);
-                    source.advanceAndASSERT(cc);
-                    ASSERT(!source.isEmpty());
-                }
-                cc = *source;
-            }
-            if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
-                || !additionalAllowedCharacter
-                || !(isAlphaNumeric(cc) || cc == '=')) {
-                return convertToUTF16(entitySearch.mostRecentMatch()->value, decodedEntity);
-            }
-            unconsumeCharacters(source, consumedCharacters);
-            return false;
+            cc = *source;
         }
+        if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
+            || !additionalAllowedCharacter
+            || !(isAlphaNumeric(cc) || cc == '=')) {
+            return convertToUTF16(entitySearch.mostRecentMatch()->value, decodedEntity);
         }
-        consumedCharacters.append(cc);
-        source.advanceAndASSERT(cc);
+        unconsumeCharacters(source, consumedCharacters);
+        return false;
     }
-    ASSERT(source.isEmpty());
-    notEnoughCharacters = true;
-    unconsumeCharacters(source, consumedCharacters);
-    return false;
+};
+
 }
 
+
+bool consumeHTMLEntity(SegmentedString& source, Vector<UChar, 16>& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
+{
+    return consumeCharacterReference<HTMLEntityParser>(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter);
+}
+
 UChar decodeNamedEntity(const char* name)
 {
     HTMLEntitySearch search;

Added: trunk/Source/WebCore/xml/parser/CharacterReferenceParserInlineMethods.h (0 => 91025)


--- trunk/Source/WebCore/xml/parser/CharacterReferenceParserInlineMethods.h	                        (rev 0)
+++ trunk/Source/WebCore/xml/parser/CharacterReferenceParserInlineMethods.h	2011-07-14 20:08:08 UTC (rev 91025)
@@ -0,0 +1,169 @@
+/*
+ * Copyright (C) 2008 Apple Inc. All Rights Reserved.
+ * Copyright (C) 2010 Google, Inc. All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef CharacterReferenceParserInlineMethods_h
+#define CharacterReferenceParserInlineMethods_h
+
+#include <wtf/Vector.h>
+
+namespace WebCore {
+
+inline bool isHexDigit(UChar cc)
+{
+    return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F');
+}
+
+inline void unconsumeCharacters(SegmentedString& source, const Vector<UChar, 10>& consumedCharacters)
+{
+    if (consumedCharacters.size() == 1)
+        source.push(consumedCharacters[0]);
+    else if (consumedCharacters.size() == 2) {
+        source.push(consumedCharacters[0]);
+        source.push(consumedCharacters[1]);
+    } else
+        source.prepend(SegmentedString(String(consumedCharacters.data(), consumedCharacters.size())));
+}
+
+template <typename ParserFunctions>
+bool consumeCharacterReference(SegmentedString& source, Vector<UChar, 16>& decodedCharacter, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
+{
+    ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>');
+    ASSERT(!notEnoughCharacters);
+    ASSERT(decodedCharacter.isEmpty());
+    
+    enum EntityState {
+        Initial,
+        Number,
+        MaybeHexLowerCaseX,
+        MaybeHexUpperCaseX,
+        Hex,
+        Decimal,
+        Named
+    };
+    EntityState entityState = Initial;
+    UChar32 result = 0;
+    Vector<UChar, 10> consumedCharacters;
+    
+    while (!source.isEmpty()) {
+        UChar cc = *source;
+        switch (entityState) {
+        case Initial: {
+            if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&')
+                return false;
+            if (additionalAllowedCharacter && cc == additionalAllowedCharacter)
+                return false;
+            if (cc == '#') {
+                entityState = Number;
+                break;
+            }
+            if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
+                entityState = Named;
+                continue;
+            }
+            return false;
+        }
+        case Number: {
+            if (cc == 'x') {
+                entityState = MaybeHexLowerCaseX;
+                break;
+            }
+            if (cc == 'X') {
+                entityState = MaybeHexUpperCaseX;
+                break;
+            }
+            if (cc >= '0' && cc <= '9') {
+                entityState = Decimal;
+                continue;
+            }
+            source.push('#');
+            return false;
+        }
+        case MaybeHexLowerCaseX: {
+            if (isHexDigit(cc)) {
+                entityState = Hex;
+                continue;
+            }
+            source.push('#');
+            source.push('x');
+            return false;
+        }
+        case MaybeHexUpperCaseX: {
+            if (isHexDigit(cc)) {
+                entityState = Hex;
+                continue;
+            }
+            source.push('#');
+            source.push('X');
+            return false;
+        }
+        case Hex: {
+            if (cc >= '0' && cc <= '9')
+                result = result * 16 + cc - '0';
+            else if (cc >= 'a' && cc <= 'f')
+                result = result * 16 + 10 + cc - 'a';
+            else if (cc >= 'A' && cc <= 'F')
+                result = result * 16 + 10 + cc - 'A';
+            else if (cc == ';') {
+                source.advanceAndASSERT(cc);
+                return ParserFunctions::convertToUTF16(ParserFunctions::legalEntityFor(result), decodedCharacter);
+            } else if (ParserFunctions::acceptMalformed())
+                return ParserFunctions::convertToUTF16(ParserFunctions::legalEntityFor(result), decodedCharacter);
+            else {
+                unconsumeCharacters(source, consumedCharacters);
+                return false;
+            }
+            break;
+        }
+        case Decimal: {
+            if (cc >= '0' && cc <= '9')
+                result = result * 10 + cc - '0';
+            else if (cc == ';') {
+                source.advanceAndASSERT(cc);
+                return ParserFunctions::convertToUTF16(ParserFunctions::legalEntityFor(result), decodedCharacter);
+            } else if (ParserFunctions::acceptMalformed())
+                return ParserFunctions::convertToUTF16(ParserFunctions::legalEntityFor(result), decodedCharacter);
+            else {
+                unconsumeCharacters(source, consumedCharacters);
+                return false;
+            }
+            break;
+        }
+        case Named: {
+            return ParserFunctions::consumeNamedEntity(source, decodedCharacter, notEnoughCharacters, additionalAllowedCharacter, cc);
+        }
+        }
+        consumedCharacters.append(cc);
+        source.advanceAndASSERT(cc);
+    }
+    ASSERT(source.isEmpty());
+    notEnoughCharacters = true;
+    unconsumeCharacters(source, consumedCharacters);
+    return false;
+}
+
+}
+
+#endif // CharacterReferenceParserInlineMethods_h

Added: trunk/Source/WebCore/xml/parser/XMLCharacterReferenceParser.cpp (0 => 91025)


--- trunk/Source/WebCore/xml/parser/XMLCharacterReferenceParser.cpp	                        (rev 0)
+++ trunk/Source/WebCore/xml/parser/XMLCharacterReferenceParser.cpp	2011-07-14 20:08:08 UTC (rev 91025)
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2008 Apple Inc. All Rights Reserved.
+ * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
+ * Copyright (C) 2010 Google, Inc. All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "config.h"
+#include "XMLCharacterReferenceParser.h"
+
+using namespace WTF;
+
+#include "CharacterReferenceParserInlineMethods.h"
+
+namespace WebCore {
+
+namespace {
+
+class XMLCharacterReferenceParser {
+public:
+    inline static UChar32 legalEntityFor(UChar32 value)
+    {
+        // FIXME: A number of specific entity values generate parse errors.
+        if (!value || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))
+        return 0xFFFD;
+        return value;
+    }
+
+    inline static bool convertToUTF16(UChar32 value, Vector<UChar, 16>& decodedCharacter)
+    {
+        if (U_IS_BMP(value)) {
+        UChar character = static_cast<UChar>(value);
+        ASSERT(character == value);
+        decodedCharacter.append(character);
+        return true;
+        }
+        decodedCharacter.append(U16_LEAD(value));
+        decodedCharacter.append(U16_TRAIL(value));
+        return true;
+    }
+
+    inline static bool acceptMalformed() { return false; }
+
+    inline static bool consumeNamedEntity(SegmentedString&, Vector<UChar, 16>&, bool&, UChar, UChar&)
+    {
+        ASSERT_NOT_REACHED();
+        return false;
+    }
+};
+
+}
+
+bool consumeXMLCharacterReference(SegmentedString& source, Vector<UChar, 16>& decodedCharacter, bool& notEnoughCharacters)
+{
+    return consumeCharacterReference<XMLCharacterReferenceParser>(source, decodedCharacter, notEnoughCharacters, 0);
+}
+
+} // namespace WebCore

Added: trunk/Source/WebCore/xml/parser/XMLCharacterReferenceParser.h (0 => 91025)


--- trunk/Source/WebCore/xml/parser/XMLCharacterReferenceParser.h	                        (rev 0)
+++ trunk/Source/WebCore/xml/parser/XMLCharacterReferenceParser.h	2011-07-14 20:08:08 UTC (rev 91025)
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2008 Apple Inc. All Rights Reserved.
+ * Copyright (C) 2010 Google, Inc. All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef XMLCharacterReferenceParser_h
+#define XMLCharacterReferenceParser_h
+
+#include "SegmentedString.h"
+
+namespace WebCore {
+    
+bool consumeXMLCharacterReference(SegmentedString&, Vector<UChar, 16>& decodedCharacter, bool& notEnoughCharacters);
+
+}
+
+#endif // XMLCharacterReferenceParser_h
_______________________________________________
webkit-changes mailing list
webkit-changes@lists.webkit.org
http://lists.webkit.org/mailman/listinfo.cgi/webkit-changes

Reply via email to