Author: jukka
Date: Tue Mar 18 17:20:18 2008
New Revision: 638657

URL: http://svn.apache.org/viewvc?rev=638657&view=rev
Log:
TIKA-128: HTML parser should produce XHTML SAX events

Modified:
    incubator/tika/trunk/CHANGES.txt
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
    incubator/tika/trunk/src/test/resources/test-documents/testHTML.html

Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=638657&r1=638656&r2=638657&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Tue Mar 18 17:20:18 2008
@@ -31,6 +31,8 @@
 
 13. TIKA-131 - Lazy XHTML prefix generation (Jukka Zitting)
 
+14. TIKA-128 - HTML parser should produce XHTML SAX events (Jukka Zitting)
+
 
 Release 0.1-incubating - 12/27/2007
 

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=638657&r1=638656&r2=638657&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java Tue Mar 18 17:20:18 2008
@@ -18,12 +18,21 @@
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.HashMap;
+import java.util.Map;
 
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.sax.ContentHandlerDecorator;
+import org.apache.tika.sax.TeeContentHandler;
+import org.apache.tika.sax.TextContentHandler;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.sax.xpath.Matcher;
+import org.apache.tika.sax.xpath.MatchingContentHandler;
+import org.apache.tika.sax.xpath.XPathParser;
 import org.apache.tika.utils.Utils;
 import org.cyberneko.html.parsers.SAXParser;
 import org.xml.sax.Attributes;
@@ -32,97 +41,102 @@
 import org.xml.sax.SAXException;
 
 /**
- * Simple HTML parser that extracts title.
+ * HTML parser. Uses CyberNeko to turn the input document to HTML SAX events,
+ * and post-processes the events to produce XHTML and metadata expected by
+ * Tika clients.
  */
 public class HtmlParser extends AbstractParser {
 
+    /**
+     * Set of safe mappings from incoming HTML elements to outgoing
+     * XHTML elements. Ensures that the output is valid XHTML 1.0 Strict.
+     */
+    private static final Map<String, String> SAFE_ELEMENTS =
+        new HashMap<String, String>();
+
+    static {
+        // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
+        SAFE_ELEMENTS.put("P", "p");
+        SAFE_ELEMENTS.put("H1", "h1");
+        SAFE_ELEMENTS.put("H2", "h2");
+        SAFE_ELEMENTS.put("H3", "h3");
+        SAFE_ELEMENTS.put("H4", "h4");
+        SAFE_ELEMENTS.put("H5", "h5");
+        SAFE_ELEMENTS.put("H6", "h6");
+        SAFE_ELEMENTS.put("UL", "ul");
+        SAFE_ELEMENTS.put("OL", "ol");
+        SAFE_ELEMENTS.put("LI", "li");
+        SAFE_ELEMENTS.put("DL", "dl");
+        SAFE_ELEMENTS.put("DT", "dt");
+        SAFE_ELEMENTS.put("DD", "dd");
+        SAFE_ELEMENTS.put("PRE", "pre");
+        SAFE_ELEMENTS.put("BLOCKQUOTE", "blockquote");
+        SAFE_ELEMENTS.put("TABLE", "p"); // TODO colspan/rowspan issues
+    }
+
     public void parse(
             InputStream stream, ContentHandler handler, Metadata metadata)
             throws IOException, SAXException, TikaException {
-        SAXParser parser = new SAXParser();
-        parser.setContentHandler(
-                new TitleExtractingContentHandler(handler, metadata));
-        parser.parse(new InputSource(Utils.getUTF8Reader(
-                new CloseShieldInputStream(stream), metadata)));
-    }
-
-    private static class TitleExtractingContentHandler extends
-            ContentHandlerDecorator {
-
-        private static final String TAG_TITLE = "TITLE";
-
-        private static final String TAG_HEAD = "HEAD";
-
-        private static final String TAG_HTML = "HTML";
-
-        private Phase phase = Phase.START;
-
-        private Metadata metadata;
-
-        private StringBuilder title = new StringBuilder();
+        // Protect the stream from being closed by CyberNeko
+        stream = new CloseShieldInputStream(stream);
 
-        private static enum Phase {
-            START, HTML, HEAD, TITLE, IGNORE;
-        }
+        // Prepare the HTML content handler that generates proper
+        // XHTML events and records relevant document metadata
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        XPathParser xpath = new XPathParser(null, "");
+        Matcher body = xpath.parse("/HTML/BODY//node()");
+        Matcher title = xpath.parse("/HTML/HEAD/TITLE//node()");
+        handler = new TeeContentHandler(
+                new MatchingContentHandler(getBodyHandler(xhtml), body),
+                new MatchingContentHandler(getTitleHandler(metadata), title));
 
-        public TitleExtractingContentHandler(final ContentHandler handler,
-                final Metadata metadata) {
-            super(handler);
-            this.metadata = metadata;
-        }
-
-        @Override
-        public void startElement(String uri, String localName, String name,
-                Attributes atts) throws SAXException {
+        // Parse the HTML document
+        xhtml.startDocument();
+        SAXParser parser = new SAXParser();
+        parser.setContentHandler(handler);
+        parser.parse(new InputSource(Utils.getUTF8Reader(stream, metadata)));
+        xhtml.endDocument();
+    }
 
-            switch (phase) {
-            case START:
-                if (TAG_HTML.equals(localName)) {
-                    phase = Phase.HTML;
-                }
-                break;
-            case HTML:
-                if (TAG_HEAD.equals(localName)) {
-                    phase = Phase.HEAD;
-                }
-                break;
-            case HEAD:
-                if (TAG_TITLE.equals(localName)) {
-                    phase = Phase.TITLE;
-                }
-                break;
+    private ContentHandler getTitleHandler(final Metadata metadata) {
+        final StringWriter writer = new StringWriter();
+        return new WriteOutContentHandler(writer) {
+            @Override
+            public void endElement(String u, String l, String n) {
+                metadata.set(Metadata.TITLE, writer.toString());
             }
-            super.startElement(uri, localName, name, atts);
-        }
+        };
+    }
 
-        @Override
-        public void characters(char[] ch, int start, int length)
-                throws SAXException {
-            switch (phase) {
-            case TITLE:
-                title.append(ch, start, length);
-                break;
+    private ContentHandler getBodyHandler(final XHTMLContentHandler xhtml) {
+        return new TextContentHandler(xhtml) {
+            @Override
+            public void startElement(
+                    String uri, String local, String name, Attributes atts)
+                    throws SAXException {
+                String safe = SAFE_ELEMENTS.get(name);
+                if (safe != null) {
+                    xhtml.startElement(safe);
+                } else if ("A".equals(name)) {
+                    String href = atts.getValue("href");
+                    if (href == null) {
+                        href = "";
+                    }
+                    xhtml.startElement("a", "href", href);
+                }
             }
-            super.characters(ch, start, length);
-        }
 
-        @Override
-        public void endElement(String uri, String localName, String name)
-                throws SAXException {
-            switch (phase) {
-            case TITLE:
-                if (TAG_TITLE.equals(localName)) {
-                    phase = Phase.IGNORE;
+            @Override
+            public void endElement(
+                    String uri, String local, String name) throws SAXException {
+                String safe = SAFE_ELEMENTS.get(name);
+                if (safe != null) {
+                    xhtml.endElement(safe);
+                } else if ("A".equals(name)) {
+                    xhtml.endElement("a");
                 }
-                break;
             }
-            super.endElement(uri, localName, name);
-        }
-
-        @Override
-        public void endDocument() throws SAXException {
-            metadata.set(Metadata.TITLE, title.toString());
-            super.endDocument();
-        }
+        };
     }
+
 }

Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=638657&r1=638656&r2=638657&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Tue Mar 18 17:20:18 2008
@@ -27,6 +27,7 @@
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.WriteOutContentHandler;
+import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
 
 public class HtmlParserTest extends TestCase {
@@ -38,24 +39,38 @@
                 .getResourceAsStream(name);
     }
 
-    public void testParseAscii() throws IOException, SAXException,
-            TikaException {
-
+    public void testParseAscii() throws Exception {
         StringWriter writer = new StringWriter();
+        final StringWriter href = new StringWriter();
+
         Metadata metadata = new Metadata();
+        parser.parse(
+                getStream("test-documents/testHTML.html"),
+                new WriteOutContentHandler(writer) {
+                    @Override
+                    public void startElement(
+                            String uri, String local, String name,
+                            Attributes attributes) {
+                        if ("a".equals(local)) {
+                            href.append(attributes.getValue("href"));
+                        }
+                    }
+                },
+                metadata);
+
+        assertEquals(
+                "Title : Test Indexation Html", metadata.get(Metadata.TITLE));
+        assertEquals("http://www.apache.org/", href.toString());
 
-        parser.parse(getStream("test-documents/testHTML.html"),
-                new WriteOutContentHandler(writer), metadata);
         String content = writer.toString();
-
-        assertTrue("Did not contain expected text:"
-                + "Title : Test Indexation Html", content
-                .contains("Title : Test Indexation Html"));
-
-        assertTrue("Did not contain expected text:" + "Test Indexation Html",
+        assertTrue(
+                "Did not contain expected text: Title : Test Indexation Html",
+                content.contains("Title : Test Indexation Html"));
+        assertTrue(
+                "Did not contain expected text:" + "Test Indexation Html",
                 content.contains("Test Indexation Html"));
-
-        assertTrue("Did not contain expected text:" + "Indexation du fichier",
+        assertTrue(
+                "Did not contain expected text:" + "Indexation du fichier",
                 content.contains("Indexation du fichier"));
 
     }

Modified: incubator/tika/trunk/src/test/resources/test-documents/testHTML.html
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testHTML.html?rev=638657&r1=638656&r2=638657&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/resources/test-documents/testHTML.html (original)
+++ incubator/tika/trunk/src/test/resources/test-documents/testHTML.html Tue Mar 18 17:20:18 2008
@@ -4,6 +4,6 @@
        </head>
        <body>
                <h1>Test Indexation Html</h1>
-               <p>Indexation du fichier</p>
+               <p><a href="http://www.apache.org/">Indexation</a> du fichier</p>
        </body> 
 </html>


Reply via email to