Author: jukka
Date: Tue Mar 18 17:20:18 2008
New Revision: 638657
URL: http://svn.apache.org/viewvc?rev=638657&view=rev
Log:
TIKA-128: HTML parser should produce XHTML SAX events
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
incubator/tika/trunk/src/test/resources/test-documents/testHTML.html
Modified: incubator/tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=638657&r1=638656&r2=638657&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Tue Mar 18 17:20:18 2008
@@ -31,6 +31,8 @@
13. TIKA-131 - Lazy XHTML prefix generation (Jukka Zitting)
+14. TIKA-128 - HTML parser should produce XHTML SAX events (Jukka Zitting)
+
Release 0.1-incubating - 12/27/2007
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=638657&r1=638656&r2=638657&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Tue Mar 18 17:20:18 2008
@@ -18,12 +18,21 @@
import java.io.IOException;
import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.HashMap;
+import java.util.Map;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.sax.ContentHandlerDecorator;
+import org.apache.tika.sax.TeeContentHandler;
+import org.apache.tika.sax.TextContentHandler;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.sax.xpath.Matcher;
+import org.apache.tika.sax.xpath.MatchingContentHandler;
+import org.apache.tika.sax.xpath.XPathParser;
import org.apache.tika.utils.Utils;
import org.cyberneko.html.parsers.SAXParser;
import org.xml.sax.Attributes;
@@ -32,97 +41,102 @@
import org.xml.sax.SAXException;
/**
- * Simple HTML parser that extracts title.
+ * HTML parser. Uses CyberNeko to turn the input document into HTML SAX events,
+ * and post-processes the events to produce XHTML and metadata expected by
+ * Tika clients.
*/
public class HtmlParser extends AbstractParser {
+ /**
+ * Set of safe mappings from incoming HTML elements to outgoing
+ * XHTML elements. Ensures that the output is valid XHTML 1.0 Strict.
+ */
+ private static final Map<String, String> SAFE_ELEMENTS =
+ new HashMap<String, String>();
+
+ static {
+ // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
+ SAFE_ELEMENTS.put("P", "p");
+ SAFE_ELEMENTS.put("H1", "h1");
+ SAFE_ELEMENTS.put("H2", "h2");
+ SAFE_ELEMENTS.put("H3", "h3");
+ SAFE_ELEMENTS.put("H4", "h4");
+ SAFE_ELEMENTS.put("H5", "h5");
+ SAFE_ELEMENTS.put("H6", "h6");
+ SAFE_ELEMENTS.put("UL", "ul");
+ SAFE_ELEMENTS.put("OL", "ol");
+ SAFE_ELEMENTS.put("LI", "li");
+ SAFE_ELEMENTS.put("DL", "dl");
+ SAFE_ELEMENTS.put("DT", "dt");
+ SAFE_ELEMENTS.put("DD", "dd");
+ SAFE_ELEMENTS.put("PRE", "pre");
+ SAFE_ELEMENTS.put("BLOCKQUOTE", "blockquote");
+ SAFE_ELEMENTS.put("TABLE", "p"); // TODO colspan/rowspan issues
+ }
+
public void parse(
InputStream stream, ContentHandler handler, Metadata metadata)
throws IOException, SAXException, TikaException {
- SAXParser parser = new SAXParser();
- parser.setContentHandler(
- new TitleExtractingContentHandler(handler, metadata));
- parser.parse(new InputSource(Utils.getUTF8Reader(
- new CloseShieldInputStream(stream), metadata)));
- }
-
- private static class TitleExtractingContentHandler extends
- ContentHandlerDecorator {
-
- private static final String TAG_TITLE = "TITLE";
-
- private static final String TAG_HEAD = "HEAD";
-
- private static final String TAG_HTML = "HTML";
-
- private Phase phase = Phase.START;
-
- private Metadata metadata;
-
- private StringBuilder title = new StringBuilder();
+ // Protect the stream from being closed by CyberNeko
+ stream = new CloseShieldInputStream(stream);
- private static enum Phase {
- START, HTML, HEAD, TITLE, IGNORE;
- }
+ // Prepare the HTML content handler that generates proper
+ // XHTML events and records relevant document metadata
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ XPathParser xpath = new XPathParser(null, "");
+ Matcher body = xpath.parse("/HTML/BODY//node()");
+ Matcher title = xpath.parse("/HTML/HEAD/TITLE//node()");
+ handler = new TeeContentHandler(
+ new MatchingContentHandler(getBodyHandler(xhtml), body),
+ new MatchingContentHandler(getTitleHandler(metadata), title));
- public TitleExtractingContentHandler(final ContentHandler handler,
- final Metadata metadata) {
- super(handler);
- this.metadata = metadata;
- }
-
- @Override
- public void startElement(String uri, String localName, String name,
- Attributes atts) throws SAXException {
+ // Parse the HTML document
+ xhtml.startDocument();
+ SAXParser parser = new SAXParser();
+ parser.setContentHandler(handler);
+ parser.parse(new InputSource(Utils.getUTF8Reader(stream, metadata)));
+ xhtml.endDocument();
+ }
- switch (phase) {
- case START:
- if (TAG_HTML.equals(localName)) {
- phase = Phase.HTML;
- }
- break;
- case HTML:
- if (TAG_HEAD.equals(localName)) {
- phase = Phase.HEAD;
- }
- break;
- case HEAD:
- if (TAG_TITLE.equals(localName)) {
- phase = Phase.TITLE;
- }
- break;
+ private ContentHandler getTitleHandler(final Metadata metadata) {
+ final StringWriter writer = new StringWriter();
+ return new WriteOutContentHandler(writer) {
+ @Override
+ public void endElement(String u, String l, String n) {
+ metadata.set(Metadata.TITLE, writer.toString());
}
- super.startElement(uri, localName, name, atts);
- }
+ };
+ }
- @Override
- public void characters(char[] ch, int start, int length)
- throws SAXException {
- switch (phase) {
- case TITLE:
- title.append(ch, start, length);
- break;
+ private ContentHandler getBodyHandler(final XHTMLContentHandler xhtml) {
+ return new TextContentHandler(xhtml) {
+ @Override
+ public void startElement(
+ String uri, String local, String name, Attributes atts)
+ throws SAXException {
+ String safe = SAFE_ELEMENTS.get(name);
+ if (safe != null) {
+ xhtml.startElement(safe);
+ } else if ("A".equals(name)) {
+ String href = atts.getValue("href");
+ if (href == null) {
+ href = "";
+ }
+ xhtml.startElement("a", "href", href);
+ }
}
- super.characters(ch, start, length);
- }
- @Override
- public void endElement(String uri, String localName, String name)
- throws SAXException {
- switch (phase) {
- case TITLE:
- if (TAG_TITLE.equals(localName)) {
- phase = Phase.IGNORE;
+ @Override
+ public void endElement(
+ String uri, String local, String name) throws SAXException
{
+ String safe = SAFE_ELEMENTS.get(name);
+ if (safe != null) {
+ xhtml.endElement(safe);
+ } else if ("A".equals(name)) {
+ xhtml.endElement("a");
}
- break;
}
- super.endElement(uri, localName, name);
- }
-
- @Override
- public void endDocument() throws SAXException {
- metadata.set(Metadata.TITLE, title.toString());
- super.endDocument();
- }
+ };
}
+
}
Modified:
incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=638657&r1=638656&r2=638657&view=diff
==============================================================================
---
incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
(original)
+++
incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Tue Mar 18 17:20:18 2008
@@ -27,6 +27,7 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.WriteOutContentHandler;
+import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
public class HtmlParserTest extends TestCase {
@@ -38,24 +39,38 @@
.getResourceAsStream(name);
}
- public void testParseAscii() throws IOException, SAXException,
- TikaException {
-
+ public void testParseAscii() throws Exception {
StringWriter writer = new StringWriter();
+ final StringWriter href = new StringWriter();
+
Metadata metadata = new Metadata();
+ parser.parse(
+ getStream("test-documents/testHTML.html"),
+ new WriteOutContentHandler(writer) {
+ @Override
+ public void startElement(
+ String uri, String local, String name,
+ Attributes attributes) {
+ if ("a".equals(local)) {
+ href.append(attributes.getValue("href"));
+ }
+ }
+ },
+ metadata);
+
+ assertEquals(
+ "Title : Test Indexation Html", metadata.get(Metadata.TITLE));
+ assertEquals("http://www.apache.org/", href.toString());
- parser.parse(getStream("test-documents/testHTML.html"),
- new WriteOutContentHandler(writer), metadata);
String content = writer.toString();
-
- assertTrue("Did not contain expected text:"
- + "Title : Test Indexation Html", content
- .contains("Title : Test Indexation Html"));
-
- assertTrue("Did not contain expected text:" + "Test Indexation Html",
+ assertTrue(
+ "Did not contain expected text: Title : Test Indexation Html",
+ content.contains("Title : Test Indexation Html"));
+ assertTrue(
+ "Did not contain expected text:" + "Test Indexation Html",
content.contains("Test Indexation Html"));
-
- assertTrue("Did not contain expected text:" + "Indexation du fichier",
+ assertTrue(
+ "Did not contain expected text:" + "Indexation du fichier",
content.contains("Indexation du fichier"));
}
Modified: incubator/tika/trunk/src/test/resources/test-documents/testHTML.html
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testHTML.html?rev=638657&r1=638656&r2=638657&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/resources/test-documents/testHTML.html
(original)
+++ incubator/tika/trunk/src/test/resources/test-documents/testHTML.html Tue
Mar 18 17:20:18 2008
@@ -4,6 +4,6 @@
</head>
<body>
<h1>Test Indexation Html</h1>
- <p>Indexation du fichier</p>
+ <p><a href="http://www.apache.org/">Indexation</a> du
fichier</p>
</body>
</html>