Author: jukka
Date: Fri Oct 16 14:46:52 2009
New Revision: 825915
URL: http://svn.apache.org/viewvc?rev=825915&view=rev
Log:
TIKA-304: HtmlParser could be easier to subclass
Added protected mapSafeElement() and isDiscardElement() methods that should
make it easier to customize the HtmlParser behaviour.
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=825915&r1=825914&r2=825915&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
Fri Oct 16 14:46:52 2009
@@ -18,10 +18,6 @@
import java.net.MalformedURLException;
import java.net.URL;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.TextContentHandler;
@@ -32,46 +28,7 @@
class HtmlHandler extends TextContentHandler {
- /**
- * Set of safe mappings from incoming HTML elements to outgoing
- * XHTML elements. Ensures that the output is valid XHTML 1.0 Strict.
- */
- private static final Map<String, String> SAFE_ELEMENTS =
- new HashMap<String, String>();
-
- /**
- * Set of HTML elements whose content will be discarded.
- */
- private static final Set<String> DISCARD_ELEMENTS = new HashSet<String>();
-
- static {
- // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
- SAFE_ELEMENTS.put("P", "p");
- SAFE_ELEMENTS.put("H1", "h1");
- SAFE_ELEMENTS.put("H2", "h2");
- SAFE_ELEMENTS.put("H3", "h3");
- SAFE_ELEMENTS.put("H4", "h4");
- SAFE_ELEMENTS.put("H5", "h5");
- SAFE_ELEMENTS.put("H6", "h6");
- SAFE_ELEMENTS.put("UL", "ul");
- SAFE_ELEMENTS.put("OL", "ol");
- SAFE_ELEMENTS.put("LI", "li");
- SAFE_ELEMENTS.put("MENU", "ul");
- SAFE_ELEMENTS.put("DL", "dl");
- SAFE_ELEMENTS.put("DT", "dt");
- SAFE_ELEMENTS.put("DD", "dd");
- SAFE_ELEMENTS.put("PRE", "pre");
- SAFE_ELEMENTS.put("BLOCKQUOTE", "blockquote");
- SAFE_ELEMENTS.put("TABLE", "table");
- SAFE_ELEMENTS.put("THEAD", "thead");
- SAFE_ELEMENTS.put("TBODY", "tbody");
- SAFE_ELEMENTS.put("TR", "tr");
- SAFE_ELEMENTS.put("TH", "th");
- SAFE_ELEMENTS.put("TD", "td");
-
- DISCARD_ELEMENTS.add("STYLE");
- DISCARD_ELEMENTS.add("SCRIPT");
- }
+ private final HtmlParser parser;
private final XHTMLContentHandler xhtml;
@@ -85,8 +42,10 @@
private final StringBuilder title = new StringBuilder();
- private HtmlHandler(XHTMLContentHandler xhtml, Metadata metadata) {
+ private HtmlHandler(
+ HtmlParser parser, XHTMLContentHandler xhtml, Metadata metadata) {
super(xhtml);
+ this.parser = parser;
this.xhtml = xhtml;
this.metadata = metadata;
@@ -105,8 +64,9 @@
}
}
- public HtmlHandler(ContentHandler handler, Metadata metadata) {
- this(new XHTMLContentHandler(handler, metadata), metadata);
+ public HtmlHandler(
+ HtmlParser parser, ContentHandler handler, Metadata metadata) {
+ this(parser, new XHTMLContentHandler(handler, metadata), metadata);
}
@Override
@@ -119,7 +79,7 @@
if ("BODY".equals(name) || bodyLevel > 0) {
bodyLevel++;
}
- if (DISCARD_ELEMENTS.contains(name) || discardLevel > 0) {
+ if (parser.isDiscardElement(name) || discardLevel > 0) {
discardLevel++;
}
@@ -143,8 +103,9 @@
}
if (bodyLevel > 0 && discardLevel == 0) {
- if (SAFE_ELEMENTS.containsKey(name)) {
- xhtml.startElement(SAFE_ELEMENTS.get(name));
+ String safe = parser.mapSafeElement(name);
+ if (safe != null) {
+ xhtml.startElement(safe);
} else if ("A".equals(name)) {
String href = atts.getValue("href");
if (href != null) {
@@ -167,8 +128,9 @@
public void endElement(
String uri, String local, String name) throws SAXException {
if (bodyLevel > 0 && discardLevel == 0) {
- if (SAFE_ELEMENTS.containsKey(name)) {
- xhtml.endElement(SAFE_ELEMENTS.get(name));
+ String safe = parser.mapSafeElement(name);
+ if (safe != null) {
+ xhtml.endElement(safe);
} else if ("A".equals(name)) {
xhtml.endElement("a");
}
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=825915&r1=825914&r2=825915&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Fri Oct 16 14:46:52 2009
@@ -53,8 +53,8 @@
// Parse the HTML document
org.ccil.cowan.tagsoup.Parser parser =
new org.ccil.cowan.tagsoup.Parser();
- parser.setContentHandler(
- new XHTMLDowngradeHandler(new HtmlHandler(handler, metadata)));
+ parser.setContentHandler(new XHTMLDowngradeHandler(
+ new HtmlHandler(this, handler, metadata)));
parser.parse(source);
}
@@ -68,4 +68,65 @@
parse(stream, handler, metadata, context);
}
+ /**
+ * Maps "safe" HTML element names to semantic XHTML equivalents. If the
+ * given element is unknown or deemed unsafe for inclusion in the parse
+ * output, then this method returns <code>null</code> and the element
+ * will be ignored but the content inside it is still processed. See
+ * the {@link #isDiscardElement(String)} method for a way to discard
+ * the entire contents of an element.
+ * <p>
+ * Subclasses can override this method to customize the default mapping.
+ *
+ * @since Apache Tika 0.5
+ * @param name HTML element name (upper case)
+ * @return XHTML element name (lower case), or
+ * <code>null</code> if the element is unsafe
+ */
+ protected String mapSafeElement(String name) {
+ // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
+
+ if ("H1".equals(name)) return "h1";
+ if ("H2".equals(name)) return "h2";
+ if ("H3".equals(name)) return "h3";
+ if ("H4".equals(name)) return "h4";
+ if ("H5".equals(name)) return "h5";
+ if ("H6".equals(name)) return "h6";
+
+ if ("P".equals(name)) return "p";
+ if ("PRE".equals(name)) return "pre";
+ if ("BLOCKQUOTE".equals(name)) return "blockquote";
+
+ if ("UL".equals(name)) return "ul";
+ if ("OL".equals(name)) return "ol";
+ if ("MENU".equals(name)) return "ul";
+ if ("LI".equals(name)) return "li";
+ if ("DL".equals(name)) return "dl";
+ if ("DT".equals(name)) return "dt";
+ if ("DD".equals(name)) return "dd";
+
+ if ("TABLE".equals(name)) return "table";
+ if ("THEAD".equals(name)) return "thead";
+ if ("TBODY".equals(name)) return "tbody";
+ if ("TR".equals(name)) return "tr";
+ if ("TH".equals(name)) return "th";
+ if ("TD".equals(name)) return "td";
+
+ return null;
+ }
+
+ /**
+ * Checks whether all content within the given HTML element should be
+ * discarded instead of including it in the parse output. Subclasses
+ * can override this method to customize the set of discarded elements.
+ *
+ * @since Apache Tika 0.5
+ * @param name HTML element name (upper case)
+ * @return <code>true</code> if content inside the named element
+ * should be ignored, <code>false</code> otherwise
+ */
+ protected boolean isDiscardElement(String name) {
+ return "STYLE".equals(name) || "SCRIPT".equals(name);
+ }
+
}