Author: jukka
Date: Tue Apr 8 08:59:54 2008
New Revision: 645982
URL: http://svn.apache.org/viewvc?rev=645982&view=rev
Log:
TIKA-138: Ignore HTML style and script content
- Added a set of elements whose content is discarded (currently style and script)
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Modified: incubator/tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=645982&r1=645981&r2=645982&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Tue Apr 8 08:59:54 2008
@@ -41,6 +41,9 @@
17. TIKA-134 - mvn package does not produce packages for bin/src
(Karl Heinz Marbaise)
+18. TIKA-138 - Ignore HTML style and script content (Jukka Zitting)
+
+
Release 0.1-incubating - 12/27/2007
1. TIKA-5 - Port Metadata Framework from Nutch (mattmann)
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=645982&r1=645981&r2=645982&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Tue Apr 8 08:59:54 2008
@@ -20,7 +20,9 @@
import java.io.InputStream;
import java.io.StringWriter;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.Map;
+import java.util.Set;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
@@ -54,6 +56,11 @@
private static final Map<String, String> SAFE_ELEMENTS =
new HashMap<String, String>();
+ /**
+ * Set of HTML elements whose content will be discarded.
+ */
+ private static final Set<String> DISCARD_ELEMENTS = new HashSet<String>();
+
static {
// Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
SAFE_ELEMENTS.put("P", "p");
@@ -72,6 +79,9 @@
SAFE_ELEMENTS.put("PRE", "pre");
SAFE_ELEMENTS.put("BLOCKQUOTE", "blockquote");
SAFE_ELEMENTS.put("TABLE", "p"); // TODO colspan/rowspan issues
+
+ DISCARD_ELEMENTS.add("STYLE");
+ DISCARD_ELEMENTS.add("SCRIPT");
}
public void parse(
@@ -110,13 +120,19 @@
private ContentHandler getBodyHandler(final XHTMLContentHandler xhtml) {
return new TextContentHandler(xhtml) {
+
+ private int discardLevel = 0;
+
@Override
public void startElement(
String uri, String local, String name, Attributes atts)
throws SAXException {
- String safe = SAFE_ELEMENTS.get(name);
- if (safe != null) {
- xhtml.startElement(safe);
+ if (discardLevel != 0) {
+ discardLevel++;
+ } else if (DISCARD_ELEMENTS.contains(name)) {
+ discardLevel = 1;
+ } else if (SAFE_ELEMENTS.containsKey(name)) {
+ xhtml.startElement(SAFE_ELEMENTS.get(name));
} else if ("A".equals(name)) {
String href = atts.getValue("href");
if (href == null) {
@@ -129,13 +145,31 @@
@Override
public void endElement(
String uri, String local, String name) throws SAXException
{
- String safe = SAFE_ELEMENTS.get(name);
- if (safe != null) {
- xhtml.endElement(safe);
+ if (discardLevel != 0) {
+ discardLevel--;
+ } else if (SAFE_ELEMENTS.containsKey(name)) {
+ xhtml.endElement(SAFE_ELEMENTS.get(name));
} else if ("A".equals(name)) {
xhtml.endElement("a");
}
}
+
+ @Override
+ public void characters(char[] ch, int start, int length)
+ throws SAXException {
+ if (discardLevel == 0) {
+ super.characters(ch, start, length);
+ }
+ }
+
+ @Override
+ public void ignorableWhitespace(char[] ch, int start, int length)
+ throws SAXException {
+ if (discardLevel == 0) {
+ super.ignorableWhitespace(ch, start, length);
+ }
+ }
+
};
}