Author: kkrugler
Date: Thu Aug 12 21:42:55 2010
New Revision: 984997
URL: http://svn.apache.org/viewvc?rev=984997&view=rev
Log:
TIKA-478: Fix handling of <head> elements in HTML parser, and improve
robustness of XHTMLContentHandler.
Also tried to fix apparent bug w/indent & newline support in
XHTMLContentHandler.
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java?rev=984997&r1=984996&r2=984997&view=diff
==============================================================================
---
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
(original)
+++
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
Thu Aug 12 21:42:55 2010
@@ -49,6 +49,19 @@ public class XHTMLContentHandler extends
private static final char[] TAB = new char[] { '\t' };
/**
+ * The elements that are in the <head> section.
+ */
+ private static final Set<String> HEAD =
+ unmodifiableSet("title", "link", "base", "meta");
+
+ /**
+ * The elements that are automatically emitted by lazyStartHead, so
+ * skip them if they get sent to startElement/endElement by mistake.
+ */
+ private static final Set<String> AUTO =
+ unmodifiableSet("html", "head", "body");
+
+ /**
* The elements that get prepended with the {@link #TAB} character.
*/
private static final Set<String> INDENT =
@@ -62,6 +75,8 @@ public class XHTMLContentHandler extends
"pre", "hr", "blockquote", "address", "fieldset", "table", "form",
"noscript", "li", "dt", "dd", "noframes", "br", "tr");
+ private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
+
private static Set<String> unmodifiableSet(String... elements) {
return Collections.unmodifiableSet(
new HashSet<String>(Arrays.asList(elements)));
@@ -74,9 +89,10 @@ public class XHTMLContentHandler extends
private final Metadata metadata;
/**
- * Flag to indicate whether the document element has been started.
+ * Flags to indicate whether the document head element has been started/ended.
*/
- private boolean started = false;
+ private boolean headStarted = false;
+ private boolean headEnded = false;
public XHTMLContentHandler(ContentHandler handler, Metadata metadata) {
super(handler);
@@ -104,19 +120,59 @@ public class XHTMLContentHandler extends
* <body>
* </pre>
*/
- private void lazyStartDocument() throws SAXException {
- if (!started) {
- started = true;
- startElement("html");
- startElement("head");
- startElement("title");
+ private void lazyStartHead() throws SAXException {
+ if (!headStarted) {
+ headStarted = true;
+
+ // Call directly, so we don't go through our startElement(), which will
+ // ignore these elements.
+ super.startElement(XHTML, "html", "html", EMPTY_ATTRIBUTES);
+ super.startElement(XHTML, "head", "head", EMPTY_ATTRIBUTES);
+ }
+ }
+
+ /**
+ * Generates the following XHTML prefix when called for the first time:
+ * <pre>
+ * <html>
+ * <head>
+ * <title>...</title>
+ * </head>
+ * <body>
+ * </pre>
+ */
+ private void lazyEndHead() throws SAXException {
+ lazyStartHead();
+
+ if (!headEnded) {
+ headEnded = true;
+
+ // TIKA-478: Emit all metadata values (other than title). We have to call
+ // startElement() and characters() directly to avoid recursive problems.
+ for (String name : metadata.names()) {
+ if (name.equals("title")) {
+ continue;
+ }
+
+ for (String value : metadata.getValues(name)) {
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", name, name, "CDATA", value);
+ super.startElement(XHTML, "meta", "meta", attributes);
+ super.endElement(XHTML, "meta", "meta");
+ }
+ }
+
+ super.startElement(XHTML, "title", "title", EMPTY_ATTRIBUTES);
String title = metadata.get(Metadata.TITLE);
if (title != null && title.length() > 0) {
- characters(title);
+ char[] titleChars = title.toCharArray();
+ super.characters(titleChars, 0, titleChars.length);
}
- endElement("title");
- endElement("head");
- startElement("body");
+
+ super.endElement(XHTML, "title", "title");
+
+ super.endElement(XHTML, "head", "head");
+ super.startElement(XHTML, "body", "body", EMPTY_ATTRIBUTES);
}
}
@@ -130,7 +186,8 @@ public class XHTMLContentHandler extends
*/
@Override
public void endDocument() throws SAXException {
- lazyStartDocument();
+ lazyEndHead();
+
endElement("body");
endElement("html");
endPrefixMapping("");
@@ -145,11 +202,20 @@ public class XHTMLContentHandler extends
public void startElement(
String uri, String local, String name, Attributes attributes)
throws SAXException {
- lazyStartDocument();
- if (XHTML.equals(uri) && INDENT.contains(local)) {
- ignorableWhitespace(TAB, 0, TAB.length);
+
+ if (!AUTO.contains(name)) {
+ if (HEAD.contains(name)) {
+ lazyStartHead();
+ } else {
+ lazyEndHead();
+ }
+
+ if (XHTML.equals(uri) && INDENT.contains(name)) {
+ ignorableWhitespace(TAB, 0, TAB.length);
+ }
+
+ super.startElement(uri, local, name, attributes);
}
- super.startElement(uri, local, name, attributes);
}
/**
@@ -157,11 +223,12 @@ public class XHTMLContentHandler extends
* by a newline character.
*/
@Override
- public void endElement(String uri, String local, String name)
- throws SAXException {
- super.endElement(uri, local, name);
- if (XHTML.equals(uri) && ENDLINE.contains(local)) {
- newline();
+ public void endElement(String uri, String local, String name) throws
SAXException {
+ if (!AUTO.contains(name)) {
+ super.endElement(uri, local, name);
+ if (XHTML.equals(uri) && ENDLINE.contains(name)) {
+ newline();
+ }
}
}
@@ -169,16 +236,15 @@ public class XHTMLContentHandler extends
* @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a>
*/
@Override
- public void characters(char[] ch, int start, int length)
- throws SAXException {
- lazyStartDocument();
+ public void characters(char[] ch, int start, int length) throws
SAXException {
+ lazyEndHead();
super.characters(ch, start, length);
}
//------------------------------------------< public convenience methods >
public void startElement(String name) throws SAXException {
- startElement(XHTML, name, name, new AttributesImpl());
+ startElement(XHTML, name, name, EMPTY_ATTRIBUTES);
}
public void startElement(String name, String attribute, String value)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=984997&r1=984996&r2=984997&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
Thu Aug 12 21:42:55 2010
@@ -88,18 +88,20 @@ class HtmlHandler extends TextContentHan
if (bodyLevel == 0 && discardLevel == 0) {
if ("META".equals(name) && atts.getValue("content") != null) {
+
+ // TIKA-478: For cases where we have either a name or "http-equiv", assume
+ // that XHTMLContentHandler will emit these in the <head>, thus passing them
+ // through safely.
if (atts.getValue("http-equiv") != null) {
metadata.set(
atts.getValue("http-equiv"),
atts.getValue("content"));
- xhtml.startElement(uri, local, "meta", atts);
- }
- if (atts.getValue("name") != null) {
+ } else if (atts.getValue("name") != null) {
// Record the meta tag in the metadata
metadata.set(
atts.getValue("name"),
atts.getValue("content"));
- // Normalise if possible
+ // Normalize if possible
if(atts.getValue("name").equalsIgnoreCase("ICBM")) {
Matcher m = Pattern.compile(
"\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*"
@@ -109,8 +111,6 @@ class HtmlHandler extends TextContentHan
metadata.set(Metadata.LONGITUDE, m.group(2));
}
}
- // Allow downstream processing
- xhtml.startElement(uri, local, "meta", atts);
}
} else if ("BASE".equals(name) && atts.getValue("href") != null) {
metadata.set(
@@ -126,8 +126,9 @@ class HtmlHandler extends TextContentHan
String safe = mapper.mapSafeElement(name);
if (safe != null) {
// check if there are any attributes to process
- if (atts.getLength()==0) xhtml.startElement(safe);
- else {
+ if (atts.getLength() == 0) {
+ xhtml.startElement(safe);
+ } else {
AttributesImpl newAttributes = new AttributesImpl(atts);
for (int att=0;att<newAttributes.getLength();att++){
String normAttrName = mapper.mapSafeAttribute(safe,
newAttributes.getLocalName(att));
@@ -164,8 +165,6 @@ class HtmlHandler extends TextContentHan
xhtml.endElement("link");
} else if ("BASE".equals(name)) {
xhtml.endElement("base");
- } else if ("META".equals(name)) {
- xhtml.endElement("meta");
}
}
if (bodyLevel > 0 && discardLevel == 0) {
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=984997&r1=984996&r2=984997&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Thu Aug 12 21:42:55 2010
@@ -22,6 +22,7 @@ import java.io.InputStream;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;
+import java.util.regex.Pattern;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.sax.SAXTransformerFactory;
@@ -405,5 +406,45 @@ public class HtmlParserTest extends Test
}
+ /**
+ * Test case for TIKA-478. Don't emit <head> sub-elements inside of <body>.
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-478">TIKA-478</a>
+ */
+ public void testElementOrdering() throws Exception {
+ final String test = "<html><head><title>Title</title>" +
+ "<meta http-equiv=\"content-type\" content=\"text/html\">" +
+ "<link rel=\"next\" href=\"next.html\" />" +
+ "</head><body><p>Simple Content</p></body></html>";
+
+ SAXTransformerFactory factory =
(SAXTransformerFactory)SAXTransformerFactory.newInstance();
+ TransformerHandler handler = factory.newTransformerHandler();
+ handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
+ handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
+ handler.getTransformer().setOutputProperty(OutputKeys.ENCODING,
"utf-8");
+ StringWriter sw = new StringWriter();
+ handler.setResult(new StreamResult(sw));
+
+ new HtmlParser().parse(
+ new ByteArrayInputStream(test.getBytes("UTF-8")),
+ handler, new Metadata(), new ParseContext());
+
+ String result = sw.toString();
+
+ // Title element in <head> section
+
assertTrue(Pattern.matches("(?s)<html.*<head>.*<title>Title</title>.*</head>.*$",
result));
+
+ // No meta elements in body
+ assertFalse(Pattern.matches("(?s).*<body>.*<meta. *</body>.*$",
result));
+
+ // meta elements should show up in <head> section
+ assertTrue(Pattern.matches("(?s)<html.*<head>.*<meta .*</head>.*$",
result));
+
+ // No link elements in body
+ assertFalse(Pattern.matches("(?s).*<body>.*<link .*</body>.*$",
result));
+
+ // link element should be in <head> section
+ assertTrue(Pattern.matches("(?s)<html.*<head>.*<link .*</head>.*$",
result));
+ }
+
}