Author: kkrugler
Date: Fri Aug 13 17:09:54 2010
New Revision: 985288
URL: http://svn.apache.org/viewvc?rev=985288&view=rev
Log:
TIKA-457: Fix frameset handling (both general, and for broken HTML)
This is also part of TIKA-463, as it improves handling of <frame> elements:
they now appear in the output with resolved src=xxx URL attributes.
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java?rev=985288&r1=985287&r2=985288&view=diff
==============================================================================
---
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
(original)
+++
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
Fri Aug 13 17:09:54 2010
@@ -59,13 +59,13 @@ public class XHTMLContentHandler extends
* skip them if they get sent to startElement/endElement by mistake.
*/
private static final Set<String> AUTO =
- unmodifiableSet("html", "head", "body");
+ unmodifiableSet("html", "head", "body", "frameset");
/**
* The elements that get prepended with the {@link #TAB} character.
*/
private static final Set<String> INDENT =
- unmodifiableSet("li", "dd", "dt", "td", "th");
+ unmodifiableSet("li", "dd", "dt", "td", "th", "frame");
/**
* The elements that get appended with the {@link #NL} character.
@@ -93,7 +93,8 @@ public class XHTMLContentHandler extends
*/
private boolean headStarted = false;
private boolean headEnded = false;
-
+ private boolean useFrameset = false;
+
public XHTMLContentHandler(ContentHandler handler, Metadata metadata) {
super(handler);
this.metadata = metadata;
@@ -138,14 +139,15 @@ public class XHTMLContentHandler extends
* <head>
* <title>...</title>
* </head>
- * <body>
+ * <body> (or <frameset>
* </pre>
*/
- private void lazyEndHead() throws SAXException {
+ private void lazyEndHead(boolean isFrameset) throws SAXException {
lazyStartHead();
if (!headEnded) {
headEnded = true;
+ useFrameset = isFrameset;
// TIKA-478: Emit all metadata values (other than title). We have to call
// startElement() and characters() directly to avoid recursive problems.
@@ -156,7 +158,8 @@ public class XHTMLContentHandler extends
for (String value : metadata.getValues(name)) {
AttributesImpl attributes = new AttributesImpl();
- attributes.addAttribute("", name, name, "CDATA", value);
+ attributes.addAttribute("", "name", "name", "CDATA", name);
+ attributes.addAttribute("", "content", "content", "CDATA",
value);
super.startElement(XHTML, "meta", "meta", attributes);
super.endElement(XHTML, "meta", "meta");
}
@@ -172,7 +175,12 @@ public class XHTMLContentHandler extends
super.endElement(XHTML, "title", "title");
super.endElement(XHTML, "head", "head");
- super.startElement(XHTML, "body", "body", EMPTY_ATTRIBUTES);
+
+ if (useFrameset) {
+ super.startElement(XHTML, "frameset", "frameset",
EMPTY_ATTRIBUTES);
+ } else {
+ super.startElement(XHTML, "body", "body", EMPTY_ATTRIBUTES);
+ }
}
}
@@ -186,9 +194,14 @@ public class XHTMLContentHandler extends
*/
@Override
public void endDocument() throws SAXException {
- lazyEndHead();
+ lazyEndHead(useFrameset);
+
+ if (useFrameset) {
+ super.endElement(XHTML, "frameset", "frameset");
+ } else {
+ super.endElement(XHTML, "body", "body");
+ }
- super.endElement(XHTML, "body", "body");
super.endElement(XHTML, "html", "html");
endPrefixMapping("");
@@ -204,11 +217,13 @@ public class XHTMLContentHandler extends
String uri, String local, String name, Attributes attributes)
throws SAXException {
- if (!AUTO.contains(name)) {
+ if (name.equals("frameset")) {
+ lazyEndHead(true);
+ } else if (!AUTO.contains(name)) {
if (HEAD.contains(name)) {
lazyStartHead();
} else {
- lazyEndHead();
+ lazyEndHead(false);
}
if (XHTML.equals(uri) && INDENT.contains(name)) {
@@ -238,7 +253,7 @@ public class XHTMLContentHandler extends
*/
@Override
public void characters(char[] ch, int start, int length) throws
SAXException {
- lazyEndHead();
+ lazyEndHead(useFrameset);
super.characters(ch, start, length);
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java?rev=985288&r1=985287&r2=985288&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
Fri Aug 13 17:09:54 2010
@@ -62,6 +62,8 @@ public class DefaultHtmlMapper implement
// TIKA-463 - add additional elements that contain URLs
put("AREA", "area");
put("IMG", "img");
+ put("FRAMESET", "frameset");
+ put("FRAME", "frame");
}};
@@ -73,6 +75,7 @@ public class DefaultHtmlMapper implement
private static final Map<String, Set<String>> SAFE_ATTRIBUTES = new
HashMap<String, Set<String>>() {{
put("a", attrSet("rel", "name"));
put("img", attrSet("src"));
+ put("frame", attrSet("src"));
// TODO KKr - fill out this set.
}};
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=985288&r1=985287&r2=985288&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
Fri Aug 13 17:09:54 2010
@@ -79,7 +79,7 @@ class HtmlHandler extends TextContentHan
if ("TITLE".equals(name) || titleLevel > 0) {
titleLevel++;
}
- if ("BODY".equals(name) || bodyLevel > 0) {
+ if ("BODY".equals(name) || ("FRAMESET".equals(name)) || bodyLevel > 0)
{
bodyLevel++;
}
if (mapper.isDiscardElement(name) || discardLevel > 0) {
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=985288&r1=985287&r2=985288&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Fri Aug 13 17:09:54 2010
@@ -477,5 +477,88 @@ public class HtmlParserTest extends Test
assertTrue(Pattern.matches("(?s).*<img
src=\"http://domain.com/image.jpg\"/>.*$", result));
}
+ /**
+ * Test case for TIKA-463. Don't skip elements that have URLs.
+ * @see <a
href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
+ */
+ public void testFrameSrcExtraction() throws Exception {
+ final String test = "<html><head><title>Title</title>" +
+ "<base href=\"http://domain.com\" />" +
+ "</head><frameset><frame src=\"frame.html\" /></frameset></html>";
+
+ SAXTransformerFactory factory =
(SAXTransformerFactory)SAXTransformerFactory.newInstance();
+ TransformerHandler handler = factory.newTransformerHandler();
+ handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
+ handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
+ handler.getTransformer().setOutputProperty(OutputKeys.ENCODING,
"utf-8");
+ StringWriter sw = new StringWriter();
+ handler.setResult(new StreamResult(sw));
+
+ new HtmlParser().parse(
+ new ByteArrayInputStream(test.getBytes("UTF-8")),
+ handler, new Metadata(), new ParseContext());
+
+ String result = sw.toString();
+
+ // <frame> tag should exist, with fully resolved URL
+ assertTrue(Pattern.matches("(?s).*<frame
src=\"http://domain.com/frame.html\"/>.*$", result));
+ }
+
+ /**
+ * Test case for TIKA-457. Better handling for broken HTML that has <frameset> inside of <body>.
+ * @see <a
href="https://issues.apache.org/jira/browse/TIKA-457">TIKA-457</a>
+ */
+ public void testFBrokenrameset() throws Exception {
+ final String test2 = "<html><head><title> my title
</title></head><body>" +
+ "<frameset rows=\"20,*\"><frame src=\"top.html\"></frame>" +
+ "<frameset cols=\"20,*\"><frame src=\"left.html\"></frame>" +
+ "<frame src=\"invalid.html\"/></frame>" +
+ "<frame src=\"right.html\"></frame>" +
+ "</frameset></frameset></body></html>";
+
+ SAXTransformerFactory factory =
(SAXTransformerFactory)SAXTransformerFactory.newInstance();
+ TransformerHandler handler = factory.newTransformerHandler();
+ handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
+ handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
+ handler.getTransformer().setOutputProperty(OutputKeys.ENCODING,
"utf-8");
+
+ final String test1 = "<html><head><title>Title</title>" +
+ "<base href=\"http://domain.com\" />" +
+ "</head><body><frameset><frame src=\"frame.html\"
/></frameset></body></html>";
+
+ StringWriter sw1 = new StringWriter();
+ handler.setResult(new StreamResult(sw1));
+
+ new HtmlParser().parse(
+ new ByteArrayInputStream(test1.getBytes("UTF-8")),
+ handler, new Metadata(), new ParseContext());
+
+ String result = sw1.toString();
+
+ // <frame> tag should exist, with fully resolved URL
+ assertTrue(Pattern.matches("(?s).*<frame
src=\"http://domain.com/frame.html\"/>.*$", result));
+
+ // <body> tag should not exist.
+ assertFalse(Pattern.matches("(?s).*<body>.*$", result));
+
+ StringWriter sw2 = new StringWriter();
+ handler.setResult(new StreamResult(sw2));
+
+ new HtmlParser().parse(
+ new ByteArrayInputStream(test2.getBytes("UTF-8")),
+ handler, new Metadata(), new ParseContext());
+
+ result = sw2.toString();
+
+ // <frame> tags should exist, with relative URL (no base element specified)
+ assertTrue(Pattern.matches("(?s).*<frame src=\"top.html\"/>.*$",
result));
+ assertTrue(Pattern.matches("(?s).*<frame src=\"left.html\"/>.*$",
result));
+ assertTrue(Pattern.matches("(?s).*<frame src=\"invalid.html\"/>.*$",
result));
+ assertTrue(Pattern.matches("(?s).*<frame src=\"right.html\"/>.*$",
result));
+
+ // <body> tag should not exist.
+ assertFalse(Pattern.matches("(?s).*<body>.*$", result));
+ }
+
}