Author: jukka
Date: Fri Oct 16 12:22:23 2009
New Revision: 825863
URL: http://svn.apache.org/viewvc?rev=825863&view=rev
Log:
TIKA-287: HtmlParser should resolve relative paths in <a href="xxx"> elements
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=825863&r1=825862&r2=825863&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java Fri Oct 16 12:22:23 2009
@@ -16,6 +16,8 @@
*/
package org.apache.tika.parser.html;
+import java.net.MalformedURLException;
+import java.net.URL;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
@@ -81,12 +83,26 @@
private int titleLevel = 0;
- private StringBuilder title = new StringBuilder();
+ private final StringBuilder title = new StringBuilder();
private HtmlHandler(XHTMLContentHandler xhtml, Metadata metadata) {
super(xhtml);
this.xhtml = xhtml;
this.metadata = metadata;
+
+ // Try to determine the default base URL, if one has not been given
+ if (metadata.get(Metadata.CONTENT_LOCATION) == null) {
+ String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+ if (name != null) {
+ name = name.trim();
+ try {
+ new URL(name); // test URL format
+ metadata.set(Metadata.CONTENT_LOCATION, name);
+ } catch (MalformedURLException e) {
+ // The resource name is not a valid URL, ignore it
+ }
+ }
+ }
}
public HtmlHandler(ContentHandler handler, Metadata metadata) {
@@ -107,13 +123,22 @@
discardLevel++;
}
- if (bodyLevel == 0 && discardLevel == 0 && "META".equals(name)) {
- String content = atts.getValue("content");
- if (atts.getValue("http-equiv") != null && content != null) {
- metadata.set(atts.getValue("http-equiv"), content);
- }
- if (atts.getValue("name") != null && content != null) {
- metadata.set(atts.getValue("name"), content);
+ if (bodyLevel == 0 && discardLevel == 0) {
+ if ("META".equals(name) && atts.getValue("content") != null) {
+ if (atts.getValue("http-equiv") != null) {
+ metadata.set(
+ atts.getValue("http-equiv"),
+ atts.getValue("content"));
+ }
+ if (atts.getValue("name") != null) {
+ metadata.set(
+ atts.getValue("name"),
+ atts.getValue("content"));
+ }
+ } else if ("BASE".equals(name) && atts.getValue("href") != null) {
+ metadata.set(
+ Metadata.CONTENT_LOCATION,
+ resolve(atts.getValue("href").trim()));
}
}
@@ -123,11 +148,11 @@
} else if ("A".equals(name)) {
String href = atts.getValue("href");
if (href != null) {
- xhtml.startElement("a", "href", href);
+ xhtml.startElement("a", "href", resolve(href.trim()));
} else {
String anchor = atts.getValue("name");
if (anchor != null) {
- xhtml.startElement("a", "name", anchor);
+ xhtml.startElement("a", "name", anchor.trim());
} else {
xhtml.startElement("a");
}
@@ -182,4 +207,42 @@
}
}
+ private String resolve(String url) {
+ // Return the URL as-is if no base URL is available
+ if (metadata.get(Metadata.CONTENT_LOCATION) == null) {
+ return url;
+ }
+
+ // Check for common non-hierarchical and pseudo URI prefixes
+ String lower = url.toLowerCase();
+ if (lower.startsWith("urn:")
+ || lower.startsWith("mailto:")
+ || lower.startsWith("tel:")
+ || lower.startsWith("data:")
+ || lower.startsWith("javascript:")
+ || lower.startsWith("about:")) {
+ return url;
+ }
+
+ try {
+ URL base = new URL(metadata.get(Metadata.CONTENT_LOCATION).trim());
+
+ // We need to handle one special case, where the relativeUrl is
+ // just a query string (like "?pid=1"), and the baseUrl doesn't
+ // end with a '/'. In that case, the URL class removes the last
+ // portion of the path, which we don't want.
+ String path = base.getPath();
+ if (url.startsWith("?") && path.length() > 0 && !path.endsWith("/")) {
+ return new URL(
+ base.getProtocol(), base.getHost(), base.getPort(),
+ base.getPath() + url).toExternalForm();
+ } else {
+ return new URL(base, url).toExternalForm();
+ }
+ } catch (MalformedURLException e) {
+ // Unknown or broken format; just return the URL as received.
+ return url;
+ }
+ }
+
}
\ No newline at end of file
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=825863&r1=825862&r2=825863&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Fri Oct 16 12:22:23 2009
@@ -25,7 +25,6 @@
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
@@ -51,14 +50,11 @@
source.setEncoding(encoding);
}
- // Prepare the HTML content handler that generates proper
- // XHTML events to records relevant document metadata
- handler = new HtmlHandler(handler, metadata);
-
// Parse the HTML document
org.ccil.cowan.tagsoup.Parser parser =
new org.ccil.cowan.tagsoup.Parser();
- parser.setContentHandler(new XHTMLDowngradeHandler(handler));
+ parser.setContentHandler(
+ new XHTMLDowngradeHandler(new HtmlHandler(handler, metadata)));
parser.parse(source);
}
Modified:
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=825863&r1=825862&r2=825863&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Fri Oct 16 12:22:23 2009
@@ -20,7 +20,9 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
+import java.util.ArrayList;
import java.util.HashMap;
+import java.util.List;
import junit.framework.TestCase;
@@ -136,6 +138,74 @@
}
/**
+ * Test case for TIKA-287
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-287">TIKA-287</a>
+ */
+ public void testBaseHref() throws Exception {
+ assertRelativeLink(
+ "http://lucene.apache.org/tika/",
+ "http://lucene.apache.org/", "tika/");
+
+ assertRelativeLink(
+ "http://domain.com/?pid=1",
+ "http://domain.com", "?pid=1");
+ assertRelativeLink(
+ "http://domain.com/?pid=2",
+ "http://domain.com?pid=1", "?pid=2");
+
+ assertRelativeLink(
+ "http://domain.com/file.html",
+ "http://domain.com/path/", "/file.html");
+ assertRelativeLink(
+ "http://domain.com/path/file.html",
+ "http://domain.com/path/", "./file.html");
+ assertRelativeLink(
+ "http://domain.com/path/file.html",
+ "http://domain.com/path/", "file.html");
+
+ assertRelativeLink(
+ "http://domain2.com/newpath",
+ "http://domain.com/path/to/file", "http://domain2.com/newpath");
+
+ // See http://www.communities.hp.com/securitysoftware/blogs/jeff/archive/2007/12/19/RFC-1808-vs-2396-vs-3986_3A00_-Browsers-vs.-programing-languages.aspx
+ // Also http://www.ietf.org/rfc/rfc3986.txt
+ // Also http://issues.apache.org/jira/browse/NUTCH-566
+ // Also http://issues.apache.org/jira/browse/NUTCH-436
+ assertRelativeLink(
+ "http://domain.com/path/?pid=1",
+ "http://domain.com/path/", "?pid=1");
+ assertRelativeLink(
+ "http://domain.com/file?pid=1",
+ "http://domain.com/file", "?pid=1");
+ assertRelativeLink(
+ "http://domain.com/path/d;p?pid=1",
+ "http://domain.com/path/d;p?q#f", "?pid=1");
+ }
+
+ private void assertRelativeLink(String url, String base, String relative)
+ throws Exception {
+ String test =
+ "<html><head><base href=\"" + base + "\"></head>"
+ + "<body><a href=\"" + relative + "\">test</a></body></html>";
+ final List<String> links = new ArrayList<String>();
+ new HtmlParser().parse(
+ new ByteArrayInputStream(test.getBytes("UTF-8")),
+ new DefaultHandler() {
+ @Override
+ public void startElement(
+ String u, String l, String name, Attributes atts) {
+ if (atts.getValue("href") != null) {
+ links.add(atts.getValue("href"));
+ }
+ }
+ },
+ new Metadata(),
+ new HashMap<String, Object>());
+ assertEquals(1, links.size());
+ assertEquals(url, links.get(0));
+ }
+
+ /**
* Test case for TIKA-268
* @see <a href="https://issues.apache.org/jira/browse/TIKA-268">TIKA-268</a>
*/