Author: jukka
Date: Fri Oct 16 12:22:23 2009
New Revision: 825863

URL: http://svn.apache.org/viewvc?rev=825863&view=rev
Log:
TIKA-287: HtmlParser should resolve relative paths in <a href="xxx"> elements

Modified:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
    lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=825863&r1=825862&r2=825863&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java Fri Oct 16 12:22:23 2009
@@ -16,6 +16,8 @@
  */
 package org.apache.tika.parser.html;
 
+import java.net.MalformedURLException;
+import java.net.URL;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;
@@ -81,12 +83,26 @@
 
     private int titleLevel = 0;
 
-    private StringBuilder title = new StringBuilder();
+    private final StringBuilder title = new StringBuilder();
 
     private HtmlHandler(XHTMLContentHandler xhtml, Metadata metadata) {
         super(xhtml);
         this.xhtml = xhtml;
         this.metadata = metadata;
+
+        // Try to determine the default base URL, if one has not been given
+        if (metadata.get(Metadata.CONTENT_LOCATION) == null) {
+            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+            if (name != null) {
+                name = name.trim();
+                try {
+                    new URL(name); // test URL format
+                    metadata.set(Metadata.CONTENT_LOCATION, name);
+                } catch (MalformedURLException e) {
+                    // The resource name is not a valid URL, ignore it
+                }
+            }
+        }
     }
 
     public HtmlHandler(ContentHandler handler, Metadata metadata) {
@@ -107,13 +123,22 @@
             discardLevel++;
         }
 
-        if (bodyLevel == 0 && discardLevel == 0 && "META".equals(name)) {
-            String content = atts.getValue("content");
-            if (atts.getValue("http-equiv") != null && content != null) {
-                metadata.set(atts.getValue("http-equiv"), content);
-            }
-            if (atts.getValue("name") != null && content != null) {
-                metadata.set(atts.getValue("name"), content);
+        if (bodyLevel == 0 && discardLevel == 0) {
+            if ("META".equals(name) && atts.getValue("content") != null) {
+                if (atts.getValue("http-equiv") != null) {
+                    metadata.set(
+                            atts.getValue("http-equiv"),
+                            atts.getValue("content"));
+                }
+                if (atts.getValue("name") != null) {
+                    metadata.set(
+                            atts.getValue("name"),
+                            atts.getValue("content"));
+                }
+            } else if ("BASE".equals(name) && atts.getValue("href") != null) {
+                metadata.set(
+                        Metadata.CONTENT_LOCATION,
+                        resolve(atts.getValue("href").trim()));
             }
         }
 
@@ -123,11 +148,11 @@
             } else if ("A".equals(name)) {
                 String href = atts.getValue("href");
                 if (href != null) {
-                    xhtml.startElement("a", "href", href);
+                    xhtml.startElement("a", "href", resolve(href.trim()));
                 } else {
                     String anchor = atts.getValue("name");
                     if (anchor != null) {
-                        xhtml.startElement("a", "name", anchor);
+                        xhtml.startElement("a", "name", anchor.trim());
                     } else {
                         xhtml.startElement("a");
                     }
@@ -182,4 +207,42 @@
         }
     }
 
+    private String resolve(String url) {
+        // Return the URL as-is if no base URL is available
+        if (metadata.get(Metadata.CONTENT_LOCATION) == null) {
+            return url;
+        }
+
+        // Check for common non-hierarchical and pseudo URI prefixes
+        String lower = url.toLowerCase();
+        if (lower.startsWith("urn:")
+                || lower.startsWith("mailto:")
+                || lower.startsWith("tel:")
+                || lower.startsWith("data:")
+                || lower.startsWith("javascript:")
+                || lower.startsWith("about:")) {
+            return url;
+        }
+
+        try {
+            URL base = new URL(metadata.get(Metadata.CONTENT_LOCATION).trim());
+
+            // We need to handle one special case, where the relativeUrl is
+            // just a query string (like "?pid=1"), and the baseUrl doesn't
+            // end with a '/'. In that case, the URL class removes the last
+            // portion of the path, which we don't want.
+            String path = base.getPath();
+            if (url.startsWith("?") && path.length() > 0 && !path.endsWith("/")) {
+                return new URL(
+                        base.getProtocol(), base.getHost(), base.getPort(),
+                        base.getPath() + url).toExternalForm();
+            } else {
+                return new URL(base, url).toExternalForm();
+            }
+        } catch (MalformedURLException e) {
+            // Unknown or broken format; just return the URL as received.
+            return url;
+        }
+    }
+
 }
\ No newline at end of file

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=825863&r1=825862&r2=825863&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Fri Oct 16 12:22:23 2009
@@ -25,7 +25,6 @@
 import org.apache.tika.io.CloseShieldInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
@@ -51,14 +50,11 @@
             source.setEncoding(encoding);
         }
 
-        // Prepare the HTML content handler that generates proper
-        // XHTML events to records relevant document metadata
-        handler = new HtmlHandler(handler, metadata);
-
         // Parse the HTML document
         org.ccil.cowan.tagsoup.Parser parser =
             new org.ccil.cowan.tagsoup.Parser();
-        parser.setContentHandler(new XHTMLDowngradeHandler(handler));
+        parser.setContentHandler(
+                new XHTMLDowngradeHandler(new HtmlHandler(handler, metadata)));
         parser.parse(source);
     }
 

Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=825863&r1=825862&r2=825863&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Fri Oct 16 12:22:23 2009
@@ -20,7 +20,9 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.StringWriter;
+import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.List;
 
 import junit.framework.TestCase;
 
@@ -136,6 +138,74 @@
     }
 
     /**
+     * Test case for TIKA-287
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-287">TIKA-287</a>
+     */
+    public void testBaseHref() throws Exception {
+        assertRelativeLink(
+                "http://lucene.apache.org/tika/",
+                "http://lucene.apache.org/", "tika/");
+
+        assertRelativeLink(
+                "http://domain.com/?pid=1",
+                "http://domain.com", "?pid=1");
+        assertRelativeLink(
+                "http://domain.com/?pid=2",
+                "http://domain.com?pid=1", "?pid=2");
+
+        assertRelativeLink(
+                "http://domain.com/file.html",
+                "http://domain.com/path/", "/file.html");
+        assertRelativeLink(
+                "http://domain.com/path/file.html",
+                "http://domain.com/path/", "./file.html");
+        assertRelativeLink(
+                "http://domain.com/path/file.html",
+                "http://domain.com/path/", "file.html");
+
+        assertRelativeLink(
+                "http://domain2.com/newpath",
+                "http://domain.com/path/to/file", "http://domain2.com/newpath");
+
+        // See http://www.communities.hp.com/securitysoftware/blogs/jeff/archive/2007/12/19/RFC-1808-vs-2396-vs-3986_3A00_-Browsers-vs.-programing-languages.aspx
+        // Also http://www.ietf.org/rfc/rfc3986.txt
+        // Also http://issues.apache.org/jira/browse/NUTCH-566
+        // Also http://issues.apache.org/jira/browse/NUTCH-436
+        assertRelativeLink(
+                "http://domain.com/path/?pid=1",
+                "http://domain.com/path/", "?pid=1");
+        assertRelativeLink(
+                "http://domain.com/file?pid=1",
+                "http://domain.com/file", "?pid=1");
+        assertRelativeLink(
+                "http://domain.com/path/d;p?pid=1",
+                "http://domain.com/path/d;p?q#f", "?pid=1");
+    }
+
+    private void assertRelativeLink(String url, String base, String relative)
+            throws Exception {
+        String test =
+            "<html><head><base href=\"" + base + "\"></head>"
+            + "<body><a href=\"" + relative + "\">test</a></body></html>";
+        final List<String> links = new ArrayList<String>();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes("UTF-8")),
+                new DefaultHandler() {
+                    @Override
+                    public void startElement(
+                            String u, String l, String name, Attributes atts) {
+                        if (atts.getValue("href") != null) {
+                            links.add(atts.getValue("href"));
+                        }
+                    }
+                },
+                new Metadata(),
+                new HashMap<String, Object>());
+        assertEquals(1, links.size());
+        assertEquals(url, links.get(0));
+    }
+
+    /**
      * Test case for TIKA-268
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-268">TIKA-268</a>
      */


Reply via email to