This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 35193c2ddcbe8f24ea09eeabd9e90f7bc52097d5 Author: Sebastian Nagel <[email protected]> AuthorDate: Tue Dec 12 23:35:19 2017 +0100 NUTCH-2478 HTML parser should resolve base URL <base href=...> - fix parse-html and parse-tika - add unit test for parse-html --- .../apache/nutch/parse/html/DOMContentUtils.java | 7 ++---- .../org/apache/nutch/parse/html/HtmlParser.java | 9 ++++++-- .../apache/nutch/parse/html/TestHtmlParser.java | 26 +++++++++++++++++++++- .../apache/nutch/parse/tika/DOMContentUtils.java | 7 ++---- .../org/apache/nutch/parse/tika/TikaParser.java | 9 ++++++-- 5 files changed, 43 insertions(+), 15 deletions(-) diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java index 4527dd7..1f1061d 100644 --- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java +++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java @@ -254,7 +254,7 @@ public class DOMContentUtils { } /** If Node contains a BASE tag then it's HREF is returned. */ - public URL getBase(Node node) { + public String getBase(Node node) { NodeWalker walker = new NodeWalker(node); @@ -276,10 +276,7 @@ public class DOMContentUtils { for (int i = 0; i < attrs.getLength(); i++) { Node attr = attrs.item(i); if ("href".equalsIgnoreCase(attr.getNodeName())) { - try { - return new URL(attr.getNodeValue()); - } catch (MalformedURLException e) { - } + return attr.getNodeValue(); } } } diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java index 7f60939..e940eb1 100644 --- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java +++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java @@ -207,11 +207,16 @@ public class HtmlParser implements Parser { if (!metaTags.getNoFollow()) { // okay to follow links ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks - URL baseTag = utils.getBase(root); + URL baseTag = null; + try { + baseTag = new URL(base, utils.getBase(root)); + } catch (MalformedURLException e) { + baseTag = base; + } if (LOG.isTraceEnabled()) { LOG.trace("Getting links..."); } - utils.getOutlinks(baseTag != null ? baseTag : base, l, root); + utils.getOutlinks(baseTag, l, root); outlinks = l.toArray(new Outlink[l.size()]); if (LOG.isTraceEnabled()) { LOG.trace("found " + outlinks.length + " outlinks in " diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java index 0b39206..8fe94e6 100644 --- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java +++ b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java @@ -19,10 +19,12 @@ package org.apache.nutch.parse.html; import java.lang.invoke.MethodHandles; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.parse.html.HtmlParser; +import org.apache.nutch.parse.Outlink; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.Parser; import org.apache.nutch.protocol.Content; @@ -78,17 +80,26 @@ public class TestHtmlParser { { "HTML5, utf-16, BOM", "utf-16", "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } }; + private static final String resolveBaseUrlTestContent = // + "<html>\\n<head>\n" + // + " <title>Test Resolve Base URLs (NUTCH-2478)</title>\n" + // + " <base href=\"//www.example.com/\">\n" + // + "</head>\n<body>\n" + // + " <a href=\"index.html\">outlink</a>\n" + // + "</body>\n</html>"; + private Configuration conf; private Parser parser; public TestHtmlParser() { conf = NutchConfiguration.create(); + conf.set("plugin.includes", "parse-html"); parser = new HtmlParser(); parser.setConf(conf); } protected Parse parse(byte[] contentBytes) { - String dummyUrl = "http://dummy.url/"; + String dummyUrl = "http://example.com/"; return parser.getParse( new Content(dummyUrl, dummyUrl, contentBytes, "text/html", new Metadata(), conf)).get(dummyUrl); @@ -120,4 +131,17 @@ public class TestHtmlParser { } } + @Test + public void testResolveBaseUrl() { + byte[] contentBytes = resolveBaseUrlTestContent + .getBytes(StandardCharsets.UTF_8); + // parse using http://example.com/ as "fetch" URL + Parse parse = parse(contentBytes); + LOG.info(parse.getData().toString()); + Outlink[] outlinks = parse.getData().getOutlinks(); + Assert.assertEquals(1, outlinks.length); + Assert.assertEquals("http://www.example.com/index.html", + outlinks[0].getToUrl()); + } + } diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java index af85480..d409589 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java @@ -259,7 +259,7 @@ public class DOMContentUtils { } /** If Node contains a BASE tag then it's HREF is returned. */ - URL getBase(Node node) { + public String getBase(Node node) { NodeWalker walker = new NodeWalker(node); @@ -281,10 +281,7 @@ public class DOMContentUtils { for (int i = 0; i < attrs.getLength(); i++) { Node attr = attrs.item(i); if ("href".equalsIgnoreCase(attr.getNodeName())) { - try { - return new URL(attr.getNodeValue()); - } catch (MalformedURLException e) { - } + return attr.getNodeValue(); } } } diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java index 73cd083..1173504 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java @@ -170,7 +170,12 @@ public class TikaParser implements org.apache.nutch.parse.Parser { if (!metaTags.getNoFollow()) { // okay to follow links ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks - URL baseTag = utils.getBase(root); + URL baseTag = null; + try { + baseTag = new URL(base, utils.getBase(root)); + } catch (MalformedURLException e) { + baseTag = base; + } if (LOG.isTraceEnabled()) { LOG.trace("Getting links..."); } @@ -179,7 +184,7 @@ public class TikaParser implements org.apache.nutch.parse.Parser { //utils.getOutlinks(baseTag != null ? baseTag : base, l, root); // Get outlinks from Tika List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks(); - utils.getOutlinks(baseTag != null ? baseTag : base, l, tikaExtractedOutlinks); + utils.getOutlinks(baseTag, l, root); outlinks = l.toArray(new Outlink[l.size()]); if (LOG.isTraceEnabled()) { LOG.trace("found " + outlinks.length + " outlinks in " -- To stop receiving notification emails like this one, please contact "[email protected]" <[email protected]>.
